Refactored slightly some constants and operand handling in X86Assembler.

Refactored asmjit::x86 register definitions (now exported as a single symbol).
Refactored bit utilities, now using proper naming like `or_`, `and_`, `andNot`.
Refactored X86RegCount and X86RegMask to support K instead of Fp register.
Refactored X86 instruction table (won't stay for long, new tool to export it is in development).
Renamed instruction group to instruction encoding.
Added XSAVE/OSXSAVE and other CPU features to X86CpuInfo.
Added proper AVX and AVX-512 detection to X86CpuInfo.
Added support to get content of XCR0 in X86CpuInfo (callXGetBV).
Added XSAVE instruction set support (Assembler/Compiler).
Added SSE4a instruction set support (Assembler/Compiler).
Added X86KReg and X86KVar register/variable support (AVX-512).
Added X86ZmmReg and X86ZmmVar register/variable support (AVX-512).
This commit is contained in:
kobalicek
2014-11-01 13:07:56 +01:00
parent 1318c9aff7
commit 515d854d10
21 changed files with 6969 additions and 5448 deletions

View File

@@ -13,7 +13,7 @@
namespace asmgen {
enum { kGenOpCodeInstCount = 2640 };
enum { kGenOpCodeInstCount = 2656 };
// Generate all instructions asmjit can emit.
static void opcode(asmjit::X86Assembler& a) {
@@ -1275,9 +1275,25 @@ static void opcode(asmjit::X86Assembler& a) {
a.pcmpistrm(xmm0, ptr_gp0, 0);
a.pcmpgtq(xmm0, xmm7);
a.pcmpgtq(xmm0, ptr_gp0);
// SSE4a.
a.nop();
a.extrq(xmm0, xmm1);
a.extrq(xmm0, 0x1, 0x2);
a.insertq(xmm0, xmm1);
a.insertq(xmm0, xmm1, 0x1, 0x2);
a.movntsd(ptr_gp0, xmm0);
a.movntss(ptr_gp0, xmm0);
// POPCNT.
a.nop();
a.popcnt(gp0, ptr_gp0);
// AESNI.
a.nop();
a.aesdec(xmm0, xmm7);
a.aesdec(xmm0, ptr_gp0);
a.aesdeclast(xmm0, xmm7);
@@ -1292,10 +1308,24 @@ static void opcode(asmjit::X86Assembler& a) {
a.aeskeygenassist(xmm0, ptr_gp0, 0);
// PCLMULQDQ.
a.nop();
a.pclmulqdq(xmm0, xmm7, 0);
a.pclmulqdq(xmm0, ptr_gp0, 0);
// XSAVE.
a.nop();
a.xgetbv();
a.xsetbv();
a.xsave(ptr_gp0);
a.xsaveopt(ptr_gp0);
a.xrstor(ptr_gp0);
// AVX.
a.nop();
a.vaddpd(xmm0, xmm1, xmm2);
a.vaddpd(xmm0, xmm1, ptr_gp0);
a.vaddpd(ymm0, ymm1, ymm2);
@@ -1946,6 +1976,8 @@ static void opcode(asmjit::X86Assembler& a) {
a.vzeroupper();
// AVX+AESNI.
a.nop();
a.vaesdec(xmm0, xmm1, xmm2);
a.vaesdec(xmm0, xmm1, ptr_gp0);
a.vaesdeclast(xmm0, xmm1, xmm2);
@@ -1960,10 +1992,14 @@ static void opcode(asmjit::X86Assembler& a) {
a.vaeskeygenassist(xmm0, ptr_gp0, 0);
// AVX+PCLMULQDQ.
a.nop();
a.vpclmulqdq(xmm0, xmm1, xmm2, 0);
a.vpclmulqdq(xmm0, xmm1, ptr_gp0, 0);
// AVX2.
a.nop();
a.vbroadcasti128(ymm0, ptr_gp0);
a.vbroadcastsd(ymm0, xmm1);
a.vbroadcastss(xmm0, xmm1);
@@ -2268,6 +2304,8 @@ static void opcode(asmjit::X86Assembler& a) {
a.vpxor(ymm0, ymm1, ymm2);
// FMA3.
a.nop();
a.vfmadd132pd(xmm0, xmm1, ptr_gp0);
a.vfmadd132pd(xmm0, xmm1, xmm2);
a.vfmadd132pd(ymm0, ymm1, ptr_gp0);
@@ -2462,6 +2500,8 @@ static void opcode(asmjit::X86Assembler& a) {
a.vfnmsub231ss(xmm0, xmm1, xmm2);
// FMA4.
a.nop();
a.vfmaddpd(xmm0, xmm1, xmm2, xmm3);
a.vfmaddpd(xmm0, xmm1, ptr_gp0, xmm3);
a.vfmaddpd(xmm0, xmm1, xmm2, ptr_gp0);
@@ -2560,6 +2600,8 @@ static void opcode(asmjit::X86Assembler& a) {
a.vfnmsubss(xmm0, xmm1, xmm2, ptr_gp0);
// XOP.
a.nop();
a.vfrczpd(xmm0, xmm1);
a.vfrczpd(xmm0, ptr_gp0);
a.vfrczpd(ymm0, ymm1);
@@ -2709,6 +2751,8 @@ static void opcode(asmjit::X86Assembler& a) {
a.vpshlw(xmm0, xmm1, ptr_gp0);
// BMI.
a.nop();
a.andn(gp0, gp1, zcx);
a.andn(gp0, gp1, ptr_gp1);
a.bextr(gp0, gp1, zcx);
@@ -2721,14 +2765,20 @@ static void opcode(asmjit::X86Assembler& a) {
a.blsr(gp0, ptr_gp1);
// LZCNT.
a.nop();
a.lzcnt(gp0, gp1);
a.lzcnt(gp0, ptr_gp1);
// TZCNT.
a.nop();
a.tzcnt(gp0, gp1);
a.tzcnt(gp0, ptr_gp1);
// BMI2.
a.nop();
a.bzhi(gp0, gp1, zcx);
a.bzhi(gp0, ptr_gp1, zcx);
a.mulx(gp0, gp1, zcx);
@@ -2747,9 +2797,13 @@ static void opcode(asmjit::X86Assembler& a) {
a.shrx(gp0, ptr_gp1, zcx);
// RDRAND.
a.nop();
a.rdrand(gp0);
// F16C.
a.nop();
a.vcvtph2ps(xmm0, xmm1);
a.vcvtph2ps(xmm0, ptr_gp1);
a.vcvtph2ps(ymm0, xmm1);
@@ -2758,6 +2812,9 @@ static void opcode(asmjit::X86Assembler& a) {
a.vcvtps2ph(ptr_gp0, xmm1, 0);
a.vcvtps2ph(xmm0, ymm1, 0);
a.vcvtps2ph(ptr_gp0, ymm1, 0);
// Mark the end of the stream.
a.nop();
}
} // asmgen namespace

View File

@@ -235,15 +235,15 @@
//! `BaseMem` class. These functions are used to make operands that represents
//! memory addresses:
//!
//! - `asmjit::ptr()`
//! - `asmjit::byte_ptr()`
//! - `asmjit::word_ptr()`
//! - `asmjit::dword_ptr()`
//! - `asmjit::qword_ptr()`
//! - `asmjit::tword_ptr()`
//! - `asmjit::oword_ptr()`
//! - `asmjit::yword_ptr()`
//! - `asmjit::zword_ptr()`
//! - `asmjit::ptr()` - Address size not specified.
//! - `asmjit::byte_ptr()` - 1 byte.
//! - `asmjit::word_ptr()` - 2 bytes (Gpw size).
//! - `asmjit::dword_ptr()` - 4 bytes (Gpd size).
//! - `asmjit::qword_ptr()` - 8 bytes (Gpq/Mm size).
//! - `asmjit::tword_ptr()` - 10 bytes (FPU).
//! - `asmjit::oword_ptr()` - 16 bytes (Xmm size).
//! - `asmjit::yword_ptr()` - 32 bytes (Ymm size).
//! - `asmjit::zword_ptr()` - 64 bytes (Zmm size).
//!
//! Most useful function to make pointer should be `asmjit::ptr()`. It creates
//! pointer to the target with unspecified size. Unspecified size works in all
@@ -298,10 +298,10 @@
//! // Get `X86CpuInfo` global instance.
//! const X86CpuInfo* cpuInfo = X86CpuInfo::getHost();
//!
//! if (cpuInfo->hasFeature(kX86CpuFeatureSse2)) {
//! if (cpuInfo->hasFeature(kX86CpuFeatureSSE2)) {
//! // Processor has SSE2.
//! }
//! else if (cpuInfo->hasFeature(kX86CpuFeatureMmx)) {
//! else if (cpuInfo->hasFeature(kX86CpuFeatureMMX)) {
//! // Processor doesn't have SSE2, but has MMX.
//! }
//! else {

View File

@@ -42,7 +42,7 @@ ASMJIT_ENUM(kInstId) {
//! Instruction options (stub).
ASMJIT_ENUM(kInstOptions) {
//! No instruction options.
kInstOptionNone = 0x00,
kInstOptionNone = 0x00000000,
//! Emit short form of the instruction.
//!
@@ -53,7 +53,8 @@ ASMJIT_ENUM(kInstOptions) {
//! can be dangerous if the short jmp/jcc is required, but not encodable due
//! to large displacement, in such case an error happens and the whole
//! assembler/compiler stream is unusable.
kInstOptionShortForm = 0x01,
kInstOptionShortForm = 0x00000001,
//! Emit long form of the instruction.
//!
//! X86/X64:
@@ -61,12 +62,13 @@ ASMJIT_ENUM(kInstOptions) {
//! Long form is mostly related to jmp and jcc instructions, but like the
//! `kInstOptionShortForm` option it can be used by other instructions
//! supporting both 8-bit and 32-bit immediates.
kInstOptionLongForm = 0x02,
kInstOptionLongForm = 0x00000002,
//! Condition is likely to be taken.
kInstOptionTaken = 0x04,
kInstOptionTaken = 0x00000004,
//! Condition is unlikely to be taken.
kInstOptionNotTaken = 0x08
kInstOptionNotTaken = 0x00000008
};
// ============================================================================

View File

@@ -993,11 +993,11 @@ struct VarAttr {
//! Get whether `flag` is on.
ASMJIT_INLINE bool hasFlag(uint32_t flag) { return (_flags & flag) != 0; }
//! Add `flags`.
ASMJIT_INLINE void addFlags(uint32_t flags) { _flags |= flags; }
ASMJIT_INLINE void orFlags(uint32_t flags) { _flags |= flags; }
//! Mask `flags`.
ASMJIT_INLINE void andFlags(uint32_t flags) { _flags &= flags; }
//! Clear `flags`.
ASMJIT_INLINE void delFlags(uint32_t flags) { _flags &= ~flags; }
ASMJIT_INLINE void andNotFlags(uint32_t flags) { _flags &= ~flags; }
//! Get how many times the variable is used by the instruction/node.
ASMJIT_INLINE uint32_t getVarCount() const { return _varCount; }
@@ -1642,7 +1642,7 @@ struct Node {
// [Accessors - Type and Flags]
// --------------------------------------------------------------------------
//! Get type of node, see `kNodeType`.
//! Get node type, see `kNodeType`.
ASMJIT_INLINE uint32_t getType() const {
return _type;
}
@@ -1652,24 +1652,29 @@ struct Node {
return _flags;
}
//! Set node flags to `flags`.
ASMJIT_INLINE void setFlags(uint32_t flags) {
_flags = static_cast<uint16_t>(flags);
}
//! Get whether the instruction has flag `flag`.
ASMJIT_INLINE bool hasFlag(uint32_t flag) const {
return (static_cast<uint32_t>(_flags) & flag) != 0;
}
//! Set node flags to `flags`.
ASMJIT_INLINE void setFlags(uint32_t flags) {
_flags = static_cast<uint16_t>(flags);
}
//! Add instruction `flags`.
ASMJIT_INLINE void addFlags(uint32_t flags) {
ASMJIT_INLINE void orFlags(uint32_t flags) {
_flags |= static_cast<uint16_t>(flags);
}
//! And instruction `flags`.
ASMJIT_INLINE void andFlags(uint32_t flags) {
_flags &= static_cast<uint16_t>(flags);
}
//! Clear instruction `flags`.
ASMJIT_INLINE void delFlags(uint32_t flags) {
_flags &= static_cast<uint16_t>(~flags);
ASMJIT_INLINE void andNotFlags(uint32_t flags) {
_flags &= ~static_cast<uint16_t>(flags);
}
//! Get whether the node has beed fetched.
@@ -1695,18 +1700,18 @@ struct Node {
return hasFlag(kNodeFlagIsInformative);
}
//! Whether the instruction is an unconditional jump.
//! Whether the node is `InstNode` and unconditional jump.
ASMJIT_INLINE bool isJmp() const { return hasFlag(kNodeFlagIsJmp); }
//! Whether the instruction is a conditional jump.
//! Whether the node is `InstNode` and conditional jump.
ASMJIT_INLINE bool isJcc() const { return hasFlag(kNodeFlagIsJcc); }
//! Whether the instruction is an unconditional or conditional jump.
//! Whether the node is `InstNode` and conditional/unconditional jump.
ASMJIT_INLINE bool isJmpOrJcc() const { return hasFlag(kNodeFlagIsJmp | kNodeFlagIsJcc); }
//! Whether the instruction is a return.
//! Whether the node is `InstNode` and return.
ASMJIT_INLINE bool isRet() const { return hasFlag(kNodeFlagIsRet); }
//! Get whether the instruction is special.
//! Get whether the node is `InstNode` and the instruction is special.
ASMJIT_INLINE bool isSpecial() const { return hasFlag(kNodeFlagIsSpecial); }
//! Get whether the instruction accesses FPU.
//! Get whether the node is `InstNode` and the instruction uses x87-FPU.
ASMJIT_INLINE bool isFp() const { return hasFlag(kNodeFlagIsFp); }
// --------------------------------------------------------------------------
@@ -1897,7 +1902,9 @@ struct EmbedNode : public Node {
// --------------------------------------------------------------------------
//! Create a new `EmbedNode` instance.
ASMJIT_INLINE EmbedNode(Compiler* compiler, void* data, uint32_t size) : Node(compiler, kNodeTypeEmbed) {
ASMJIT_INLINE EmbedNode(Compiler* compiler, void* data, uint32_t size) :
Node(compiler, kNodeTypeEmbed) {
_size = size;
if (size <= kInlineBufferSize) {
if (data != NULL)
@@ -1953,7 +1960,7 @@ struct CommentNode : public Node {
//! Create a new `CommentNode` instance.
ASMJIT_INLINE CommentNode(Compiler* compiler, const char* comment) : Node(compiler, kNodeTypeComment) {
addFlags(kNodeFlagIsInformative);
orFlags(kNodeFlagIsInformative);
_comment = comment;
}
@@ -1974,8 +1981,10 @@ struct HintNode : public Node {
// --------------------------------------------------------------------------
//! Create a new `HintNode` instance.
ASMJIT_INLINE HintNode(Compiler* compiler, VarData* vd, uint32_t hint, uint32_t value) : Node(compiler, kNodeTypeHint) {
addFlags(kNodeFlagIsInformative);
ASMJIT_INLINE HintNode(Compiler* compiler, VarData* vd, uint32_t hint, uint32_t value) :
Node(compiler, kNodeTypeHint) {
orFlags(kNodeFlagIsInformative);
_vd = vd;
_hint = hint;
_value = value;
@@ -2101,9 +2110,12 @@ struct InstNode : public Node {
// --------------------------------------------------------------------------
//! Create a new `InstNode` instance.
ASMJIT_INLINE InstNode(Compiler* compiler, uint32_t code, uint32_t options, Operand* opList, uint32_t opCount) : Node(compiler, kNodeTypeInst) {
_code = static_cast<uint16_t>(code);
_options = static_cast<uint8_t>(options);
ASMJIT_INLINE InstNode(Compiler* compiler, uint32_t instId, uint32_t instOptions, Operand* opList, uint32_t opCount) :
Node(compiler, kNodeTypeInst) {
_instId = static_cast<uint16_t>(instId);
_reserved = 0;
_instOptions = instOptions;
_opCount = static_cast<uint8_t>(opCount);
_opList = opList;
@@ -2118,18 +2130,17 @@ struct InstNode : public Node {
// [Accessors]
// --------------------------------------------------------------------------
//! Get instruction code, see `kX86InstId`.
ASMJIT_INLINE uint32_t getCode() const {
return _code;
//! Get instruction ID, see `kX86InstId`.
ASMJIT_INLINE uint32_t getInstId() const {
return _instId;
}
//! Set instruction code to `code`.
//! Set instruction ID to `instId`.
//!
//! Please do not modify instruction code if you are not know what you are
//! doing. Incorrect instruction code or operands can raise assertion() at
//! runtime.
ASMJIT_INLINE void setCode(uint32_t code) {
_code = static_cast<uint16_t>(code);
//! Please do not modify instruction code if you don't know what you are
//! doing. Incorrect instruction code or operands can cause assertion failure.
ASMJIT_INLINE void setInstId(uint32_t instId) {
_instId = static_cast<uint16_t>(instId);
}
//! Whether the instruction is an unconditional jump or whether the
@@ -2140,23 +2151,23 @@ struct InstNode : public Node {
//! Get emit options.
ASMJIT_INLINE uint32_t getOptions() const {
return _options;
return _instOptions;
}
//! Set emit options.
ASMJIT_INLINE void setOptions(uint32_t options) {
_options = static_cast<uint8_t>(options);
_instOptions = options;
}
//! Add emit options.
ASMJIT_INLINE void addOptions(uint32_t options) {
_options |= static_cast<uint8_t>(options);
_instOptions |= options;
}
//! Mask emit options.
ASMJIT_INLINE void andOptions(uint32_t options) {
_options &= static_cast<uint8_t>(options);
_instOptions &= options;
}
//! Clear emit options.
ASMJIT_INLINE void delOptions(uint32_t options) {
_options &= static_cast<uint8_t>(~options);
_instOptions &= ~options;
}
//! Get operands list.
@@ -2225,12 +2236,14 @@ _Update:
// [Members]
// --------------------------------------------------------------------------
//! Instruction code, see `kInstId`.
uint16_t _code;
//! Instruction options, see `kInstOptions`.
uint8_t _options;
//! Instruction ID, see `kInstId`.
uint16_t _instId;
//! \internal
uint8_t _memOpIndex;
//! \internal
uint8_t _reserved;
//! Instruction options, see `kInstOptions`.
uint32_t _instOptions;
//! Operands list.
Operand* _opList;
@@ -2474,7 +2487,7 @@ struct FuncNode : public Node {
//! The "Red Zone" size - count of bytes which might be accessed without
//! adjusting the stack pointer.
uint16_t _redZoneSize;
//! Spill zone size (zone used by WIN64ABI).
//! Spill zone size (used by WIN64 ABI).
uint16_t _spillZoneSize;
//! Stack size needed for function arguments.

View File

@@ -396,7 +396,7 @@ UNIT(base_constpool) {
EXPECT(prevOffset + 8 == curOffset,
"pool.add() - Returned incorrect curOffset.");
EXPECT(pool.getSize() == (i + 1) * 8,
"pool.getSize() - Reports incorrect size.");
"pool.getSize() - Reported incorrect size.");
prevOffset = curOffset;
}

View File

@@ -566,6 +566,26 @@ union UInt64 {
return *this;
}
// --------------------------------------------------------------------------
// [AndNot]
// --------------------------------------------------------------------------
ASMJIT_INLINE UInt64& andNot(uint64_t val) {
u64 &= ~val;
return *this;
}
ASMJIT_INLINE UInt64& andNot(const UInt64& val) {
if (kArchHost64Bit) {
u64 &= ~val.u64;
}
else {
u32[0] &= ~val.u32[0];
u32[1] &= ~val.u32[1];
}
return *this;
}
// --------------------------------------------------------------------------
// [Or]
// --------------------------------------------------------------------------
@@ -606,26 +626,6 @@ union UInt64 {
return *this;
}
// --------------------------------------------------------------------------
// [Del]
// --------------------------------------------------------------------------
ASMJIT_INLINE UInt64& del(uint64_t val) {
u64 &= ~val;
return *this;
}
ASMJIT_INLINE UInt64& del(const UInt64& val) {
if (kArchHost64Bit) {
u64 &= ~val.u64;
}
else {
u32[0] &= ~val.u32[0];
u32[1] &= ~val.u32[1];
}
return *this;
}
// --------------------------------------------------------------------------
// [Eq]
// --------------------------------------------------------------------------

View File

@@ -44,49 +44,64 @@ static void dumpCpu(void) {
const X86CpuInfo* x86Cpu = static_cast<const X86CpuInfo*>(cpu);
static const DumpCpuFeature x86FeaturesList[] = {
{ kX86CpuFeatureMultithreading , "Multithreading" },
{ kX86CpuFeatureExecuteDisableBit , "Execute-Disable Bit" },
{ kX86CpuFeatureRdtsc , "Rdtsc" },
{ kX86CpuFeatureRdtscp , "Rdtscp" },
{ kX86CpuFeatureCmov , "Cmov" },
{ kX86CpuFeatureCmpXchg8B , "Cmpxchg8b" },
{ kX86CpuFeatureCmpXchg16B , "Cmpxchg16b" },
{ kX86CpuFeatureClflush , "Clflush" },
{ kX86CpuFeaturePrefetch , "Prefetch" },
{ kX86CpuFeatureLahfSahf , "Lahf/Sahf" },
{ kX86CpuFeatureFxsr , "Fxsave/Fxrstor" },
{ kX86CpuFeatureFfxsr , "Fxsave/Fxrstor Opt." },
{ kX86CpuFeatureMmx , "Mmx" },
{ kX86CpuFeatureMmxExt , "MmxExt" },
{ kX86CpuFeature3dNow , "3dnow" },
{ kX86CpuFeature3dNowExt , "3dnowExt" },
{ kX86CpuFeatureSse , "Sse" },
{ kX86CpuFeatureSse2 , "Sse2" },
{ kX86CpuFeatureSse3 , "Sse3" },
{ kX86CpuFeatureSsse3 , "Ssse3" },
{ kX86CpuFeatureSse4A , "Sse4a" },
{ kX86CpuFeatureSse41 , "Sse4.1" },
{ kX86CpuFeatureSse42 , "Sse4.2" },
{ kX86CpuFeatureMsse , "Misaligned SSE" },
{ kX86CpuFeatureMonitorMWait , "Monitor/MWait" },
{ kX86CpuFeatureMovbe , "Movbe" },
{ kX86CpuFeaturePopcnt , "Popcnt" },
{ kX86CpuFeatureLzcnt , "Lzcnt" },
{ kX86CpuFeatureAesni , "AesNI" },
{ kX86CpuFeaturePclmulqdq , "Pclmulqdq" },
{ kX86CpuFeatureRdrand , "Rdrand" },
{ kX86CpuFeatureAvx , "Avx" },
{ kX86CpuFeatureAvx2 , "Avx2" },
{ kX86CpuFeatureNX , "NX (Non-Execute Bit)" },
{ kX86CpuFeatureMT , "MT (Multi-Threading)" },
{ kX86CpuFeatureRDTSC , "RDTSC" },
{ kX86CpuFeatureRDTSCP , "RDTSCP" },
{ kX86CpuFeatureCMOV , "CMOV" },
{ kX86CpuFeatureCMPXCHG8B , "CMPXCHG8B" },
{ kX86CpuFeatureCMPXCHG16B , "CMPXCHG16B" },
{ kX86CpuFeatureCLFLUSH , "CLFLUSH" },
{ kX86CpuFeatureCLFLUSHOpt , "CLFLUSH (Opt)" },
{ kX86CpuFeaturePREFETCH , "PREFETCH" },
{ kX86CpuFeaturePREFETCHWT1 , "PREFETCHWT1" },
{ kX86CpuFeatureLahfSahf , "LAHF/SAHF" },
{ kX86CpuFeatureFXSR , "FXSR" },
{ kX86CpuFeatureFXSROpt , "FXSR (Opt)" },
{ kX86CpuFeatureMMX , "MMX" },
{ kX86CpuFeatureMMX2 , "MMX2" },
{ kX86CpuFeature3DNOW , "3DNOW" },
{ kX86CpuFeature3DNOW2 , "3DNOW2" },
{ kX86CpuFeatureSSE , "SSE" },
{ kX86CpuFeatureSSE2 , "SSE2" },
{ kX86CpuFeatureSSE3 , "SSE3" },
{ kX86CpuFeatureSSSE3 , "SSSE3" },
{ kX86CpuFeatureSSE4A , "SSE4A" },
{ kX86CpuFeatureSSE4_1 , "SSE4.1" },
{ kX86CpuFeatureSSE4_2 , "SSE4.2" },
{ kX86CpuFeatureMSSE , "Misaligned SSE" },
{ kX86CpuFeatureMONITOR , "MONITOR/MWAIT" },
{ kX86CpuFeatureMOVBE , "MOVBE" },
{ kX86CpuFeaturePOPCNT , "POPCNT" },
{ kX86CpuFeatureLZCNT , "LZCNT" },
{ kX86CpuFeatureAESNI , "AESNI" },
{ kX86CpuFeaturePCLMULQDQ , "PCLMULQDQ" },
{ kX86CpuFeatureRDRAND , "RDRAND" },
{ kX86CpuFeatureRDSEED , "RDSEED" },
{ kX86CpuFeatureSHA , "SHA" },
{ kX86CpuFeatureXSave , "XSAVE" },
{ kX86CpuFeatureXSaveOS , "XSAVE (OS)" },
{ kX86CpuFeatureAVX , "AVX" },
{ kX86CpuFeatureAVX2 , "AVX2" },
{ kX86CpuFeatureF16C , "F16C" },
{ kX86CpuFeatureFma3 , "Fma3" },
{ kX86CpuFeatureFma4 , "Fma4" },
{ kX86CpuFeatureXop , "Xop" },
{ kX86CpuFeatureBmi , "Bmi" },
{ kX86CpuFeatureBmi2 , "Bmi2" },
{ kX86CpuFeatureHle , "Hle" },
{ kX86CpuFeatureRtm , "Rtm" },
{ kX86CpuFeatureFsGsBase , "FsGsBase" },
{ kX86CpuFeatureRepMovsbStosbExt , "RepMovsbStosbExt" }
{ kX86CpuFeatureFMA3 , "FMA3" },
{ kX86CpuFeatureFMA4 , "FMA4" },
{ kX86CpuFeatureXOP , "XOP" },
{ kX86CpuFeatureBMI , "BMI" },
{ kX86CpuFeatureBMI2 , "BMI2" },
{ kX86CpuFeatureHLE , "HLE" },
{ kX86CpuFeatureRTM , "RTM" },
{ kX86CpuFeatureADX , "ADX" },
{ kX86CpuFeatureMPX , "MPX" },
{ kX86CpuFeatureFSGSBase , "FS/GS Base" },
{ kX86CpuFeatureMOVSBSTOSBOpt , "REP MOVSB/STOSB (Opt)" },
{ kX86CpuFeatureAVX512F , "AVX512F" },
{ kX86CpuFeatureAVX512CD , "AVX512CD" },
{ kX86CpuFeatureAVX512PF , "AVX512PF" },
{ kX86CpuFeatureAVX512ER , "AVX512ER" },
{ kX86CpuFeatureAVX512DQ , "AVX512DQ" },
{ kX86CpuFeatureAVX512BW , "AVX512BW" },
{ kX86CpuFeatureAVX512VL , "AVX512VL" }
};
INFO("Host CPU Info (X86/X64):");
@@ -178,16 +193,19 @@ static void dumpSizeOf(void) {
#if defined(ASMJIT_BUILD_X86) || defined(ASMJIT_BUILD_X64)
INFO("SizeOf X86/X64:");
DUMP_TYPE(asmjit::X86Assembler);
DUMP_TYPE(asmjit::X86InstInfo);
DUMP_TYPE(asmjit::X86InstExtendedInfo);
#if !defined(ASMJIT_DISABLE_COMPILER)
DUMP_TYPE(asmjit::X86Compiler);
DUMP_TYPE(asmjit::X86CallNode);
DUMP_TYPE(asmjit::X86FuncNode);
DUMP_TYPE(asmjit::X86FuncDecl);
DUMP_TYPE(asmjit::X86InstInfo);
DUMP_TYPE(asmjit::X86VarMap);
DUMP_TYPE(asmjit::X86VarInfo);
DUMP_TYPE(asmjit::X86VarState);
#endif // !ASMJIT_DISABLE_COMPILER
INFO("");
#endif // ASMJIT_BUILD_X86
}

File diff suppressed because it is too large Load Diff

View File

@@ -57,16 +57,64 @@ namespace asmjit {
return *this; \
} \
\
/*! Force REX prefix. */ \
/*! Force REX prefix (X64). */ \
ASMJIT_INLINE _Class_& rex() { \
_instOptions |= kX86InstOptionRex; \
return *this; \
} \
\
/*! Force 3-byte VEX prefix. */ \
/*! Force 3-byte VEX prefix (AVX+). */ \
ASMJIT_INLINE _Class_& vex3() { \
_instOptions |= kX86InstOptionVex3; \
return *this; \
} \
\
/*! Force 4-byte EVEX prefix (AVX512+). */ \
ASMJIT_INLINE _Class_& evex() { \
_instOptions |= kX86InstOptionEvex; \
return *this; \
} \
\
/*! Use zeroing instead of merging (AVX512+). */ \
ASMJIT_INLINE _Class_& z() { \
_instOptions |= kX86InstOptionEvexZero; \
return *this; \
} \
\
/*! Broadcast one element to all other elements (AVX512+). */ \
ASMJIT_INLINE _Class_& _1ToN() { \
_instOptions |= kX86InstOptionEvexOneN; \
return *this; \
} \
\
/*! Suppress all exceptions (AVX512+). */ \
ASMJIT_INLINE _Class_& sae() { \
_instOptions |= kX86InstOptionEvexSae; \
return *this; \
} \
\
/*! Static rounding mode `round-to-nearest` (even) and `SAE` (AVX512+). */ \
ASMJIT_INLINE _Class_& rn_sae() { \
_instOptions |= kX86InstOptionEvexRnSae; \
return *this; \
} \
\
/*! Static rounding mode `round-down` (toward -inf) and `SAE` (AVX512+). */ \
ASMJIT_INLINE _Class_& rd_sae() { \
_instOptions |= kX86InstOptionEvexRdSae; \
return *this; \
} \
\
/*! Static rounding mode `round-up` (toward +inf) and `SAE` (AVX512+). */ \
ASMJIT_INLINE _Class_& ru_sae() { \
_instOptions |= kX86InstOptionEvexRuSae; \
return *this; \
} \
\
/*! Static rounding mode `round-toward-zero` (truncate) and `SAE` (AVX512+). */ \
ASMJIT_INLINE _Class_& rz_sae() { \
_instOptions |= kX86InstOptionEvexRzSae; \
return *this; \
}
//! X86/X64 assembler.
@@ -94,10 +142,10 @@ namespace asmjit {
//! ~~~
//! // Use asmjit namespace.
//! using namespace asmjit;
//! using namespace asmjit::host;
//! using namespace asmjit::x86;
//!
//! // Create Assembler instance.
//! Assembler a;
//! // Create X86Assembler instance.
//! X86Assembler a;
//!
//! // Prolog.
//! a.push(ebp);
@@ -115,10 +163,10 @@ namespace asmjit {
//! ~~~
//!
//! You can see that syntax is very close to Intel one. Only difference is that
//! you are calling functions that emits the binary code for you. All registers
//! are in `asmjit` namespace, so it's very comfortable to use it (look at
//! first line). There is also used method `imm()` to create an immediate value.
//! Use `imm_u()` to create unsigned immediate value.
//! you are calling functions that emit binary code for you. All registers are
//! in `asmjit::x86` namespace, so it's very comfortable to use it (look at the
//! `use namespace` section). Without importing `asmjit::x86` registers would
//! have to be written as `x86::eax`, `x86::esp`, and so on.
//!
//! There is also possibility to use memory addresses and immediates. Use
//! `ptr()`, `byte_ptr()`, `word_ptr()`, `dword_ptr()` and similar functions to
@@ -126,14 +174,14 @@ namespace asmjit {
//! information related to the operand size is needed only in rare cases, that
//! is an instruction without having any register operands, such as `inc [mem]`.
//!
//! for example, `a` is `x86::Assembler` instance:
//! for example, `a` is an `X86Assembler` instance:
//!
//! ~~~
//! a.mov(ptr(eax), 0); // mov ptr [eax], 0
//! a.mov(ptr(eax), edx); // mov ptr [eax], edx
//! ~~~
//!
//! But it's also possible to create complex addresses:
//! But it's also possible to create complex addresses offered by the x86 architecture:
//!
//! ~~~
//! // eax + ecx*x addresses
@@ -148,10 +196,12 @@ namespace asmjit {
//! a.mov(ptr(eax, ecx, 3, 16), 0); // mov ptr [eax + ecx * 8 + 16], 0
//! ~~~
//!
//! All addresses shown are using `ptr()` to make memory operand. Some assembler
//! instructions (single operand ones) needs to have specified memory operand
//! size. For example `a.inc(ptr(eax))` can't be called, because the meaning is
//! ambiguous, see the code below.
//! All addresses shown are using `x86::ptr()` to make memory operand. Some
//! assembler instructions using a single operand need to know the size of
//! the operand to avoid ambiguity. For example `a.inc(ptr(eax))` is ambiguous
//! and would cause a runtime error. This problem can be fixed by using memory
//! operand with size specified - `byte_ptr`, `word_ptr`, `dword_ptr`, see the
//! code below:
//!
//! ~~~
//! // [byte] address.
@@ -163,30 +213,34 @@ namespace asmjit {
//! // [dword] address.
//! a.inc(dword_ptr(eax)); // Inc dword ptr [eax].
//! a.dec(dword_ptr(eax)); // Dec dword ptr [eax].
//! // [qword] address.
//! a.inc(qword_ptr(rax)); // Inc qword ptr [rax].
//! a.dec(qword_ptr(rax)); // Dec qword ptr [rax].
//! ~~~
//!
//! Calling JIT Code
//! ----------------
//!
//! While you are over from emitting instructions, you can make your function
//! by using `Assembler::make()` method. This method will use memory
//! manager to allocate virtual memory and relocates generated code to it. For
//! memory allocation is used global memory manager by default and memory is
//! freeable, but of course this default behavior can be overridden specifying
//! your memory manager and allocation type. If you want to do with code
//! something else you can always override make() method and do what you want.
//! After you are finished with emitting instructions, you can make your function
//! callable by using `Assembler::make()` method. This method will use memory
//! manager to allocate virtual memory and relocates generated code to it. The
//! memory is allocated through `Runtime` instance provided to `X86Assembler`
//! constructor.
//!
//! You can get size of generated code by `getCodeSize()` or `getOffset()`
//! methods. These methods returns you code size or more precisely the current
//! code offset in bytes. The `takeCode()` function can be used to take the
//! internal buffer and reset the code generator, but the buffer taken has to
//! be freed manually in such case.
//! The size of the code generated can be retrieved by `getCodeSize()` and
//! `getOffset()` methods. The `getOffset()` method returns the current offset
//! (that is mostly equal to the final code size, if called after the code
//! generation) and `getCodeSize()` returns the final code size with possible
//! trampolines. The `takeCode()` method can be used to take the internal buffer
//! and reset the code generator, but the buffer returned has to be freed manually
//! in such case.
//!
//! Machine code can be executed only in memory that is marked executable. This
//! mark is usually not set for memory returned by a C/C++ `malloc` function.
//! The `VMem::alloc()` function can be used allocate a memory where the code can
//! be executed or more preferably `VMemMgr` which has interface
//! similar to `malloc/free` and can allocate chunks of various sizes.
//! mark is usually not set for memory returned by a C/C++ `malloc()` function.
//! The `VMemUtil::alloc()` function can be used to allocate memory where the code
//! can be executed. Please note that `VMemUtil` is a low-level class that works
//! at memory page level. High level interface that is similar to malloc/free is
//! provided by `VMemMgr` class.
//!
//! The next example shows how to allocate memory where the code can be executed:
//!
@@ -194,27 +248,28 @@ namespace asmjit {
//! using namespace asmjit;
//!
//! JitRuntime runtime;
//! Assembler a(&runtime);
//! X86Assembler a(&runtime);
//!
//! // ... Your code generation ...
//! ... Code generation ...
//!
//! // The function prototype
//! // The function prototype.
//! typedef void (*MyFunc)();
//!
//! // make your function
//! // Make the function.
//! MyFunc func = asmjit_cast<MyFunc>(a.make());
//!
//! // call your function
//! // Call the function.
//! func();
//!
//! // If you don't need your function again, free it.
//! // Release the function if not needed anymore.
//! runtime.release(func);
//! ~~~
//!
//! This was a very primitive showing how the generated code can be executed.
//! In production noone will probably generate a function that is only called
//! once and nobody will probably free the function right after it was executed.
//! The code just shows the proper way of code generation and cleanup.
//! This was a very primitive example showing how the generated code can be
//! executed by using the foundation of classes AsmJit offers. In production
//! nobody is likely to generate a function that is only called once and freed
//! immediately after it's been called, however, the concept of releasing code
//! that is not needed anymore should be clear.
//!
//! Labels
//! ------
@@ -278,7 +333,7 @@ namespace asmjit {
//! a.mov(esp, ebp);
//! a.pop(ebp);
//!
//! // Return: STDCALL convention is to pop stack in called function.
//! // Return: Pop the stack by `arg_size` as defined by `STDCALL` convention.
//! a.ret(arg_size);
//! ~~~
//!
@@ -315,13 +370,13 @@ namespace asmjit {
//! Next, more advanced, but often needed technique is that you can build your
//! own registers allocator. X86 architecture contains 8 general purpose
//! registers, 8 Mm registers and 8 Xmm/Ymm/Zmm registers. X64 architecture
//! extends the count of Gp registers and Xmm/Ymm/Zmm registers to 16 or 32
//! when AVX512 is available.
//! extends the count of Gp registers and Xmm/Ymm/Zmm registers to 16. AVX-512
//! architecture extends Xmm/Ymm/Zmm SIMD registers to 32.
//!
//! To create a general purpose register operand from register index use
//! `gpb_lo()`, `gpb_hi()`, `gpw()`, `gpd()`, `gpq()`. To create registers of
//! other types there are functions `fp()`, `mm()`, `xmm()`, `ymm()` and `zmm()`
//! available.
//! other types there are `fp()`, `mm()`, `k()`, `xmm()`, `ymm()` and `zmm()`
//! functions available that return a new register operand.
//!
//! \sa X86Compiler.
struct ASMJIT_VCLASS X86Assembler : public Assembler {
@@ -340,7 +395,7 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
// [Arch]
// --------------------------------------------------------------------------
//! Get count of registers of the current architecture.
//! Get count of registers of the current architecture and mode.
ASMJIT_INLINE const X86RegCount& getRegCount() const {
return _regCount;
}
@@ -478,7 +533,7 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
X86GpReg zdi;
// --------------------------------------------------------------------------
// [Base Instructions]
// [Emit]
// --------------------------------------------------------------------------
#define INST_0x(_Inst_, _Code_) \
@@ -654,6 +709,30 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
return emit(_Code_, o0, o1, o2); \
}
#define INST_3ii(_Inst_, _Code_, _Op0_, _Op1_, _Op2_) \
ASMJIT_INLINE Error _Inst_(const _Op0_& o0, const _Op1_& o1, const _Op2_& o2) { \
return emit(_Code_, o0, o1, o2); \
} \
/*! \overload */ \
ASMJIT_INLINE Error _Inst_(const _Op0_& o0, int o1, int o2) { \
Imm o1Imm(o1); \
return emit(_Code_, o0, o1Imm, o2); \
} \
ASMJIT_INLINE Error _Inst_(const _Op0_& o0, unsigned int o1, unsigned int o2) { \
Imm o1Imm(o1); \
return emit(_Code_, o0, o1Imm, static_cast<uint64_t>(o2)); \
} \
/*! \overload */ \
ASMJIT_INLINE Error _Inst_(const _Op0_& o0, int64_t o1, int64_t o2) { \
Imm o1Imm(o1); \
return emit(_Code_, o0, o1Imm, static_cast<uint64_t>(o2)); \
} \
/*! \overload */ \
ASMJIT_INLINE Error _Inst_(const _Op0_& o0, uint64_t o1, uint64_t o2) { \
Imm o1Imm(o1); \
return emit(_Code_, o0, o1Imm, o2); \
}
#define INST_4x(_Inst_, _Code_, _Op0_, _Op1_, _Op2_, _Op3_) \
ASMJIT_INLINE Error _Inst_(const _Op0_& o0, const _Op1_& o1, const _Op2_& o2, const _Op3_& o3) { \
return emit(_Code_, o0, o1, o2, o3); \
@@ -686,6 +765,35 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
return emit(_Code_, o0, o1, o2, o3); \
}
//! \internal
//!
//! Emitter for a 4-operand instruction whose last two operands are
//! immediates, with convenience overloads accepting them as plain integers
//! of any common width (the third operand is converted to `Imm`, the last
//! one is forwarded to `emit()` as `int` or widened to `uint64_t`).
#define INST_4ii(_Inst_, _Code_, _Op0_, _Op1_, _Op2_, _Op3_) \
  ASMJIT_INLINE Error _Inst_(const _Op0_& o0, const _Op1_& o1, const _Op2_& o2, const _Op3_& o3) { \
    return emit(_Code_, o0, o1, o2, o3); \
  } \
  /*! \overload */ \
  ASMJIT_INLINE Error _Inst_(const _Op0_& o0, const _Op1_& o1, int o2, int o3) { \
    return emit(_Code_, o0, o1, Imm(o2), o3); \
  } \
  /*! \overload */ \
  ASMJIT_INLINE Error _Inst_(const _Op0_& o0, const _Op1_& o1, unsigned int o2, unsigned int o3) { \
    return emit(_Code_, o0, o1, Imm(o2), static_cast<uint64_t>(o3)); \
  } \
  /*! \overload */ \
  ASMJIT_INLINE Error _Inst_(const _Op0_& o0, const _Op1_& o1, int64_t o2, int64_t o3) { \
    return emit(_Code_, o0, o1, Imm(o2), static_cast<uint64_t>(o3)); \
  } \
  /*! \overload */ \
  ASMJIT_INLINE Error _Inst_(const _Op0_& o0, const _Op1_& o1, uint64_t o2, uint64_t o3) { \
    return emit(_Code_, o0, o1, Imm(o2), o3); \
  }
// --------------------------------------------------------------------------
// [X86/X64]
// --------------------------------------------------------------------------
//! Add with Carry.
INST_2x(adc, kX86InstIdAdc, X86GpReg, X86GpReg)
//! \overload
@@ -837,11 +945,6 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
//! CPU identification (i486).
INST_0x(cpuid, kX86InstIdCpuid)
//! Accumulate crc32 value (polynomial 0x11EDC6F41) (SSE4.2).
INST_2x_(crc32, kX86InstIdCrc32, X86GpReg, X86GpReg, o0.isRegType(kX86RegTypeGpd) || o0.isRegType(kX86RegTypeGpq))
//! \overload
INST_2x_(crc32, kX86InstIdCrc32, X86GpReg, X86Mem, o0.isRegType(kX86RegTypeGpd) || o0.isRegType(kX86RegTypeGpq))
//! Decimal adjust AL after addition (X86 Only).
INST_0x(daa, kX86InstIdDaa)
//! Decimal adjust AL after subtraction (X86 Only).
@@ -1037,11 +1140,6 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
//! Pop stack into EFLAGS register (32-bit or 64-bit).
INST_0x(popf, kX86InstIdPopf)
//! Return the count of number of bits set to 1 (SSE4.2).
INST_2x_(popcnt, kX86InstIdPopcnt, X86GpReg, X86GpReg, !o0.isGpb() && o0.getRegType() == o1.getRegType())
//! \overload
INST_2x_(popcnt, kX86InstIdPopcnt, X86GpReg, X86Mem, !o0.isGpb())
//! Push WORD or DWORD/QWORD on the stack.
INST_1x_(push, kX86InstIdPush, X86GpReg, o0.getSize() == 2 || o0.getSize() == _regSize)
//! Push WORD or DWORD/QWORD on the stack.
@@ -1329,273 +1427,277 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
INST_2i(xor_, kX86InstIdXor, X86Mem, Imm)
// --------------------------------------------------------------------------
// [Fpu]
// [FPU]
// --------------------------------------------------------------------------
//! Compute 2^x - 1 (FPU).
//! Compute `2^x - 1` - `fp0 = POW(2, fp0) - 1` (FPU).
INST_0x(f2xm1, kX86InstIdF2xm1)
//! Absolute value of fp0 (FPU).
//! Abs `fp0 = ABS(fp0)` (FPU).
INST_0x(fabs, kX86InstIdFabs)
//! Add `o1` to `o0` (one has to be `fp0`) and store result in `o0` (FPU).
//! Add `o0 = o0 + o1` (one operand has to be `fp0`) (FPU).
INST_2x_(fadd, kX86InstIdFadd, X86FpReg, X86FpReg, o0.getRegIndex() == 0 || o1.getRegIndex() == 0)
//! Add 4-byte or 8-byte FP `o0` to fp0 and store result in fp0 (FPU).
//! Add `fp0 = fp0 + float_or_double[o0]` (FPU).
INST_1x(fadd, kX86InstIdFadd, X86Mem)
//! Add fp0 to `o0` and pop the FPU stack (FPU).
//! Add `o0 = o0 + fp0` and POP (FPU).
INST_1x(faddp, kX86InstIdFaddp, X86FpReg)
//! \overload
//! Add `fp1 = fp1 + fp0` and POP (FPU).
INST_0x(faddp, kX86InstIdFaddp)
//! Load binary coded decimal (FPU).
//! Load BCD from `[o0]` and PUSH (FPU).
INST_1x(fbld, kX86InstIdFbld, X86Mem)
//! Store BCD integer and Pop (FPU).
//! Store BCD-Integer to `[o0]` and POP (FPU).
INST_1x(fbstp, kX86InstIdFbstp, X86Mem)
//! Change fp0 sign (FPU).
//! Complement Sign `fp0 = -fp0` (FPU).
INST_0x(fchs, kX86InstIdFchs)
//! Clear exceptions (FPU).
INST_0x(fclex, kX86InstIdFclex)
//! Conditional move (FPU).
//! Conditional move `if (CF=1) fp0 = o0` (FPU).
INST_1x(fcmovb, kX86InstIdFcmovb, X86FpReg)
//! Conditional move (FPU).
//! Conditional move `if (CF|ZF=1) fp0 = o0` (FPU).
INST_1x(fcmovbe, kX86InstIdFcmovbe, X86FpReg)
//! Conditional move (FPU).
//! Conditional move `if (ZF=1) fp0 = o0` (FPU).
INST_1x(fcmove, kX86InstIdFcmove, X86FpReg)
//! Conditional move (FPU).
//! Conditional move `if (CF=0) fp0 = o0` (FPU).
INST_1x(fcmovnb, kX86InstIdFcmovnb, X86FpReg)
//! Conditional move (FPU).
//! Conditional move `if (CF|ZF=0) fp0 = o0` (FPU).
INST_1x(fcmovnbe, kX86InstIdFcmovnbe, X86FpReg)
//! Conditional move (FPU).
//! Conditional move `if (ZF=0) fp0 = o0` (FPU).
INST_1x(fcmovne, kX86InstIdFcmovne, X86FpReg)
//! Conditional move (FPU).
//! Conditional move `if (PF=0) fp0 = o0` (FPU).
INST_1x(fcmovnu, kX86InstIdFcmovnu, X86FpReg)
//! Conditional move (FPU).
//! Conditional move `if (PF=1) fp0 = o0` (FPU).
INST_1x(fcmovu, kX86InstIdFcmovu, X86FpReg)
//! Compare fp0 with `o0` (FPU).
//! Compare `fp0` with `o0` (FPU).
INST_1x(fcom, kX86InstIdFcom, X86FpReg)
//! Compare fp0 with fp1 (FPU).
//! Compare `fp0` with `fp1` (FPU).
INST_0x(fcom, kX86InstIdFcom)
//! Compare fp0 with 4-byte or 8-byte FP at `src` (FPU).
//! Compare `fp0` with `float_or_double[o0]` (FPU).
INST_1x(fcom, kX86InstIdFcom, X86Mem)
//! Compare fp0 with `o0` and pop the FPU stack (FPU).
//! Compare `fp0` with `o0` and POP (FPU).
INST_1x(fcomp, kX86InstIdFcomp, X86FpReg)
//! Compare fp0 with fp1 and pop the FPU stack (FPU).
//! Compare `fp0` with `fp1` and POP (FPU).
INST_0x(fcomp, kX86InstIdFcomp)
//! Compare fp0 with 4-byte or 8-byte FP at `adr` and pop the FPU stack (FPU).
//! Compare `fp0` with `float_or_double[o0]` and POP (FPU).
INST_1x(fcomp, kX86InstIdFcomp, X86Mem)
//! Compare fp0 with fp1 and pop the FPU stack twice (FPU).
//! Compare `fp0` with `fp1` and POP twice (FPU).
INST_0x(fcompp, kX86InstIdFcompp)
//! Compare fp0 and `o0` and Set EFLAGS (FPU).
//! Compare `fp0` with `o0` and set EFLAGS (FPU).
INST_1x(fcomi, kX86InstIdFcomi, X86FpReg)
//! Compare fp0 and `o0` and Set EFLAGS and pop the FPU stack (FPU).
//! Compare `fp0` with `o0` and set EFLAGS and POP (FPU).
INST_1x(fcomip, kX86InstIdFcomip, X86FpReg)
//! Calculate cosine of fp0 and store result in fp0 (FPU).
//! Cos `fp0 = cos(fp0)` (FPU).
INST_0x(fcos, kX86InstIdFcos)
//! Decrement FPU stack-top pointer (FPU).
//! Decrement FPU stack pointer (FPU).
INST_0x(fdecstp, kX86InstIdFdecstp)
//! Divide `o0` by `o1` (one has to be `fp0`) (FPU).
//! Divide `o0 = o0 / o1` (one has to be `fp0`) (FPU).
INST_2x_(fdiv, kX86InstIdFdiv, X86FpReg, X86FpReg, o0.getRegIndex() == 0 || o1.getRegIndex() == 0)
//! Divide fp0 by 32-bit or 64-bit FP value (FPU).
//! Divide `fp0 = fp0 / float_or_double[o0]` (FPU).
INST_1x(fdiv, kX86InstIdFdiv, X86Mem)
//! Divide `o0` by fp0 (FPU).
//! Divide `o0 = o0 / fp0` and POP (FPU).
INST_1x(fdivp, kX86InstIdFdivp, X86FpReg)
//! \overload
//! Divide `fp1 = fp1 / fp0` and POP (FPU).
INST_0x(fdivp, kX86InstIdFdivp)
//! Reverse divide `o0` by `o1` (one has to be `fp0`) (FPU).
//! Reverse divide `o0 = o1 / o0` (one has to be `fp0`) (FPU).
INST_2x_(fdivr, kX86InstIdFdivr, X86FpReg, X86FpReg, o0.getRegIndex() == 0 || o1.getRegIndex() == 0)
//! Reverse divide fp0 by 32-bit or 64-bit FP value (FPU).
//! Reverse divide `fp0 = float_or_double[o0] / fp0` (FPU).
INST_1x(fdivr, kX86InstIdFdivr, X86Mem)
//! Reverse divide `o0` by fp0 (FPU).
//! Reverse divide `o0 = fp0 / o0` and POP (FPU).
INST_1x(fdivrp, kX86InstIdFdivrp, X86FpReg)
//! \overload
//! Reverse divide `fp1 = fp0 / fp1` and POP (FPU).
INST_0x(fdivrp, kX86InstIdFdivrp)
//! Free FP register (FPU).
INST_1x(ffree, kX86InstIdFfree, X86FpReg)
//! Add 16-bit or 32-bit integer to fp0 (FPU).
//! Add `fp0 = fp0 + short_or_int[o0]` (FPU).
INST_1x_(fiadd, kX86InstIdFiadd, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Compare fp0 with 16-bit or 32-bit Integer (FPU).
//! Compare `fp0` with `short_or_int[o0]` (FPU).
INST_1x_(ficom, kX86InstIdFicom, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Compare fp0 with 16-bit or 32-bit Integer and pop the FPU stack (FPU).
//! Compare `fp0` with `short_or_int[o0]` and POP (FPU).
INST_1x_(ficomp, kX86InstIdFicomp, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Divide fp0 by 32-bit or 16-bit integer (`src`) (FPU).
//! Divide `fp0 = fp0 / short_or_int[o0]` (FPU).
INST_1x_(fidiv, kX86InstIdFidiv, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Reverse divide fp0 by 32-bit or 16-bit integer (`src`) (FPU).
//! Reverse divide `fp0 = short_or_int[o0] / fp0` (FPU).
INST_1x_(fidivr, kX86InstIdFidivr, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Load 16-bit, 32-bit or 64-bit Integer and push it to the FPU stack (FPU).
//! Load `short_or_int_or_long[o0]` and PUSH (FPU).
INST_1x_(fild, kX86InstIdFild, X86Mem, o0.getSize() == 2 || o0.getSize() == 4 || o0.getSize() == 8)
//! Multiply fp0 by 16-bit or 32-bit integer and store it to fp0 (FPU).
//! Multiply `fp0 *= short_or_int[o0]` (FPU).
INST_1x_(fimul, kX86InstIdFimul, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Increment FPU stack-top pointer (FPU).
//! Increment FPU stack pointer (FPU).
INST_0x(fincstp, kX86InstIdFincstp)
//! Initialize FPU (FPU).
INST_0x(finit, kX86InstIdFinit)
//! Subtract 16-bit or 32-bit integer from fp0 and store result to fp0 (FPU).
//! Subtract `fp0 = fp0 - short_or_int[o0]` (FPU).
INST_1x_(fisub, kX86InstIdFisub, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Reverse subtract 16-bit or 32-bit integer from fp0 and store result to fp0 (FPU).
//! Reverse subtract `fp0 = short_or_int[o0] - fp0` (FPU).
INST_1x_(fisubr, kX86InstIdFisubr, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Initialize FPU without checking for pending unmasked exceptions (FPU).
INST_0x(fninit, kX86InstIdFninit)
//! Store fp0 as 16-bit or 32-bit Integer to `o0` (FPU).
//! Store `fp0` as `short_or_int[o0]` (FPU).
INST_1x_(fist, kX86InstIdFist, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Store fp0 as 16-bit, 32-bit or 64-bit Integer to `o0` and pop the FPU stack (FPU).
//! Store `fp0` as `short_or_int_or_long[o0]` and POP (FPU).
INST_1x_(fistp, kX86InstIdFistp, X86Mem, o0.getSize() == 2 || o0.getSize() == 4 || o0.getSize() == 8)
//! Push 32-bit, 64-bit or 80-bit floating point value on the FPU stack (FPU).
//! Load `float_or_double_or_extended[o0]` and PUSH (FPU).
INST_1x_(fld, kX86InstIdFld, X86Mem, o0.getSize() == 4 || o0.getSize() == 8 || o0.getSize() == 10)
//! Push `o0` on the FPU stack (FPU).
//! PUSH `o0` (FPU).
INST_1x(fld, kX86InstIdFld, X86FpReg)
//! Push +1.0 on the FPU stack (FPU).
//! PUSH `1.0` (FPU).
INST_0x(fld1, kX86InstIdFld1)
//! Push log2(10) on the FPU stack (FPU).
//! PUSH `log2(10)` (FPU).
INST_0x(fldl2t, kX86InstIdFldl2t)
//! Push log2(e) on the FPU stack (FPU).
//! PUSH `log2(e)` (FPU).
INST_0x(fldl2e, kX86InstIdFldl2e)
//! Push pi on the FPU stack (FPU).
//! PUSH `pi` (FPU).
INST_0x(fldpi, kX86InstIdFldpi)
//! Push log10(2) on the FPU stack (FPU).
//! PUSH `log10(2)` (FPU).
INST_0x(fldlg2, kX86InstIdFldlg2)
//! Push ln(2) on the FPU stack (FPU).
//! PUSH `ln(2)` (FPU).
INST_0x(fldln2, kX86InstIdFldln2)
//! Push +0.0 on the FPU stack (FPU).
//! PUSH `+0.0` (FPU).
INST_0x(fldz, kX86InstIdFldz)
//! Load x87 FPU control word (2 bytes) (FPU).
//! Load x87 FPU control word from `word_ptr[o0]` (FPU).
INST_1x(fldcw, kX86InstIdFldcw, X86Mem)
//! Load x87 FPU environment (14 or 28 bytes) (FPU).
//! Load x87 FPU environment (14 or 28 bytes) from `[o0]` (FPU).
INST_1x(fldenv, kX86InstIdFldenv, X86Mem)
//! Multiply `o0` by `o1` (one has to be `fp0`) and store result in `o0` (FPU).
//! Multiply `o0 = o0 * o1` (one has to be `fp0`) (FPU).
INST_2x_(fmul, kX86InstIdFmul, X86FpReg, X86FpReg, o0.getRegIndex() == 0 || o1.getRegIndex() == 0)
//! Multiply fp0 by 32-bit or 64-bit `o0` and store result in fp0 (FPU).
//! Multiply `fp0 = fp0 * float_or_double[o0]` (FPU).
INST_1x(fmul, kX86InstIdFmul, X86Mem)
//! Multiply fp0 by `o0` and pop the FPU stack (FPU).
//! Multiply `o0 = o0 * fp0` and POP (FPU).
INST_1x(fmulp, kX86InstIdFmulp, X86FpReg)
//! \overload
//! Multiply `fp1 = fp1 * fp0` and POP (FPU).
INST_0x(fmulp, kX86InstIdFmulp)
//! Clear exceptions (FPU).
INST_0x(fnclex, kX86InstIdFnclex)
//! No operation (FPU).
INST_0x(fnop, kX86InstIdFnop)
//! Save FPU state (FPU).
//! Save FPU state to `[o0]` (FPU).
INST_1x(fnsave, kX86InstIdFnsave, X86Mem)
//! Store x87 FPU environment (FPU).
//! Store x87 FPU environment to `[o0]` (FPU).
INST_1x(fnstenv, kX86InstIdFnstenv, X86Mem)
//! Store x87 FPU control word (FPU).
//! Store x87 FPU control word to `[o0]` (FPU).
INST_1x(fnstcw, kX86InstIdFnstcw, X86Mem)
//! Store x87 FPU status word to `o0` (AX) (FPU).
INST_1x_(fnstsw, kX86InstIdFnstsw, X86GpReg, o0.isRegCode(kX86RegTypeGpw, kX86RegIndexAx))
//! Store x87 FPU status word to `o0` (2 bytes) (FPU).
//! Store x87 FPU status word to `word_ptr[o0]` (FPU).
INST_1x(fnstsw, kX86InstIdFnstsw, X86Mem)
//! Arctan(`fp1` / `fp0`) and pop the FPU stack (FPU).
//! Partial Arctan `fp1 = atan2(fp1, fp0)` and POP (FPU).
INST_0x(fpatan, kX86InstIdFpatan)
//! Fprem(`fp0`, `fp1`) and pop the FPU stack (FPU).
//! Partial Remainder[Trunc] `fp0 = fp0 % fp1` (FPU) - doesn't POP the stack.
INST_0x(fprem, kX86InstIdFprem)
//! Fprem(`fp0`, `fp1`) and pop the FPU stack (FPU).
//! Partial Remainder[Round] `fp0 = fp0 % fp1` (FPU) - doesn't POP the stack.
INST_0x(fprem1, kX86InstIdFprem1)
//! Arctan(`fp0`) and pop the FPU stack (FPU).
//! Partial Tan `fp0 = tan(fp0)` and PUSH `1.0` (FPU).
INST_0x(fptan, kX86InstIdFptan)
//! Round `fp0` to Integer (FPU).
//! Round `fp0 = round(fp0)` (FPU).
INST_0x(frndint, kX86InstIdFrndint)
//! Restore FPU state from `o0` (94 or 108 bytes) (FPU).
//! Restore FPU state from `[o0]` (94 or 108 bytes) (FPU).
INST_1x(frstor, kX86InstIdFrstor, X86Mem)
//! Save FPU state to `o0` (94 or 108 bytes) (FPU).
//! Save FPU state to `[o0]` (94 or 108 bytes) (FPU).
INST_1x(fsave, kX86InstIdFsave, X86Mem)
//! Scale `fp0` by `fp1` (FPU).
//! Scale `fp0 = fp0 * pow(2, RoundTowardsZero(fp1))` (FPU).
INST_0x(fscale, kX86InstIdFscale)
//! Sine of `fp0` and store result in `fp0` (FPU).
//! Sin `fp0 = sin(fp0)` (FPU).
INST_0x(fsin, kX86InstIdFsin)
//! Sine and cosine of `fp0`, store sine in `fp0` and push cosine on the FPU stack (FPU).
//! Sincos `fp0 = sin(fp0)` and PUSH `cos(fp0)` (FPU).
INST_0x(fsincos, kX86InstIdFsincos)
//! Square root of `fp0` and store it in `fp0` (FPU).
//! Square root `fp0 = sqrt(fp0)` (FPU).
INST_0x(fsqrt, kX86InstIdFsqrt)
//! Store floating point value to 32-bit or 64-bit memory location (FPU).
//! Store floating point value to `float_or_double[o0]` (FPU).
INST_1x_(fst, kX86InstIdFst, X86Mem, o0.getSize() == 4 || o0.getSize() == 8)
//! Store floating point value to `o0` (FPU).
//! Copy `o0 = fp0` (FPU).
INST_1x(fst, kX86InstIdFst, X86FpReg)
//! Store floating point value to 32-bit or 64-bit memory location and pop the FPU stack (FPU).
//! Store floating point value to `float_or_double_or_extended[o0]` and POP (FPU).
INST_1x_(fstp, kX86InstIdFstp, X86Mem, o0.getSize() == 4 || o0.getSize() == 8 || o0.getSize() == 10)
//! Store floating point value to `o0` and pop the FPU stack (FPU).
//! Copy `o0 = fp0` and POP (FPU).
INST_1x(fstp, kX86InstIdFstp, X86FpReg)
//! Store x87 FPU control word to `o0` (2 bytes) (FPU).
//! Store x87 FPU control word to `word_ptr[o0]` (FPU).
INST_1x(fstcw, kX86InstIdFstcw, X86Mem)
//! Store x87 FPU environment to `o0` (14 or 28 bytes) (FPU).
//! Store x87 FPU environment to `[o0]` (14 or 28 bytes) (FPU).
INST_1x(fstenv, kX86InstIdFstenv, X86Mem)
//! Store x87 FPU status word to AX (FPU).
//! Store x87 FPU status word to `o0` (AX) (FPU).
INST_1x_(fstsw, kX86InstIdFstsw, X86GpReg, o0.getRegIndex() == kX86RegIndexAx)
//! Store x87 FPU status word (2 bytes) (FPU).
//! Store x87 FPU status word to `word_ptr[o0]` (FPU).
INST_1x(fstsw, kX86InstIdFstsw, X86Mem)
//! Subtract `o0` from `o0` (one has to be `fp0`) and store result in `o0` (FPU).
//! Subtract `o0 = o0 - o1` (one has to be `fp0`) (FPU).
INST_2x_(fsub, kX86InstIdFsub, X86FpReg, X86FpReg, o0.getRegIndex() == 0 || o1.getRegIndex() == 0)
//! Subtract 32-bit or 64-bit `o0` from fp0 and store result in fp0 (FPU).
//! Subtract `fp0 = fp0 - float_or_double[o0]` (FPU).
INST_1x_(fsub, kX86InstIdFsub, X86Mem, o0.getSize() == 4 || o0.getSize() == 8)
//! Subtract fp0 from `o0` and pop FPU stack (FPU).
//! Subtract `o0 = o0 - fp0` and POP (FPU).
INST_1x(fsubp, kX86InstIdFsubp, X86FpReg)
//! \overload
//! Subtract `fp1 = fp1 - fp0` and POP (FPU).
INST_0x(fsubp, kX86InstIdFsubp)
//! Reverse subtract `o1` from `o0` (one has to be `fp0`) and store result in `o0` (FPU).
//! Reverse subtract `o0 = o1 - o0` (one has to be `fp0`) (FPU).
INST_2x_(fsubr, kX86InstIdFsubr, X86FpReg, X86FpReg, o0.getRegIndex() == 0 || o1.getRegIndex() == 0)
//! Reverse subtract 32-bit or 64-bit `o0` from `fp0` and store result in `fp0` (FPU).
//! Reverse subtract `fp0 = fp0 - float_or_double[o0]` (FPU).
INST_1x_(fsubr, kX86InstIdFsubr, X86Mem, o0.getSize() == 4 || o0.getSize() == 8)
//! Reverse subtract `fp0` from `o0` and pop FPU stack (FPU).
//! Reverse subtract `o0 = o0 - fp0` and POP (FPU).
INST_1x(fsubrp, kX86InstIdFsubrp, X86FpReg)
//! \overload
//! Reverse subtract `fp1 = fp1 - fp0` and POP (FPU).
INST_0x(fsubrp, kX86InstIdFsubrp)
//! Floating point test - Compare `fp0` with 0.0. (FPU).
//! Compare `fp0` with `0.0` (FPU).
INST_0x(ftst, kX86InstIdFtst)
//! Unordered compare `fp0` with `o0` (FPU).
INST_1x(fucom, kX86InstIdFucom, X86FpReg)
//! Unordered compare `fp0` with `fp1` (FPU).
INST_0x(fucom, kX86InstIdFucom)
//! Unordered compare `fp0` and `o0`, check for ordered values and set EFLAGS (FPU).
//! Unordered compare `fp0` with `o0`, check for ordered values and set EFLAGS (FPU).
INST_1x(fucomi, kX86InstIdFucomi, X86FpReg)
//! Unordered compare `fp0` and `o0`, check for ordered values and set EFLAGS and pop the FPU stack (FPU).
//! Unordered compare `fp0` with `o0`, check for ordered values and set EFLAGS and POP (FPU).
INST_1x(fucomip, kX86InstIdFucomip, X86FpReg)
//! Unordered compare `fp0` with `o0` and pop the FPU stack (FPU).
//! Unordered compare `fp0` with `o0` and POP (FPU).
INST_1x(fucomp, kX86InstIdFucomp, X86FpReg)
//! Unordered compare `fp0` with `fp1` and pop the FPU stack (FPU).
//! Unordered compare `fp0` with `fp1` and POP (FPU).
INST_0x(fucomp, kX86InstIdFucomp)
//! Unordered compare `fp0` with `fp1` and pop the FPU stack twice (FPU).
//! Unordered compare `fp0` with `fp1` and POP twice (FPU).
INST_0x(fucompp, kX86InstIdFucompp)
INST_0x(fwait, kX86InstIdFwait)
//! Examine fp0 (FPU).
INST_0x(fxam, kX86InstIdFxam)
//! Exchange content of fp0 with `o0` (FPU).
//! Exchange `fp0` with `o0` (FPU).
INST_1x(fxch, kX86InstIdFxch, X86FpReg)
//! Restore FP/MMX/SIMD extension states to `o0` (512 bytes) (FPU, MMX, SSE).
INST_1x(fxrstor, kX86InstIdFxrstor, X86Mem)
//! Store FP/MMX/SIMD extension states to `o0` (512 bytes) (FPU, MMX, SSE).
INST_1x(fxsave, kX86InstIdFxsave, X86Mem)
//! Extract exponent and store to `fp0` and push significand on the FPU stack (FPU).
//! Extract `fp0 = exponent(fp0)` and PUSH `significand(fp0)` (FPU).
INST_0x(fxtract, kX86InstIdFxtract)
//! Compute `fp1 * log2(fp0)`, pop the FPU stack and store result in `fp0` (FPU).
//! Compute `fp1 = fp1 * log2(fp0)` and POP (FPU).
INST_0x(fyl2x, kX86InstIdFyl2x)
//! Compute `fp1 * log2(fp0 + 1)`, pop the FPU stack and store result in `fp0` (FPU).
//! Compute `fp1 = fp1 * log2(fp0 + 1)` and POP (FPU).
INST_0x(fyl2xp1, kX86InstIdFyl2xp1)
// --------------------------------------------------------------------------
@@ -2356,12 +2458,12 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
//! \overload
INST_2x(cvtdq2ps, kX86InstIdCvtdq2ps, X86XmmReg, X86Mem)
//! Convert packed DP-FP to packed QWORDs (SSE2).
//! Convert packed DP-FP to packed DWORDs (SSE2).
INST_2x(cvtpd2dq, kX86InstIdCvtpd2dq, X86XmmReg, X86XmmReg)
//! \overload
INST_2x(cvtpd2dq, kX86InstIdCvtpd2dq, X86XmmReg, X86Mem)
//! Convert packed DP-FP to packed QRODSs (SSE2).
//! Convert packed DP-FP to packed DWORDs (SSE2).
INST_2x(cvtpd2pi, kX86InstIdCvtpd2pi, X86MmReg, X86XmmReg)
//! \overload
INST_2x(cvtpd2pi, kX86InstIdCvtpd2pi, X86MmReg, X86Mem)
@@ -2376,7 +2478,7 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
//! \overload
INST_2x(cvtpi2pd, kX86InstIdCvtpi2pd, X86XmmReg, X86Mem)
//! Convert packed SP-FP to packed QWORDs (SSE2).
//! Convert packed SP-FP to packed DWORDs (SSE2).
INST_2x(cvtps2dq, kX86InstIdCvtps2dq, X86XmmReg, X86XmmReg)
//! \overload
INST_2x(cvtps2dq, kX86InstIdCvtps2dq, X86XmmReg, X86Mem)
@@ -2411,12 +2513,12 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
//! \overload
INST_2x(cvttpd2pi, kX86InstIdCvttpd2pi, X86MmReg, X86Mem)
//! Convert with truncation packed DP-FP to packed QWORDs (SSE2).
//! Convert with truncation packed DP-FP to packed DWORDs (SSE2).
INST_2x(cvttpd2dq, kX86InstIdCvttpd2dq, X86XmmReg, X86XmmReg)
//! \overload
INST_2x(cvttpd2dq, kX86InstIdCvttpd2dq, X86XmmReg, X86Mem)
//! Convert with truncation packed SP-FP to packed QWORDs (SSE2).
//! Convert with truncation packed SP-FP to packed DWORDs (SSE2).
INST_2x(cvttps2dq, kX86InstIdCvttps2dq, X86XmmReg, X86XmmReg)
//! \overload
INST_2x(cvttps2dq, kX86InstIdCvttps2dq, X86XmmReg, X86Mem)
@@ -2951,8 +3053,7 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
//! \overload
INST_2x(addsubps, kX86InstIdAddsubps, X86XmmReg, X86Mem)
//! Store truncated `fp0` as 16-bit, 32-bit or 64-bit integer to `o0` and pop
//! the FPU stack (FPU / SSE3).
//! Store truncated `fp0` to `short_or_int_or_long[o0]` and POP (FPU & SSE3).
INST_1x(fisttp, kX86InstIdFisttp, X86Mem)
//! Packed DP-FP horizontal add (SSE3).
@@ -3412,6 +3513,11 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
// [SSE4.2]
// --------------------------------------------------------------------------
//! Accumulate crc32 value (polynomial 0x11EDC6F41) (SSE4.2).
INST_2x_(crc32, kX86InstIdCrc32, X86GpReg, X86GpReg, o0.isRegType(kX86RegTypeGpd) || o0.isRegType(kX86RegTypeGpq))
//! \overload
INST_2x_(crc32, kX86InstIdCrc32, X86GpReg, X86Mem, o0.isRegType(kX86RegTypeGpd) || o0.isRegType(kX86RegTypeGpq))
//! Packed compare explicit length strings, return index (SSE4.2).
INST_3i(pcmpestri, kX86InstIdPcmpestri, X86XmmReg, X86XmmReg, Imm)
//! \overload
@@ -3437,6 +3543,43 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
//! \overload
INST_2x(pcmpgtq, kX86InstIdPcmpgtq, X86XmmReg, X86Mem)
// --------------------------------------------------------------------------
// [SSE4a]
// --------------------------------------------------------------------------
//! Extract a bit-field from the low QWORD of `o0`; the field length and
//! the bit index are taken from bits [5:0] and [13:8] of `o1` (SSE4a).
INST_2x(extrq, kX86InstIdExtrq, X86XmmReg, X86XmmReg)
//! Extract a bit-field from the low QWORD of `o0`; `o1` is the field
//! length and `o2` is the bit index, both given as immediates (SSE4a).
INST_3ii(extrq, kX86InstIdExtrq, X86XmmReg, Imm, Imm)
//! Insert the low bits of `o1` into the low QWORD of `o0`; the field
//! length and the bit index are taken from bits [69:64] and [77:72] of
//! `o1` (SSE4a).
INST_2x(insertq, kX86InstIdInsertq, X86XmmReg, X86XmmReg)
//! Insert a bit-field from the low QWORD of `o1` into the low QWORD of
//! `o0`; `o2` is the field length and `o3` is the bit index, both given
//! as immediates (SSE4a).
INST_4ii(insertq, kX86InstIdInsertq, X86XmmReg, X86XmmReg, Imm, Imm)
//! Store the scalar DP-FP value in `o1` to `[o0]` using a non-temporal
//! (cache-bypassing) hint (SSE4a).
INST_2x(movntsd, kX86InstIdMovntsd, X86Mem, X86XmmReg)
//! Store the scalar SP-FP value in `o1` to `[o0]` using a non-temporal
//! (cache-bypassing) hint (SSE4a).
INST_2x(movntss, kX86InstIdMovntss, X86Mem, X86XmmReg)
// --------------------------------------------------------------------------
// [POPCNT]
// --------------------------------------------------------------------------
//! Count the number of bits set to 1 in `o1` and store the result in `o0`
//! (POPCNT).
//!
//! 8-bit operands are not encodable and both registers have to be of the
//! same type (16-bit, 32-bit or 64-bit), as enforced by the condition.
INST_2x_(popcnt, kX86InstIdPopcnt, X86GpReg, X86GpReg, !o0.isGpb() && o0.getRegType() == o1.getRegType())
//! \overload
INST_2x_(popcnt, kX86InstIdPopcnt, X86GpReg, X86Mem, !o0.isGpb())
// --------------------------------------------------------------------------
// [LZCNT]
// --------------------------------------------------------------------------
//! Count the number of leading zero bits in `o1` and store the result in
//! `o0` (LZCNT).
//!
//! NOTE: On CPUs without LZCNT support the encoding decodes as BSR, which
//! has different semantics (bit index instead of zero count).
INST_2x(lzcnt, kX86InstIdLzcnt, X86GpReg, X86GpReg)
//! \overload
INST_2x(lzcnt, kX86InstIdLzcnt, X86GpReg, X86Mem)
// --------------------------------------------------------------------------
// [AESNI]
// --------------------------------------------------------------------------
@@ -3480,6 +3623,30 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
//! \overload
INST_3i(pclmulqdq, kX86InstIdPclmulqdq, X86XmmReg, X86Mem, Imm)
// --------------------------------------------------------------------------
// [XSAVE]
// --------------------------------------------------------------------------
//! Restore Processor Extended States specified by `EDX:EAX` from `[o0]` (XSAVE).
INST_1x(xrstor, kX86InstIdXrstor, X86Mem)
//! Restore Processor Extended States specified by `EDX:EAX` from `[o0]` (XSAVE&X64).
INST_1x(xrstor64, kX86InstIdXrstor64, X86Mem)
//! Save Processor Extended States specified by `EDX:EAX` to `[o0]` (XSAVE).
INST_1x(xsave, kX86InstIdXsave, X86Mem)
//! Save Processor Extended States specified by `EDX:EAX` to `[o0]` (XSAVE&X64).
INST_1x(xsave64, kX86InstIdXsave64, X86Mem)
//! Save Processor Extended States specified by `EDX:EAX` (Optimized) (XSAVEOPT).
INST_1x(xsaveopt, kX86InstIdXsaveopt, X86Mem)
//! Save Processor Extended States specified by `EDX:EAX` (Optimized) (XSAVEOPT&X64).
INST_1x(xsaveopt64, kX86InstIdXsaveopt64, X86Mem)
//! Get XCR - `EDX:EAX <- XCR[ECX]` (XSAVE).
INST_0x(xgetbv, kX86InstIdXgetbv)
//! Set XCR - `XCR[ECX] <- EDX:EAX` (XSAVE).
INST_0x(xsetbv, kX86InstIdXsetbv)
// --------------------------------------------------------------------------
// [AVX]
// --------------------------------------------------------------------------
@@ -3667,7 +3834,7 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
//! \overload
INST_2x(vcvtdq2ps, kX86InstIdVcvtdq2ps, X86YmmReg, X86Mem)
//! Convert packed DP-FP to packed QWORDs (AVX).
//! Convert packed DP-FP to packed DWORDs (AVX).
INST_2x(vcvtpd2dq, kX86InstIdVcvtpd2dq, X86XmmReg, X86XmmReg)
//! \overload
INST_2x(vcvtpd2dq, kX86InstIdVcvtpd2dq, X86XmmReg, X86YmmReg)
@@ -3681,7 +3848,7 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
//! \overload
INST_2x(vcvtpd2ps, kX86InstIdVcvtpd2ps, X86XmmReg, X86Mem)
//! Convert packed SP-FP to packed QWORDs (AVX).
//! Convert packed SP-FP to packed DWORDs (AVX).
INST_2x(vcvtps2dq, kX86InstIdVcvtps2dq, X86XmmReg, X86XmmReg)
//! \overload
INST_2x(vcvtps2dq, kX86InstIdVcvtps2dq, X86XmmReg, X86Mem)
@@ -3729,14 +3896,14 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
//! \overload
INST_2x(vcvtss2si, kX86InstIdVcvtss2si, X86GpReg, X86Mem)
//! Convert with truncation packed DP-FP to packed QWORDs (AVX).
//! Convert with truncation packed DP-FP to packed DWORDs (AVX).
INST_2x(vcvttpd2dq, kX86InstIdVcvttpd2dq, X86XmmReg, X86XmmReg)
//! \overload
INST_2x(vcvttpd2dq, kX86InstIdVcvttpd2dq, X86XmmReg, X86YmmReg)
//! \overload
INST_2x(vcvttpd2dq, kX86InstIdVcvttpd2dq, X86XmmReg, X86Mem)
//! Convert with truncation packed SP-FP to packed QWORDs (AVX).
//! Convert with truncation packed SP-FP to packed DWORDs (AVX).
INST_2x(vcvttps2dq, kX86InstIdVcvttps2dq, X86XmmReg, X86XmmReg)
//! \overload
INST_2x(vcvttps2dq, kX86InstIdVcvttps2dq, X86XmmReg, X86Mem)
@@ -3745,7 +3912,7 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
//! \overload
INST_2x(vcvttps2dq, kX86InstIdVcvttps2dq, X86YmmReg, X86Mem)
//! Convert with truncation scalar DP-FP to DWORD (AVX).
//! Convert with truncation scalar DP-FP to INT32 (AVX).
INST_2x(vcvttsd2si, kX86InstIdVcvttsd2si, X86GpReg, X86XmmReg)
//! \overload
INST_2x(vcvttsd2si, kX86InstIdVcvttsd2si, X86GpReg, X86Mem)
@@ -6393,15 +6560,6 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
//! \overload
INST_2x(tzcnt, kX86InstIdTzcnt, X86GpReg, X86Mem)
// --------------------------------------------------------------------------
// [LZCNT]
// --------------------------------------------------------------------------
//! Count the number of leading zero bits (LZCNT).
INST_2x(lzcnt, kX86InstIdLzcnt, X86GpReg, X86GpReg)
//! \overload
INST_2x(lzcnt, kX86InstIdLzcnt, X86GpReg, X86Mem)
// --------------------------------------------------------------------------
// [BMI2]
// --------------------------------------------------------------------------
@@ -6450,11 +6608,11 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
// [RDRAND]
// --------------------------------------------------------------------------
//! Store a random number in destination register.
//! Store a random number in destination register (RDRAND).
//!
//! Please do not use this instruction in cryptographic software. The result
//! doesn't necessarily have to be random which may cause a major security
//! issue in the software that relies on it.
//! doesn't necessarily have to be random, which may cause a major security
//! hole in the software.
INST_1x(rdrand, kX86InstIdRdrand, X86GpReg)
// --------------------------------------------------------------------------
@@ -6503,10 +6661,12 @@ struct ASMJIT_VCLASS X86Assembler : public Assembler {
#undef INST_3x
#undef INST_3x_
#undef INST_3i
#undef INST_3ii
#undef INST_4x
#undef INST_4x_
#undef INST_4i
#undef INST_4ii
};
//! \}

View File

@@ -60,14 +60,18 @@ const X86VarInfo _x86VarInfo[] = {
/* 10: kVarTypeFp32 */ { kX86RegTypeFp , 4 , C(Fp) , D(Sp) , "fp" },
/* 11: kVarTypeFp64 */ { kX86RegTypeFp , 8 , C(Fp) , D(Dp) , "fp" },
/* 12: kX86VarTypeMm */ { kX86RegTypeMm , 8 , C(Mm) , 0 , "mm" },
/* 13: kX86VarTypeXmm */ { kX86RegTypeXmm , 16, C(Xyz), 0 , "xmm" },
/* 14: kX86VarTypeXmmSs */ { kX86RegTypeXmm , 4 , C(Xyz), D(Sp) , "xmm" },
/* 15: kX86VarTypeXmmPs */ { kX86RegTypeXmm , 16, C(Xyz), D(Sp) | D(Packed), "xmm" },
/* 16: kX86VarTypeXmmSd */ { kX86RegTypeXmm , 8 , C(Xyz), D(Dp) , "xmm" },
/* 17: kX86VarTypeXmmPd */ { kX86RegTypeXmm , 16, C(Xyz), D(Dp) | D(Packed), "xmm" },
/* 18: kX86VarTypeYmm */ { kX86RegTypeYmm , 32, C(Xyz), 0 , "ymm" },
/* 19: kX86VarTypeYmmPs */ { kX86RegTypeYmm , 32, C(Xyz), D(Sp) | D(Packed), "ymm" },
/* 20: kX86VarTypeYmmPd */ { kX86RegTypeYmm , 32, C(Xyz), D(Dp) | D(Packed), "ymm" }
/* 13: kX86VarTypeK */ { kX86RegTypeK , 8 , C(K) , 0 , "k" },
/* 14: kX86VarTypeXmm */ { kX86RegTypeXmm , 16, C(Xyz), 0 , "xmm" },
/* 15: kX86VarTypeXmmSs */ { kX86RegTypeXmm , 4 , C(Xyz), D(Sp) , "xmm" },
/* 16: kX86VarTypeXmmPs */ { kX86RegTypeXmm , 16, C(Xyz), D(Sp) | D(Packed), "xmm" },
/* 17: kX86VarTypeXmmSd */ { kX86RegTypeXmm , 8 , C(Xyz), D(Dp) , "xmm" },
/* 18: kX86VarTypeXmmPd */ { kX86RegTypeXmm , 16, C(Xyz), D(Dp) | D(Packed), "xmm" },
/* 19: kX86VarTypeYmm */ { kX86RegTypeYmm , 32, C(Xyz), 0 , "ymm" },
/* 20: kX86VarTypeYmmPs */ { kX86RegTypeYmm , 32, C(Xyz), D(Sp) | D(Packed), "ymm" },
/* 21: kX86VarTypeYmmPd */ { kX86RegTypeYmm , 32, C(Xyz), D(Dp) | D(Packed), "ymm" },
/* 22: kX86VarTypeZmm */ { kX86RegTypeZmm , 64, C(Xyz), 0 , "zmm" },
/* 23: kX86VarTypeZmmPs */ { kX86RegTypeZmm , 64, C(Xyz), D(Sp) | D(Packed), "zmm" },
/* 24: kX86VarTypeZmmPd */ { kX86RegTypeZmm , 64, C(Xyz), D(Dp) | D(Packed), "zmm" }
};
#undef D
@@ -88,14 +92,18 @@ const uint8_t _x86VarMapping[kX86VarTypeCount] = {
/* 10: kVarTypeFp32 */ kVarTypeFp32,
/* 11: kVarTypeFp64 */ kVarTypeFp64,
/* 12: kX86VarTypeMm */ kX86VarTypeMm,
/* 13: kX86VarTypeXmm */ kX86VarTypeXmm,
/* 14: kX86VarTypeXmmSs */ kX86VarTypeXmmSs,
/* 15: kX86VarTypeXmmPs */ kX86VarTypeXmmPs,
/* 16: kX86VarTypeXmmSd */ kX86VarTypeXmmSd,
/* 17: kX86VarTypeXmmPd */ kX86VarTypeXmmPd,
/* 18: kX86VarTypeYmm */ kX86VarTypeYmm,
/* 19: kX86VarTypeYmmPs */ kX86VarTypeYmmPs,
/* 20: kX86VarTypeYmmPd */ kX86VarTypeYmmPd
/* 13: kX86VarTypeK */ kX86VarTypeK,
/* 14: kX86VarTypeXmm */ kX86VarTypeXmm,
/* 15: kX86VarTypeXmmSs */ kX86VarTypeXmmSs,
/* 16: kX86VarTypeXmmPs */ kX86VarTypeXmmPs,
/* 17: kX86VarTypeXmmSd */ kX86VarTypeXmmSd,
/* 18: kX86VarTypeXmmPd */ kX86VarTypeXmmPd,
/* 19: kX86VarTypeYmm */ kX86VarTypeYmm,
/* 20: kX86VarTypeYmmPs */ kX86VarTypeYmmPs,
/* 21: kX86VarTypeYmmPd */ kX86VarTypeYmmPd,
/* 22: kX86VarTypeZmm */ kX86VarTypeZmm,
/* 23: kX86VarTypeZmmPs */ kX86VarTypeZmmPs,
/* 24: kX86VarTypeZmmPd */ kX86VarTypeZmmPd
};
#endif // ASMJIT_BUILD_X86
@@ -114,14 +122,18 @@ const uint8_t _x64VarMapping[kX86VarTypeCount] = {
/* 10: kVarTypeFp32 */ kVarTypeFp32,
/* 11: kVarTypeFp64 */ kVarTypeFp64,
/* 12: kX86VarTypeMm */ kX86VarTypeMm,
/* 13: kX86VarTypeXmm */ kX86VarTypeXmm,
/* 14: kX86VarTypeXmmSs */ kX86VarTypeXmmSs,
/* 15: kX86VarTypeXmmPs */ kX86VarTypeXmmPs,
/* 16: kX86VarTypeXmmSd */ kX86VarTypeXmmSd,
/* 17: kX86VarTypeXmmPd */ kX86VarTypeXmmPd,
/* 18: kX86VarTypeYmm */ kX86VarTypeYmm,
/* 19: kX86VarTypeYmmPs */ kX86VarTypeYmmPs,
/* 20: kX86VarTypeYmmPd */ kX86VarTypeYmmPd
/* 13: kX86VarTypeK */ kX86VarTypeK,
/* 14: kX86VarTypeXmm */ kX86VarTypeXmm,
/* 15: kX86VarTypeXmmSs */ kX86VarTypeXmmSs,
/* 16: kX86VarTypeXmmPs */ kX86VarTypeXmmPs,
/* 17: kX86VarTypeXmmSd */ kX86VarTypeXmmSd,
/* 18: kX86VarTypeXmmPd */ kX86VarTypeXmmPd,
/* 19: kX86VarTypeYmm */ kX86VarTypeYmm,
/* 20: kX86VarTypeYmmPs */ kX86VarTypeYmmPs,
/* 21: kX86VarTypeYmmPd */ kX86VarTypeYmmPd,
/* 22: kX86VarTypeZmm */ kX86VarTypeZmm,
/* 23: kX86VarTypeZmmPs */ kX86VarTypeZmmPs,
/* 24: kX86VarTypeZmmPd */ kX86VarTypeZmmPd
};
#endif // ASMJIT_BUILD_X64
@@ -482,14 +494,14 @@ static Error X86FuncDecl_initFunc(X86FuncDecl* self, uint32_t arch,
if (x86ArgIsInt(varType) && i < ASMJIT_ARRAY_SIZE(self->_passedOrderGp)) {
arg._regIndex = self->_passedOrderGp[i];
self->_used.add(kX86RegClassGp, IntUtil::mask(arg.getRegIndex()));
self->_used.or_(kX86RegClassGp, IntUtil::mask(arg.getRegIndex()));
continue;
}
if (x86ArgIsFp(varType) && i < ASMJIT_ARRAY_SIZE(self->_passedOrderXmm)) {
arg._varType = static_cast<uint8_t>(x86ArgTypeToXmmType(varType));
arg._regIndex = self->_passedOrderXmm[i];
self->_used.add(kX86RegClassXyz, IntUtil::mask(arg.getRegIndex()));
self->_used.or_(kX86RegClassXyz, IntUtil::mask(arg.getRegIndex()));
}
}
@@ -527,7 +539,7 @@ static Error X86FuncDecl_initFunc(X86FuncDecl* self, uint32_t arch,
continue;
arg._regIndex = self->_passedOrderGp[gpPos++];
self->_used.add(kX86RegClassGp, IntUtil::mask(arg.getRegIndex()));
self->_used.or_(kX86RegClassGp, IntUtil::mask(arg.getRegIndex()));
}
// Register arguments (Xmm), always left-to-right.
@@ -538,7 +550,7 @@ static Error X86FuncDecl_initFunc(X86FuncDecl* self, uint32_t arch,
if (x86ArgIsFp(varType)) {
arg._varType = static_cast<uint8_t>(x86ArgTypeToXmmType(varType));
arg._regIndex = self->_passedOrderXmm[xmmPos++];
self->_used.add(kX86RegClassXyz, IntUtil::mask(arg.getRegIndex()));
self->_used.or_(kX86RegClassXyz, IntUtil::mask(arg.getRegIndex()));
}
}
@@ -722,9 +734,9 @@ Error X86Compiler::setArch(uint32_t arch) {
_regCount.reset();
_regCount._gp = 8;
_regCount._fp = 8;
_regCount._mm = 8;
_regCount._xy = 8;
_regCount._k = 8;
_regCount._xyz = 8;
zax = x86::eax;
zcx = x86::ecx;
@@ -747,9 +759,9 @@ Error X86Compiler::setArch(uint32_t arch) {
_regCount.reset();
_regCount._gp = 16;
_regCount._fp = 8;
_regCount._mm = 8;
_regCount._xy = 16;
_regCount._k = 8;
_regCount._xyz = 16;
zax = x86::rax;
zcx = x86::rcx;
@@ -783,7 +795,7 @@ static InstNode* X86Compiler_newInst(X86Compiler* self, void* p, uint32_t code,
JumpNode* node = new(p) JumpNode(self, code, options, opList, opCount);
TargetNode* jTarget = self->getTargetById(opList[0].getId());
node->addFlags(code == kX86InstIdJmp ? kNodeFlagIsJmp | kNodeFlagIsTaken : kNodeFlagIsJcc);
node->orFlags(code == kX86InstIdJmp ? kNodeFlagIsJmp | kNodeFlagIsTaken : kNodeFlagIsJcc);
node->_target = jTarget;
node->_jumpNext = static_cast<JumpNode*>(jTarget->_from);
@@ -792,9 +804,9 @@ static InstNode* X86Compiler_newInst(X86Compiler* self, void* p, uint32_t code,
// The 'jmp' is always taken, conditional jump can contain hint, we detect it.
if (code == kX86InstIdJmp)
node->addFlags(kNodeFlagIsTaken);
node->orFlags(kNodeFlagIsTaken);
else if (options & kInstOptionTaken)
node->addFlags(kNodeFlagIsTaken);
node->orFlags(kNodeFlagIsTaken);
node->addOptions(options);
return node;
@@ -1025,6 +1037,22 @@ InstNode* X86Compiler::emit(uint32_t code, const Operand& o0, const Operand& o1,
return static_cast<InstNode*>(addNode(node));
}
//! Emit an instruction with three operands and a trailing `int` immediate.
InstNode* X86Compiler::emit(uint32_t code, const Operand& o0, const Operand& o1, const Operand& o2, int o3_) {
  // Wrap the raw integer into an `Imm` operand and forward to the generic path.
  const Imm immOp3(o3_);
  InstNode* inst = newInst(code, o0, o1, o2, immOp3);
  return inst == NULL ? NULL : static_cast<InstNode*>(addNode(inst));
}
//! Emit an instruction with three operands and a trailing `uint64_t` immediate.
InstNode* X86Compiler::emit(uint32_t code, const Operand& o0, const Operand& o1, const Operand& o2, uint64_t o3_) {
  // Wrap the raw integer into an `Imm` operand and forward to the generic path.
  const Imm immOp3(o3_);
  InstNode* inst = newInst(code, o0, o1, o2, immOp3);
  return inst == NULL ? NULL : static_cast<InstNode*>(addNode(inst));
}
// ============================================================================
// [asmjit::X86Compiler - Func]
// ============================================================================

View File

@@ -46,24 +46,34 @@ ASMJIT_ENUM(kX86VarType) {
//! Variable is Mm (MMX).
kX86VarTypeMm = 12,
//! Variable is K (AVX512+)
kX86VarTypeK,
//! Variable is Xmm (SSE+).
kX86VarTypeXmm,
//! Variable is scalar Xmm SP-FP number.
//! Variable is a scalar Xmm SP-FP number.
kX86VarTypeXmmSs,
//! Variable is packed Xmm SP-FP number (4 floats).
//! Variable is a packed Xmm SP-FP number (4 floats).
kX86VarTypeXmmPs,
//! Variable is scalar Xmm DP-FP number.
//! Variable is a scalar Xmm DP-FP number.
kX86VarTypeXmmSd,
//! Variable is packed Xmm DP-FP number (2 doubles).
//! Variable is a packed Xmm DP-FP number (2 doubles).
kX86VarTypeXmmPd,
//! Variable is Ymm (AVX+).
kX86VarTypeYmm,
//! Variable is packed Ymm SP-FP number (8 floats).
//! Variable is a packed Ymm SP-FP number (8 floats).
kX86VarTypeYmmPs,
//! Variable is packed Ymm DP-FP number (4 doubles).
//! Variable is a packed Ymm DP-FP number (4 doubles).
kX86VarTypeYmmPd,
//! Variable is Zmm (AVX512+).
kX86VarTypeZmm,
//! Variable is a packed Zmm SP-FP number (16 floats).
kX86VarTypeZmmPs,
//! Variable is a packed Zmm DP-FP number (8 doubles).
kX86VarTypeZmmPd,
//! Count of variable types.
kX86VarTypeCount,
@@ -76,7 +86,10 @@ ASMJIT_ENUM(kX86VarType) {
_kX86VarTypeXmmEnd = kX86VarTypeXmmPd,
_kX86VarTypeYmmStart = kX86VarTypeYmm,
_kX86VarTypeYmmEnd = kX86VarTypeYmmPd
_kX86VarTypeYmmEnd = kX86VarTypeYmmPd,
_kX86VarTypeZmmStart = kX86VarTypeZmm,
_kX86VarTypeZmmEnd = kX86VarTypeZmmPd
//! \}
};
@@ -552,6 +565,9 @@ struct X86Var : public Var {
//! Get whether the variable is Mm (64-bit) register.
ASMJIT_INLINE bool isMm() const { return _vreg.type == kX86RegTypeMm; }
//! Get whether the variable is K (64-bit) register.
ASMJIT_INLINE bool isK() const { return _vreg.type == kX86RegTypeK; }
//! Get whether the variable is Xmm (128-bit) register.
ASMJIT_INLINE bool isXmm() const { return _vreg.type == kX86RegTypeXmm; }
//! Get whether the variable is Ymm (256-bit) register.
@@ -2016,6 +2032,7 @@ struct ASMJIT_VCLASS X86Compiler : public Compiler {
= kArchHost
#endif // ASMJIT_HOST_X86 || ASMJIT_HOST_X64
);
//! Destroy the `X86Compiler` instance.
ASMJIT_API ~X86Compiler();
@@ -2023,7 +2040,12 @@ struct ASMJIT_VCLASS X86Compiler : public Compiler {
// [Arch]
// --------------------------------------------------------------------------
//! Get count of registers of the current architecture.
//! \internal
//!
//! Set the architecture to `arch`.
ASMJIT_API Error setArch(uint32_t arch);
//! Get count of registers of the current architecture and mode.
ASMJIT_INLINE const X86RegCount& getRegCount() const {
return _regCount;
}
@@ -2075,8 +2097,6 @@ struct ASMJIT_VCLASS X86Compiler : public Compiler {
return x86::ptr_abs(pAbs, index, shift, disp, _regSize);
}
ASMJIT_API Error setArch(uint32_t arch);
// --------------------------------------------------------------------------
// [Inst / Emit]
// --------------------------------------------------------------------------
@@ -2119,6 +2139,10 @@ struct ASMJIT_VCLASS X86Compiler : public Compiler {
ASMJIT_API InstNode* emit(uint32_t code, const Operand& o0, const Operand& o1, int o2);
//! \overload
ASMJIT_API InstNode* emit(uint32_t code, const Operand& o0, const Operand& o1, uint64_t o2);
//! \overload
ASMJIT_API InstNode* emit(uint32_t code, const Operand& o0, const Operand& o1, const Operand& o2, int o3);
//! \overload
ASMJIT_API InstNode* emit(uint32_t code, const Operand& o0, const Operand& o1, const Operand& o2, uint64_t o3);
// --------------------------------------------------------------------------
// [Func]
@@ -2438,7 +2462,7 @@ struct ASMJIT_VCLASS X86Compiler : public Compiler {
X86GpReg zdi;
// --------------------------------------------------------------------------
// [X86 Instructions]
// [Emit]
// --------------------------------------------------------------------------
#define INST_0x(_Inst_, _Code_) \
@@ -2614,6 +2638,31 @@ struct ASMJIT_VCLASS X86Compiler : public Compiler {
return emit(_Code_, o0, o1, o2); \
}
// INST_3ii - Declare a 3-operand intrinsic whose two trailing operands are
// immediates. Besides the generic `(_Op0_, _Op1_, _Op2_)` form it generates
// convenience overloads for the common integer types: `o1` is wrapped into an
// `Imm` operand and `o2` is forwarded through the `int` / `uint64_t` `emit()`
// overloads (unsigned and 64-bit values go through the `uint64_t` path so the
// immediate is not sign-extended).
#define INST_3ii(_Inst_, _Code_, _Op0_, _Op1_, _Op2_) \
ASMJIT_INLINE InstNode* _Inst_(const _Op0_& o0, const _Op1_& o1, const _Op2_& o2) { \
return emit(_Code_, o0, o1, o2); \
} \
/*! \overload */ \
ASMJIT_INLINE InstNode* _Inst_(const _Op0_& o0, int o1, int o2) { \
Imm o1Imm(o1); \
return emit(_Code_, o0, o1Imm, o2); \
} \
/*! \overload */ \
ASMJIT_INLINE InstNode* _Inst_(const _Op0_& o0, unsigned int o1, unsigned int o2) { \
Imm o1Imm(o1); \
return emit(_Code_, o0, o1Imm, static_cast<uint64_t>(o2)); \
} \
/*! \overload */ \
ASMJIT_INLINE InstNode* _Inst_(const _Op0_& o0, int64_t o1, int64_t o2) { \
Imm o1Imm(o1); \
return emit(_Code_, o0, o1Imm, static_cast<uint64_t>(o2)); \
} \
/*! \overload */ \
ASMJIT_INLINE InstNode* _Inst_(const _Op0_& o0, uint64_t o1, uint64_t o2) { \
Imm o1Imm(o1); \
return emit(_Code_, o0, o1Imm, o2); \
}
// INST_4x - Declare a generic 4-operand intrinsic forwarding to `emit()`.
// NOTE(review): the parameter list was missing `_Op3_` even though the
// generated signature uses `const _Op3_& o3`, which would expand to code
// referencing an undefined type token; added the missing macro parameter.
#define INST_4x(_Inst_, _Code_, _Op0_, _Op1_, _Op2_, _Op3_) \
  ASMJIT_INLINE InstNode* _Inst_(const _Op0_& o0, const _Op1_& o1, const _Op2_& o2, const _Op3_& o3) { \
    return emit(_Code_, o0, o1, o2, o3); \
  }
@@ -2646,6 +2695,35 @@ struct ASMJIT_VCLASS X86Compiler : public Compiler {
return emit(_Code_, o0, o1, o2, o3); \
}
// INST_4ii - Declare a 4-operand intrinsic whose two trailing operands are
// immediates; in addition to the generic form it provides integer overloads
// that wrap `o2` into an `Imm` operand and forward `o3` through the
// `int` / `uint64_t` `emit()` overloads.
#define INST_4ii(_Inst_, _Code_, _Op0_, _Op1_, _Op2_, _Op3_) \
  ASMJIT_INLINE InstNode* _Inst_(const _Op0_& o0, const _Op1_& o1, const _Op2_& o2, const _Op3_& o3) { \
    return emit(_Code_, o0, o1, o2, o3); \
  } \
  /*! \overload */ \
  ASMJIT_INLINE InstNode* _Inst_(const _Op0_& o0, const _Op1_& o1, int o2, int o3) { \
    Imm imm2(o2); \
    return emit(_Code_, o0, o1, imm2, o3); \
  } \
  /*! \overload */ \
  ASMJIT_INLINE InstNode* _Inst_(const _Op0_& o0, const _Op1_& o1, unsigned int o2, unsigned int o3) { \
    Imm imm2(o2); \
    return emit(_Code_, o0, o1, imm2, static_cast<uint64_t>(o3)); \
  } \
  /*! \overload */ \
  ASMJIT_INLINE InstNode* _Inst_(const _Op0_& o0, const _Op1_& o1, int64_t o2, int64_t o3) { \
    Imm imm2(o2); \
    return emit(_Code_, o0, o1, imm2, static_cast<uint64_t>(o3)); \
  } \
  /*! \overload */ \
  ASMJIT_INLINE InstNode* _Inst_(const _Op0_& o0, const _Op1_& o1, uint64_t o2, uint64_t o3) { \
    Imm imm2(o2); \
    return emit(_Code_, o0, o1, imm2, o3); \
  }
// --------------------------------------------------------------------------
// [X86/X64]
// --------------------------------------------------------------------------
//! Add with carry.
INST_2x(adc, kX86InstIdAdc, X86GpVar, X86GpVar)
//! \overload
@@ -2832,11 +2910,6 @@ struct ASMJIT_VCLASS X86Compiler : public Compiler {
return emit(kX86InstIdCpuid, x_eax, w_ebx, x_ecx, w_edx);
}
//! Accumulate crc32 value (polynomial 0x11EDC6F41) (SSE4.2).
INST_2x_(crc32, kX86InstIdCrc32, X86GpVar, X86GpVar, o0.isRegType(kX86RegTypeGpd) || o0.isRegType(kX86RegTypeGpq))
//! \overload
INST_2x_(crc32, kX86InstIdCrc32, X86GpVar, X86Mem, o0.isRegType(kX86RegTypeGpd) || o0.isRegType(kX86RegTypeGpq))
//! Decimal adjust AL after addition (X86 Only).
INST_1x(daa, kX86InstIdDaa, X86GpVar)
//! Decimal adjust AL after subtraction (X86 Only).
@@ -3024,11 +3097,6 @@ struct ASMJIT_VCLASS X86Compiler : public Compiler {
//! Pop stack into EFLAGS Register (32-bit or 64-bit).
INST_0x(popf, kX86InstIdPopf)
//! Return the count of number of bits set to 1 (SSE4.2).
INST_2x_(popcnt, kX86InstIdPopcnt, X86GpVar, X86GpVar, !o0.isGpb() && o0.getSize() == o1.getSize())
//! \overload
INST_2x_(popcnt, kX86InstIdPopcnt, X86GpVar, X86Mem, !o0.isGpb())
//! Push WORD or DWORD/QWORD on the stack.
INST_1x_(push, kX86InstIdPush, X86GpVar, o0.getSize() == 2 || o0.getSize() == _regSize)
//! Push WORD or DWORD/QWORD on the stack.
@@ -3299,273 +3367,277 @@ struct ASMJIT_VCLASS X86Compiler : public Compiler {
INST_2i(xor_, kX86InstIdXor, X86Mem, Imm)
// --------------------------------------------------------------------------
// [Fpu]
// [FPU]
// --------------------------------------------------------------------------
//! Compute 2^x - 1 (FPU).
//! Compute `2^x - 1` - `fp0 = POW(2, fp0) - 1` (FPU).
INST_0x(f2xm1, kX86InstIdF2xm1)
//! Absolute value of fp0 (FPU).
//! Abs `fp0 = ABS(fp0)` (FPU).
INST_0x(fabs, kX86InstIdFabs)
//! Add `o1` to `o0` (one has to be `fp0`) and store result in `o0` (FPU).
//! Add `o0 = o0 + o1` (one operand has to be `fp0`) (FPU).
INST_2x_(fadd, kX86InstIdFadd, X86FpReg, X86FpReg, o0.getRegIndex() == 0 || o1.getRegIndex() == 0)
//! Add 4-byte or 8-byte FP `o0` to fp0 and store result in fp0 (FPU).
//! Add `fp0 = fp0 + float_or_double[o0]` (FPU).
INST_1x(fadd, kX86InstIdFadd, X86Mem)
//! Add fp0 to `o0` and pop the FPU stack (FPU).
//! Add `o0 = o0 + fp0` and POP (FPU).
INST_1x(faddp, kX86InstIdFaddp, X86FpReg)
//! \overload
//! Add `fp1 = fp1 + fp0` and POP (FPU).
INST_0x(faddp, kX86InstIdFaddp)
//! Load binary coded decimal (FPU).
//! Load BCD from `[o0]` and PUSH (FPU).
INST_1x(fbld, kX86InstIdFbld, X86Mem)
//! Store BCD integer and Pop (FPU).
//! Store BCD-Integer to `[o0]` and POP (FPU).
INST_1x(fbstp, kX86InstIdFbstp, X86Mem)
//! Change fp0 sign (FPU).
//! Complement Sign `fp0 = -fp0` (FPU).
INST_0x(fchs, kX86InstIdFchs)
//! Clear exceptions (FPU).
INST_0x(fclex, kX86InstIdFclex)
//! Conditional move (FPU).
//! Conditional move `if (CF=1) fp0 = o0` (FPU).
INST_1x(fcmovb, kX86InstIdFcmovb, X86FpReg)
//! Conditional move (FPU).
//! Conditional move `if (CF|ZF=1) fp0 = o0` (FPU).
INST_1x(fcmovbe, kX86InstIdFcmovbe, X86FpReg)
//! Conditional move (FPU).
//! Conditional move `if (ZF=1) fp0 = o0` (FPU).
INST_1x(fcmove, kX86InstIdFcmove, X86FpReg)
//! Conditional move (FPU).
//! Conditional move `if (CF=0) fp0 = o0` (FPU).
INST_1x(fcmovnb, kX86InstIdFcmovnb, X86FpReg)
//! Conditional move (FPU).
//! Conditional move `if (CF|ZF=0) fp0 = o0` (FPU).
INST_1x(fcmovnbe, kX86InstIdFcmovnbe, X86FpReg)
//! Conditional move (FPU).
//! Conditional move `if (ZF=0) fp0 = o0` (FPU).
INST_1x(fcmovne, kX86InstIdFcmovne, X86FpReg)
//! Conditional move (FPU).
//! Conditional move `if (PF=0) fp0 = o0` (FPU).
INST_1x(fcmovnu, kX86InstIdFcmovnu, X86FpReg)
//! Conditional move (FPU).
//! Conditional move `if (PF=1) fp0 = o0` (FPU).
INST_1x(fcmovu, kX86InstIdFcmovu, X86FpReg)
//! Compare fp0 with `o0` (FPU).
//! Compare `fp0` with `o0` (FPU).
INST_1x(fcom, kX86InstIdFcom, X86FpReg)
//! Compare fp0 with fp1 (FPU).
//! Compare `fp0` with `fp1` (FPU).
INST_0x(fcom, kX86InstIdFcom)
//! Compare fp0 with 4-byte or 8-byte FP at `src` (FPU).
//! Compare `fp0` with `float_or_double[o0]` (FPU).
INST_1x(fcom, kX86InstIdFcom, X86Mem)
//! Compare fp0 with `o0` and pop the FPU stack (FPU).
//! Compare `fp0` with `o0` and POP (FPU).
INST_1x(fcomp, kX86InstIdFcomp, X86FpReg)
//! Compare fp0 with fp1 and pop the FPU stack (FPU).
//! Compare `fp0` with `fp1` and POP (FPU).
INST_0x(fcomp, kX86InstIdFcomp)
//! Compare fp0 with 4-byte or 8-byte FP at `adr` and pop the FPU stack (FPU).
//! Compare `fp0` with `float_or_double[o0]` and POP (FPU).
INST_1x(fcomp, kX86InstIdFcomp, X86Mem)
//! Compare fp0 with fp1 and pop the FPU stack twice (FPU).
//! Compare `fp0` with `fp1` and POP twice (FPU).
INST_0x(fcompp, kX86InstIdFcompp)
//! Compare fp0 and `o0` and Set EFLAGS (FPU).
//! Compare `fp0` with `o0` and set EFLAGS (FPU).
INST_1x(fcomi, kX86InstIdFcomi, X86FpReg)
//! Compare fp0 and `o0` and Set EFLAGS and pop the FPU stack (FPU).
//! Compare `fp0` with `o0` and set EFLAGS and POP (FPU).
INST_1x(fcomip, kX86InstIdFcomip, X86FpReg)
//! Calculate cosine of fp0 and store result in fp0 (FPU).
//! Cos `fp0 = cos(fp0)` (FPU).
INST_0x(fcos, kX86InstIdFcos)
//! Decrement FPU stack-top pointer (FPU).
//! Decrement FPU stack pointer (FPU).
INST_0x(fdecstp, kX86InstIdFdecstp)
//! Divide `o0` by `o1` (one has to be `fp0`) (FPU).
//! Divide `o0 = o0 / o1` (one has to be `fp0`) (FPU).
INST_2x_(fdiv, kX86InstIdFdiv, X86FpReg, X86FpReg, o0.getRegIndex() == 0 || o1.getRegIndex() == 0)
//! Divide fp0 by 32-bit or 64-bit FP value (FPU).
//! Divide `fp0 = fp0 / float_or_double[o0]` (FPU).
INST_1x(fdiv, kX86InstIdFdiv, X86Mem)
//! Divide `o0` by fp0 (FPU).
//! Divide `o0 = o0 / fp0` and POP (FPU).
INST_1x(fdivp, kX86InstIdFdivp, X86FpReg)
//! \overload
//! Divide `fp1 = fp1 / fp0` and POP (FPU).
INST_0x(fdivp, kX86InstIdFdivp)
//! Reverse divide `o0` by `o1` (one has to be `fp0`) (FPU).
//! Reverse divide `o0 = o1 / o0` (one has to be `fp0`) (FPU).
INST_2x_(fdivr, kX86InstIdFdivr, X86FpReg, X86FpReg, o0.getRegIndex() == 0 || o1.getRegIndex() == 0)
//! Reverse divide fp0 by 32-bit or 64-bit FP value (FPU).
//! Reverse divide `fp0 = float_or_double[o0] / fp0` (FPU).
INST_1x(fdivr, kX86InstIdFdivr, X86Mem)
//! Reverse divide `o0` by fp0 (FPU).
//! Reverse divide `o0 = fp0 / o0` and POP (FPU).
INST_1x(fdivrp, kX86InstIdFdivrp, X86FpReg)
//! \overload
//! Reverse divide `fp1 = fp0 / fp1` and POP (FPU).
INST_0x(fdivrp, kX86InstIdFdivrp)
//! Free FP register (FPU).
INST_1x(ffree, kX86InstIdFfree, X86FpReg)
//! Add 16-bit or 32-bit integer to fp0 (FPU).
//! Add `fp0 = fp0 + short_or_int[o0]` (FPU).
INST_1x_(fiadd, kX86InstIdFiadd, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Compare fp0 with 16-bit or 32-bit Integer (FPU).
//! Compare `fp0` with `short_or_int[o0]` (FPU).
INST_1x_(ficom, kX86InstIdFicom, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Compare fp0 with 16-bit or 32-bit Integer and pop the FPU stack (FPU).
//! Compare `fp0` with `short_or_int[o0]` and POP (FPU).
INST_1x_(ficomp, kX86InstIdFicomp, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Divide fp0 by 32-bit or 16-bit integer (`src`) (FPU).
//! Divide `fp0 = fp0 / short_or_int[o0]` (FPU).
INST_1x_(fidiv, kX86InstIdFidiv, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Reverse divide fp0 by 32-bit or 16-bit integer (`src`) (FPU).
//! Reverse divide `fp0 = short_or_int[o0] / fp0` (FPU).
INST_1x_(fidivr, kX86InstIdFidivr, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Load 16-bit, 32-bit or 64-bit Integer and push it to the FPU stack (FPU).
//! Load `short_or_int_or_long[o0]` and PUSH (FPU).
INST_1x_(fild, kX86InstIdFild, X86Mem, o0.getSize() == 2 || o0.getSize() == 4 || o0.getSize() == 8)
//! Multiply fp0 by 16-bit or 32-bit integer and store it to fp0 (FPU).
//! Multiply `fp0 *= short_or_int[o0]` (FPU).
INST_1x_(fimul, kX86InstIdFimul, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Increment FPU stack-top pointer (FPU).
//! Increment FPU stack pointer (FPU).
INST_0x(fincstp, kX86InstIdFincstp)
//! Initialize FPU (FPU).
INST_0x(finit, kX86InstIdFinit)
//! Subtract 16-bit or 32-bit integer from fp0 and store result to fp0 (FPU).
//! Subtract `fp0 = fp0 - short_or_int[o0]` (FPU).
INST_1x_(fisub, kX86InstIdFisub, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Reverse subtract 16-bit or 32-bit integer from fp0 and store result to fp0 (FPU).
//! Reverse subtract `fp0 = short_or_int[o0] - fp0` (FPU).
INST_1x_(fisubr, kX86InstIdFisubr, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Initialize FPU without checking for pending unmasked exceptions (FPU).
INST_0x(fninit, kX86InstIdFninit)
//! Store fp0 as 16-bit or 32-bit Integer to `o0` (FPU).
//! Store `fp0` as `short_or_int[o0]` (FPU).
INST_1x_(fist, kX86InstIdFist, X86Mem, o0.getSize() == 2 || o0.getSize() == 4)
//! Store fp0 as 16-bit, 32-bit or 64-bit Integer to `o0` and pop the FPU stack (FPU).
//! Store `fp0` as `short_or_int_or_long[o0]` and POP (FPU).
INST_1x_(fistp, kX86InstIdFistp, X86Mem, o0.getSize() == 2 || o0.getSize() == 4 || o0.getSize() == 8)
//! Push 32-bit, 64-bit or 80-bit floating point value on the FPU stack (FPU).
//! Load `float_or_double_or_extended[o0]` and PUSH (FPU).
INST_1x_(fld, kX86InstIdFld, X86Mem, o0.getSize() == 4 || o0.getSize() == 8 || o0.getSize() == 10)
//! Push `o0` on the FPU stack (FPU).
//! PUSH `o0` (FPU).
INST_1x(fld, kX86InstIdFld, X86FpReg)
//! Push +1.0 on the FPU stack (FPU).
//! PUSH `1.0` (FPU).
INST_0x(fld1, kX86InstIdFld1)
//! Push log2(10) on the FPU stack (FPU).
//! PUSH `log2(10)` (FPU).
INST_0x(fldl2t, kX86InstIdFldl2t)
//! Push log2(e) on the FPU stack (FPU).
//! PUSH `log2(e)` (FPU).
INST_0x(fldl2e, kX86InstIdFldl2e)
//! Push pi on the FPU stack (FPU).
//! PUSH `pi` (FPU).
INST_0x(fldpi, kX86InstIdFldpi)
//! Push log10(2) on the FPU stack (FPU).
//! PUSH `log10(2)` (FPU).
INST_0x(fldlg2, kX86InstIdFldlg2)
//! Push ln(2) on the FPU stack (FPU).
//! PUSH `ln(2)` (FPU).
INST_0x(fldln2, kX86InstIdFldln2)
//! Push +0.0 on the FPU stack (FPU).
//! PUSH `+0.0` (FPU).
INST_0x(fldz, kX86InstIdFldz)
//! Load x87 FPU control word (2 bytes) (FPU).
//! Load x87 FPU control word from `word_ptr[o0]` (FPU).
INST_1x(fldcw, kX86InstIdFldcw, X86Mem)
//! Load x87 FPU environment (14 or 28 bytes) (FPU).
//! Load x87 FPU environment (14 or 28 bytes) from `[o0]` (FPU).
INST_1x(fldenv, kX86InstIdFldenv, X86Mem)
//! Multiply `o0` by `o1` (one has to be `fp0`) and store result in `o0` (FPU).
//! Multiply `o0 = o0 * o1` (one has to be `fp0`) (FPU).
INST_2x_(fmul, kX86InstIdFmul, X86FpReg, X86FpReg, o0.getRegIndex() == 0 || o1.getRegIndex() == 0)
//! Multiply fp0 by 32-bit or 64-bit `o0` and store result in fp0 (FPU).
//! Multiply `fp0 = fp0 * float_or_double[o0]` (FPU).
INST_1x(fmul, kX86InstIdFmul, X86Mem)
//! Multiply fp0 by `o0` and pop the FPU stack (FPU).
//! Multiply `o0 = o0 * fp0` and POP (FPU).
INST_1x(fmulp, kX86InstIdFmulp, X86FpReg)
//! \overload
//! Multiply `fp1 = fp1 * fp0` and POP (FPU).
INST_0x(fmulp, kX86InstIdFmulp)
//! Clear exceptions (FPU).
INST_0x(fnclex, kX86InstIdFnclex)
//! No operation (FPU).
INST_0x(fnop, kX86InstIdFnop)
//! Save FPU state (FPU).
//! Save FPU state to `[o0]` (FPU).
INST_1x(fnsave, kX86InstIdFnsave, X86Mem)
//! Store x87 FPU environment (FPU).
//! Store x87 FPU environment to `[o0]` (FPU).
INST_1x(fnstenv, kX86InstIdFnstenv, X86Mem)
//! Store x87 FPU control word (FPU).
//! Store x87 FPU control word to `[o0]` (FPU).
INST_1x(fnstcw, kX86InstIdFnstcw, X86Mem)
//! Store x87 FPU status word to `o0` (AX) (FPU).
INST_1x_(fnstsw, kX86InstIdFnstsw, X86GpReg, o0.isRegCode(kX86RegTypeGpw, kX86RegIndexAx))
//! Store x87 FPU status word to `o0` (2 bytes) (FPU).
INST_1x(fnstsw, kX86InstIdFnstsw, X86GpVar)
//! Store x87 FPU status word to `word_ptr[o0]` (FPU).
INST_1x(fnstsw, kX86InstIdFnstsw, X86Mem)
//! Arctan(`fp1` / `fp0`) and pop the FPU stack (FPU).
//! Partial Arctan `fp1 = atan2(fp1, fp0)` and POP (FPU).
INST_0x(fpatan, kX86InstIdFpatan)
//! Fprem(`fp0`, `fp1`) and pop the FPU stack (FPU).
//! Partial Remainder[Trunc] `fp0 = fp0 % fp1` (FPU).
INST_0x(fprem, kX86InstIdFprem)
//! Fprem(`fp0`, `fp1`) and pop the FPU stack (FPU).
//! Partial Remainder[Round] `fp0 = fp0 % fp1` (IEEE 754) (FPU).
INST_0x(fprem1, kX86InstIdFprem1)
//! Arctan(`fp0`) and pop the FPU stack (FPU).
//! Partial Tan `fp0 = tan(fp0)` and PUSH `1.0` (FPU).
INST_0x(fptan, kX86InstIdFptan)
//! Round `fp0` to Integer (FPU).
//! Round `fp0 = round(fp0)` (FPU).
INST_0x(frndint, kX86InstIdFrndint)
//! Restore FPU state from `o0` (94 or 108 bytes) (FPU).
//! Restore FPU state from `[o0]` (94 or 108 bytes) (FPU).
INST_1x(frstor, kX86InstIdFrstor, X86Mem)
//! Save FPU state to `o0` (94 or 108 bytes) (FPU).
//! Save FPU state to `[o0]` (94 or 108 bytes) (FPU).
INST_1x(fsave, kX86InstIdFsave, X86Mem)
//! Scale `fp0` by `fp1` (FPU).
//! Scale `fp0 = fp0 * pow(2, RoundTowardsZero(fp1))` (FPU).
INST_0x(fscale, kX86InstIdFscale)
//! Sine of `fp0` and store result in `fp0` (FPU).
//! Sin `fp0 = sin(fp0)` (FPU).
INST_0x(fsin, kX86InstIdFsin)
//! Sine and cosine of `fp0`, store sine in `fp0` and push cosine on the FPU stack (FPU).
//! Sincos `fp0 = sin(fp0)` and PUSH `cos(fp0)` (FPU).
INST_0x(fsincos, kX86InstIdFsincos)
//! Square root of `fp0` and store it in `fp0` (FPU).
//! Square root `fp0 = sqrt(fp0)` (FPU).
INST_0x(fsqrt, kX86InstIdFsqrt)
//! Store floating point value to 32-bit or 64-bit memory location (FPU).
//! Store floating point value to `float_or_double[o0]` (FPU).
INST_1x_(fst, kX86InstIdFst, X86Mem, o0.getSize() == 4 || o0.getSize() == 8)
//! Store floating point value to `o0` (FPU).
//! Copy `o0 = fp0` (FPU).
INST_1x(fst, kX86InstIdFst, X86FpReg)
//! Store floating point value to 32-bit or 64-bit memory location and pop the FPU stack (FPU).
//! Store floating point value to `float_or_double_or_extended[o0]` and POP (FPU).
INST_1x_(fstp, kX86InstIdFstp, X86Mem, o0.getSize() == 4 || o0.getSize() == 8 || o0.getSize() == 10)
//! Store floating point value to `o0` and pop the FPU stack (FPU).
//! Copy `o0 = fp0` and POP (FPU).
INST_1x(fstp, kX86InstIdFstp, X86FpReg)
//! Store x87 FPU control word to `o0` (2 bytes) (FPU).
//! Store x87 FPU control word to `word_ptr[o0]` (FPU).
INST_1x(fstcw, kX86InstIdFstcw, X86Mem)
//! Store x87 FPU environment to `o0` (14 or 28 bytes) (FPU).
//! Store x87 FPU environment to `[o0]` (14 or 28 bytes) (FPU).
INST_1x(fstenv, kX86InstIdFstenv, X86Mem)
//! Store x87 FPU status word to `o0` (allocated in AX) (FPU).
//! Store x87 FPU status word to `o0` (AX) (FPU).
INST_1x(fstsw, kX86InstIdFstsw, X86GpVar)
//! Store x87 FPU status word (2 bytes) (FPU).
//! Store x87 FPU status word to `word_ptr[o0]` (FPU).
INST_1x(fstsw, kX86InstIdFstsw, X86Mem)
//! Subtract `o0` from `o0` (one has to be `fp0`) and store result in `o0` (FPU).
//! Subtract `o0 = o0 - o1` (one has to be `fp0`) (FPU).
INST_2x_(fsub, kX86InstIdFsub, X86FpReg, X86FpReg, o0.getRegIndex() == 0 || o1.getRegIndex() == 0)
//! Subtract 32-bit or 64-bit `o0` from fp0 and store result in fp0 (FPU).
//! Subtract `fp0 = fp0 - float_or_double[o0]` (FPU).
INST_1x_(fsub, kX86InstIdFsub, X86Mem, o0.getSize() == 4 || o0.getSize() == 8)
//! Subtract fp0 from `o0` and pop FPU stack (FPU).
//! Subtract `o0 = o0 - fp0` and POP (FPU).
INST_1x(fsubp, kX86InstIdFsubp, X86FpReg)
//! \overload
//! Subtract `fp1 = fp1 - fp0` and POP (FPU).
INST_0x(fsubp, kX86InstIdFsubp)
//! Reverse subtract `o1` from `o0` (one has to be `fp0`) and store result in `o0` (FPU).
//! Reverse subtract `o0 = o1 - o0` (one has to be `fp0`) (FPU).
INST_2x_(fsubr, kX86InstIdFsubr, X86FpReg, X86FpReg, o0.getRegIndex() == 0 || o1.getRegIndex() == 0)
//! Reverse subtract 32-bit or 64-bit `o0` from `fp0` and store result in `fp0` (FPU).
//! Reverse subtract `fp0 = fp0 - float_or_double[o0]` (FPU).
INST_1x_(fsubr, kX86InstIdFsubr, X86Mem, o0.getSize() == 4 || o0.getSize() == 8)
//! Reverse subtract `fp0` from `o0` and pop FPU stack (FPU).
//! Reverse subtract `o0 = o0 - fp0` and POP (FPU).
INST_1x(fsubrp, kX86InstIdFsubrp, X86FpReg)
//! \overload
//! Reverse subtract `fp1 = fp1 - fp0` and POP (FPU).
INST_0x(fsubrp, kX86InstIdFsubrp)
//! Floating point test - Compare `fp0` with 0.0. (FPU).
//! Compare `fp0` with `0.0` (FPU).
INST_0x(ftst, kX86InstIdFtst)
//! Unordered compare `fp0` with `o0` (FPU).
INST_1x(fucom, kX86InstIdFucom, X86FpReg)
//! Unordered compare `fp0` with `fp1` (FPU).
INST_0x(fucom, kX86InstIdFucom)
//! Unordered compare `fp0` and `o0`, check for ordered values and set EFLAGS (FPU).
//! Unordered compare `fp0` with `o0`, check for ordered values and set EFLAGS (FPU).
INST_1x(fucomi, kX86InstIdFucomi, X86FpReg)
//! Unordered compare `fp0` and `o0`, check for ordered values and set EFLAGS and pop the FPU stack (FPU).
//! Unordered compare `fp0` with `o0`, check for ordered values and set EFLAGS and POP (FPU).
INST_1x(fucomip, kX86InstIdFucomip, X86FpReg)
//! Unordered compare `fp0` with `o0` and pop the FPU stack (FPU).
//! Unordered compare `fp0` with `o0` and POP (FPU).
INST_1x(fucomp, kX86InstIdFucomp, X86FpReg)
//! Unordered compare `fp0` with `fp1` and pop the FPU stack (FPU).
//! Unordered compare `fp0` with `fp1` and POP (FPU).
INST_0x(fucomp, kX86InstIdFucomp)
//! Unordered compare `fp0` with `fp1` and pop the FPU stack twice (FPU).
//! Unordered compare `fp0` with `fp1` and POP twice (FPU).
INST_0x(fucompp, kX86InstIdFucompp)
INST_0x(fwait, kX86InstIdFwait)
//! Examine fp0 (FPU).
INST_0x(fxam, kX86InstIdFxam)
//! Exchange content of fp0 with `o0` (FPU).
//! Exchange `fp0` with `o0` (FPU).
INST_1x(fxch, kX86InstIdFxch, X86FpReg)
//! Restore FP/MMX/SIMD extension states to `o0` (512 bytes) (FPU, MMX, SSE).
INST_1x(fxrstor, kX86InstIdFxrstor, X86Mem)
//! Store FP/MMX/SIMD extension states to `o0` (512 bytes) (FPU, MMX, SSE).
INST_1x(fxsave, kX86InstIdFxsave, X86Mem)
//! Extract exponent and store to `fp0` and push significand on the FPU stack (FPU).
//! Extract `fp0 = exponent(fp0)` and PUSH `significand(fp0)` (FPU).
INST_0x(fxtract, kX86InstIdFxtract)
//! Compute `fp1 * log2(fp0)`, pop the FPU stack and store result in `fp0` (FPU).
//! Compute `fp1 = fp1 * log2(fp0)` and POP (FPU).
INST_0x(fyl2x, kX86InstIdFyl2x)
//! Compute `fp1 * log2(fp0 + 1)`, pop the FPU stack and store result in `fp0` (FPU).
//! Compute `fp1 = fp1 * log2(fp0 + 1)` and POP (FPU).
INST_0x(fyl2xp1, kX86InstIdFyl2xp1)
// --------------------------------------------------------------------------
@@ -3833,7 +3905,7 @@ struct ASMJIT_VCLASS X86Compiler : public Compiler {
INST_0x(emms, kX86InstIdEmms)
// --------------------------------------------------------------------------
// [3dNow]
// [3DNOW]
// --------------------------------------------------------------------------
//! Packed SP-FP to DWORD convert (3dNow!).
@@ -4921,8 +4993,7 @@ struct ASMJIT_VCLASS X86Compiler : public Compiler {
//! \overload
INST_2x(addsubps, kX86InstIdAddsubps, X86XmmVar, X86Mem)
//! Store truncated `fp0` as 16-bit, 32-bit or 64-bit integer to `o0` and pop
//! the FPU stack (FPU / SSE3).
//! Store truncated `fp0` to `short_or_int_or_long[o0]` and POP (FPU & SSE3).
INST_1x(fisttp, kX86InstIdFisttp, X86Mem)
//! Packed DP-FP horizontal add (SSE3).
@@ -5382,6 +5453,11 @@ struct ASMJIT_VCLASS X86Compiler : public Compiler {
// [SSE4.2]
// --------------------------------------------------------------------------
//! Accumulate crc32 value (polynomial 0x11EDC6F41) (SSE4.2).
INST_2x_(crc32, kX86InstIdCrc32, X86GpVar, X86GpVar, o0.isRegType(kX86RegTypeGpd) || o0.isRegType(kX86RegTypeGpq))
//! \overload
INST_2x_(crc32, kX86InstIdCrc32, X86GpVar, X86Mem, o0.isRegType(kX86RegTypeGpd) || o0.isRegType(kX86RegTypeGpq))
//! Packed compare explicit length strings, return index (SSE4.2).
INST_3i(pcmpestri, kX86InstIdPcmpestri, X86XmmVar, X86XmmVar, Imm)
//! \overload
@@ -5407,6 +5483,43 @@ struct ASMJIT_VCLASS X86Compiler : public Compiler {
//! \overload
INST_2x(pcmpgtq, kX86InstIdPcmpgtq, X86XmmVar, X86Mem)
// --------------------------------------------------------------------------
// [SSE4a]
// --------------------------------------------------------------------------
//! Extract a bit-field from `o0`; the field length and bit-index are taken
//! from the low quadword of `o1` (SSE4a).
INST_2x(extrq, kX86InstIdExtrq, X86XmmVar, X86XmmVar)
//! Extract a bit-field from `o0`; `o1` is the field length and `o2` the
//! bit-index (SSE4a).
INST_3ii(extrq, kX86InstIdExtrq, X86XmmVar, Imm, Imm)
//! Insert a bit-field from the low quadword of `o1` into `o0`; the field
//! length and bit-index are taken from the upper quadword of `o1` (SSE4a).
INST_2x(insertq, kX86InstIdInsertq, X86XmmVar, X86XmmVar)
//! Insert a bit-field from the low quadword of `o1` into `o0`; `o2` is the
//! field length and `o3` the bit-index (SSE4a).
INST_4ii(insertq, kX86InstIdInsertq, X86XmmVar, X86XmmVar, Imm, Imm)
//! Move non-temporal scalar DP-FP `double[o0] = o1` (SSE4a).
INST_2x(movntsd, kX86InstIdMovntsd, X86Mem, X86XmmVar)
//! Move non-temporal scalar SP-FP `float[o0] = o1` (SSE4a).
INST_2x(movntss, kX86InstIdMovntss, X86Mem, X86XmmVar)
// --------------------------------------------------------------------------
// [POPCNT]
// --------------------------------------------------------------------------
//! Count of bits set to 1 `o0 = POPCNT(o1)`; operands must have the same
//! size and 8-bit registers are not encodable (POPCNT).
INST_2x_(popcnt, kX86InstIdPopcnt, X86GpVar, X86GpVar, !o0.isGpb() && o0.getSize() == o1.getSize())
//! \overload
INST_2x_(popcnt, kX86InstIdPopcnt, X86GpVar, X86Mem, !o0.isGpb())
// --------------------------------------------------------------------------
// [LZCNT]
// --------------------------------------------------------------------------
//! Count the number of leading zero bits `o0 = LZCNT(o1)` (LZCNT).
INST_2x(lzcnt, kX86InstIdLzcnt, X86GpVar, X86GpVar)
//! \overload
INST_2x(lzcnt, kX86InstIdLzcnt, X86GpVar, X86Mem)
// --------------------------------------------------------------------------
// [AESNI]
// --------------------------------------------------------------------------
@@ -5450,6 +5563,34 @@ struct ASMJIT_VCLASS X86Compiler : public Compiler {
//! \overload
INST_3i(pclmulqdq, kX86InstIdPclmulqdq, X86XmmVar, X86Mem, Imm);
// --------------------------------------------------------------------------
// [XSAVE]
// --------------------------------------------------------------------------
//! Restore Processor Extended States specified by `o1:o2` (XSAVE).
INST_3x(xrstor, kX86InstIdXrstor, X86Mem, X86GpVar, X86GpVar)
//! Restore Processor Extended States specified by `o1:o2` (XSAVE&X64).
INST_3x(xrstor64, kX86InstIdXrstor64, X86Mem, X86GpVar, X86GpVar)
//! Save Processor Extended States specified by `o1:o2` (XSAVE).
INST_3x(xsave, kX86InstIdXsave, X86Mem, X86GpVar, X86GpVar)
//! Save Processor Extended States specified by `o1:o2` (XSAVE&X64).
INST_3x(xsave64, kX86InstIdXsave64, X86Mem, X86GpVar, X86GpVar)
//! Save Processor Extended States specified by `o1:o2` (Optimized) (XSAVEOPT).
INST_3x(xsaveopt, kX86InstIdXsave, X86Mem, X86GpVar, X86GpVar)
//! Save Processor Extended States specified by `o1:o2` (Optimized) (XSAVEOPT&X64).
INST_3x(xsaveopt64, kX86InstIdXsave64, X86Mem, X86GpVar, X86GpVar)
//! Get XCR - `o1:o2 <- XCR[o0]` (`EDX:EAX <- XCR[ECX]`) (XSAVE).
INST_3x(xgetbv, kX86InstIdXgetbv, X86GpVar, X86GpVar, X86GpVar)
//! Set XCR - `XCR[o0] <- o1:o2` (`XCR[ECX] <- EDX:EAX`) (XSAVE).
INST_3x(xsetbv, kX86InstIdXsetbv, X86GpVar, X86GpVar, X86GpVar)
// --------------------------------------------------------------------------
// [Cleanup]
// --------------------------------------------------------------------------
#undef INST_0x
#undef INST_1x
@@ -5465,10 +5606,12 @@ struct ASMJIT_VCLASS X86Compiler : public Compiler {
#undef INST_3x
#undef INST_3x_
#undef INST_3i
#undef INST_3ii
#undef INST_4x
#undef INST_4x_
#undef INST_4i
#undef INST_4ii
};
//! \}

View File

@@ -143,9 +143,9 @@ static void X86Context_annotateOperand(X86Context* self,
}
static bool X86Context_annotateInstruction(X86Context* self,
StringBuilder& sb, uint32_t code, const Operand* opList, uint32_t opCount) {
StringBuilder& sb, uint32_t instId, const Operand* opList, uint32_t opCount) {
sb.appendString(_x86InstInfo[code].getInstName());
sb.appendString(_x86InstInfo[instId].getInstName());
for (uint32_t i = 0; i < opCount; i++) {
if (i == 0)
sb.appendChar(' ');
@@ -207,7 +207,7 @@ static void X86Context_traceNode(X86Context* self, Node* node_) {
case kNodeTypeInst: {
InstNode* node = static_cast<InstNode*>(node_);
X86Context_annotateInstruction(self, sb,
node->getCode(), node->getOpList(), node->getOpCount());
node->getInstId(), node->getOpList(), node->getOpCount());
break;
}
@@ -287,8 +287,8 @@ void X86Context::reset() {
_stackFrameCell = NULL;
_gaRegs[kX86RegClassGp ] = IntUtil::bits(_regCount.getGp()) & ~IntUtil::mask(kX86RegIndexSp);
_gaRegs[kX86RegClassFp ] = IntUtil::bits(_regCount.getFp());
_gaRegs[kX86RegClassMm ] = IntUtil::bits(_regCount.getMm());
_gaRegs[kX86RegClassK ] = IntUtil::bits(_regCount.getK());
_gaRegs[kX86RegClassXyz] = IntUtil::bits(_regCount.getXyz());
_argBaseReg = kInvalidReg; // Used by patcher.
@@ -426,8 +426,26 @@ static const X86SpecialInst x86SpecialInstBlend[] = {
{ 0 , kInvalidReg , kVarAttrInReg }
};
static ASMJIT_INLINE const X86SpecialInst* X86SpecialInst_get(uint32_t code, const Operand* opList, uint32_t opCount) {
switch (code) {
static const X86SpecialInst x86SpecialInstXsaveXrstor[] = {
{ kInvalidReg , kInvalidReg , 0 },
{ kX86RegIndexDx, kInvalidReg , kVarAttrInReg },
{ kX86RegIndexAx, kInvalidReg , kVarAttrInReg }
};
static const X86SpecialInst x86SpecialInstXgetbv[] = {
{ kX86RegIndexCx, kInvalidReg , kVarAttrInReg },
{ kInvalidReg , kX86RegIndexDx, kVarAttrOutReg },
{ kInvalidReg , kX86RegIndexAx, kVarAttrOutReg }
};
static const X86SpecialInst x86SpecialInstXsetbv[] = {
{ kX86RegIndexCx, kInvalidReg , kVarAttrInReg },
{ kX86RegIndexDx, kInvalidReg , kVarAttrInReg },
{ kX86RegIndexAx, kInvalidReg , kVarAttrInReg }
};
static ASMJIT_INLINE const X86SpecialInst* X86SpecialInst_get(uint32_t instId, const Operand* opList, uint32_t opCount) {
switch (instId) {
case kX86InstIdCpuid:
return x86SpecialInstCpuid;
@@ -600,6 +618,20 @@ static ASMJIT_INLINE const X86SpecialInst* X86SpecialInst_get(uint32_t code, con
case kX86InstIdPblendvb:
return x86SpecialInstBlend;
case kX86InstIdXrstor:
case kX86InstIdXrstor64:
case kX86InstIdXsave:
case kX86InstIdXsave64:
case kX86InstIdXsaveopt:
case kX86InstIdXsaveopt64:
return x86SpecialInstXsaveXrstor;
case kX86InstIdXgetbv:
return x86SpecialInstXgetbv;
case kX86InstIdXsetbv:
return x86SpecialInstXsetbv;
default:
return NULL;
}
@@ -976,7 +1008,7 @@ void X86Context::emitMoveVarOnStack(
X86Reg r0, r1;
uint32_t regSize = compiler->getRegSize();
uint32_t instCode;
uint32_t instId;
switch (dstType) {
case kVarTypeInt8:
@@ -1002,7 +1034,7 @@ void X86Context::emitMoveVarOnStack(
r1.setSize(1);
r1.setCode(kX86RegTypeGpbLo, srcIndex);
instCode = (dstType == kVarTypeInt16 && srcType == kVarTypeInt8) ? kX86InstIdMovsx : kX86InstIdMovzx;
instId = (dstType == kVarTypeInt16 && srcType == kVarTypeInt8) ? kX86InstIdMovsx : kX86InstIdMovzx;
goto _ExtendMovGpD;
}
@@ -1027,7 +1059,7 @@ void X86Context::emitMoveVarOnStack(
r1.setSize(1);
r1.setCode(kX86RegTypeGpbLo, srcIndex);
instCode = (dstType == kVarTypeInt32 && srcType == kVarTypeInt8) ? kX86InstIdMovsx : kX86InstIdMovzx;
instId = (dstType == kVarTypeInt32 && srcType == kVarTypeInt8) ? kX86InstIdMovsx : kX86InstIdMovzx;
goto _ExtendMovGpD;
}
@@ -1036,7 +1068,7 @@ void X86Context::emitMoveVarOnStack(
r1.setSize(2);
r1.setCode(kX86RegTypeGpw, srcIndex);
instCode = (dstType == kVarTypeInt32 && srcType == kVarTypeInt16) ? kX86InstIdMovsx : kX86InstIdMovzx;
instId = (dstType == kVarTypeInt32 && srcType == kVarTypeInt16) ? kX86InstIdMovsx : kX86InstIdMovzx;
goto _ExtendMovGpD;
}
@@ -1060,7 +1092,7 @@ void X86Context::emitMoveVarOnStack(
r1.setSize(1);
r1.setCode(kX86RegTypeGpbLo, srcIndex);
instCode = (dstType == kVarTypeInt64 && srcType == kVarTypeInt8) ? kX86InstIdMovsx : kX86InstIdMovzx;
instId = (dstType == kVarTypeInt64 && srcType == kVarTypeInt8) ? kX86InstIdMovsx : kX86InstIdMovzx;
goto _ExtendMovGpXQ;
}
@@ -1069,7 +1101,7 @@ void X86Context::emitMoveVarOnStack(
r1.setSize(2);
r1.setCode(kX86RegTypeGpw, srcIndex);
instCode = (dstType == kVarTypeInt64 && srcType == kVarTypeInt16) ? kX86InstIdMovsx : kX86InstIdMovzx;
instId = (dstType == kVarTypeInt64 && srcType == kVarTypeInt16) ? kX86InstIdMovsx : kX86InstIdMovzx;
goto _ExtendMovGpXQ;
}
@@ -1078,7 +1110,7 @@ void X86Context::emitMoveVarOnStack(
r1.setSize(4);
r1.setCode(kX86RegTypeGpd, srcIndex);
instCode = kX86InstIdMovsxd;
instId = kX86InstIdMovsxd;
if (dstType == kVarTypeInt64 && srcType == kVarTypeInt32)
goto _ExtendMovGpXQ;
else
@@ -1104,7 +1136,7 @@ void X86Context::emitMoveVarOnStack(
r1.setSize(1);
r1.setCode(kX86RegTypeGpbLo, srcIndex);
instCode = kX86InstIdMovzx;
instId = kX86InstIdMovzx;
goto _ExtendMovGpXQ;
}
@@ -1113,7 +1145,7 @@ void X86Context::emitMoveVarOnStack(
r1.setSize(2);
r1.setCode(kX86RegTypeGpw, srcIndex);
instCode = kX86InstIdMovzx;
instId = kX86InstIdMovzx;
goto _ExtendMovGpXQ;
}
@@ -1175,7 +1207,7 @@ _ExtendMovGpD:
r0.setSize(4);
r0.setCode(kX86RegTypeGpd, srcIndex);
compiler->emit(instCode, r0, r1);
compiler->emit(instId, r0, r1);
compiler->emit(kX86InstIdMov, m0, r0);
return;
@@ -1185,7 +1217,7 @@ _ExtendMovGpXQ:
r0.setSize(8);
r0.setCode(kX86RegTypeGpq, srcIndex);
compiler->emit(instCode, r0, r1);
compiler->emit(instId, r0, r1);
compiler->emit(kX86InstIdMov, m0, r0);
}
else {
@@ -1193,7 +1225,7 @@ _ExtendMovGpXQ:
r0.setSize(4);
r0.setCode(kX86RegTypeGpd, srcIndex);
compiler->emit(instCode, r0, r1);
compiler->emit(instId, r0, r1);
_ExtendMovGpDQ:
compiler->emit(kX86InstIdMov, m0, r0);
@@ -1749,8 +1781,8 @@ static ASMJIT_INLINE Node* X86Context_getOppositeJccFlow(JumpNode* jNode) {
// ============================================================================
//! \internal
static void X86Context_prepareSingleVarInst(uint32_t code, VarAttr* va) {
switch (code) {
static void X86Context_prepareSingleVarInst(uint32_t instId, VarAttr* va) {
switch (instId) {
// - andn reg, reg ; Set all bits in reg to 0.
// - xor/pxor reg, reg ; Set all bits in reg to 0.
// - sub/psub reg, reg ; Set all bits in reg to 0.
@@ -1763,7 +1795,7 @@ static void X86Context_prepareSingleVarInst(uint32_t code, VarAttr* va) {
case kX86InstIdPsubsb : case kX86InstIdPsubsw : case kX86InstIdPsubusb : case kX86InstIdPsubusw :
case kX86InstIdPcmpeqb : case kX86InstIdPcmpeqw : case kX86InstIdPcmpeqd : case kX86InstIdPcmpeqq :
case kX86InstIdPcmpgtb : case kX86InstIdPcmpgtw : case kX86InstIdPcmpgtd : case kX86InstIdPcmpgtq :
va->delFlags(kVarAttrInReg);
va->andNotFlags(kVarAttrInReg);
break;
// - and reg, reg ; Nop.
@@ -1772,7 +1804,7 @@ static void X86Context_prepareSingleVarInst(uint32_t code, VarAttr* va) {
case kX86InstIdAnd : case kX86InstIdAndpd : case kX86InstIdAndps : case kX86InstIdPand :
case kX86InstIdOr : case kX86InstIdOrpd : case kX86InstIdOrps : case kX86InstIdPor :
case kX86InstIdXchg :
va->delFlags(kVarAttrOutReg);
va->andNotFlags(kVarAttrOutReg);
break;
}
}
@@ -1824,7 +1856,7 @@ static ASMJIT_INLINE X86RegMask X86Context_getUsedArgs(X86Context* self, X86Call
const FuncInOut& arg = decl->getArg(i);
if (!arg.hasRegIndex())
continue;
regs.add(x86VarTypeToClass(arg.getVarType()), IntUtil::mask(arg.getRegIndex()));
regs.or_(x86VarTypeToClass(arg.getVarType()), IntUtil::mask(arg.getRegIndex()));
}
return regs;
@@ -2117,7 +2149,7 @@ Error X86Context::fetch() {
goto _NoMemory; \
\
X86RegCount vaIndex; \
vaIndex.makeIndex(regCount); \
vaIndex.indexFromRegCount(regCount); \
\
map->_vaCount = vaCount; \
map->_count = regCount; \
@@ -2181,7 +2213,7 @@ Error X86Context::fetch() {
regCount.add(_Vd_->getClass()); \
} \
\
_Va_->addFlags(_Flags_); \
_Va_->orFlags(_Flags_); \
_Va_->addVarCount(1); \
} while (0)
@@ -2230,18 +2262,13 @@ _NextGroup:
VI_BEGIN();
if (node->getHint() == kVarHintAlloc) {
uint32_t remain[kX86RegClassCount];
uint32_t remain[_kX86RegClassManagedCount];
HintNode* cur = node;
remain[kX86RegClassGp ] = _regCount.getGp() - 1 - func->hasFuncFlag(kFuncFlagIsNaked);
remain[kX86RegClassFp ] = _regCount.getFp();
remain[kX86RegClassMm ] = _regCount.getMm();
// Correct. Instead of using `getXyz()` which may be 32 in 64-bit
// mode we use `getGp()`. The reason is that not all registers are
// accessible by all instructions when using AVX512, this makes the
// algorithm safe.
remain[kX86RegClassXyz] = _regCount.getGp();
remain[kX86RegClassK ] = _regCount.getK();
remain[kX86RegClassXyz] = _regCount.getXyz();
// Merge as many alloc-hints as possible.
for (;;) {
@@ -2333,14 +2360,14 @@ _NextGroup:
case kNodeTypeInst: {
InstNode* node = static_cast<InstNode*>(node_);
uint32_t code = node->getCode();
uint32_t instId = node->getInstId();
uint32_t flags = node->getFlags();
Operand* opList = node->getOpList();
uint32_t opCount = node->getOpCount();
if (opCount) {
const X86InstExtendedInfo& extendedInfo = _x86InstInfo[code].getExtendedInfo();
const X86InstExtendedInfo& extendedInfo = _x86InstInfo[instId].getExtendedInfo();
const X86SpecialInst* special = NULL;
VI_BEGIN();
@@ -2348,7 +2375,7 @@ _NextGroup:
if (extendedInfo.isFp())
flags |= kNodeFlagIsFp;
if (extendedInfo.isSpecial() && (special = X86SpecialInst_get(code, opList, opCount)) != NULL)
if (extendedInfo.isSpecial() && (special = X86SpecialInst_get(instId, opList, opCount)) != NULL)
flags |= kNodeFlagIsSpecial;
uint32_t gpAllowedMask = 0xFFFFFFFF;
@@ -2363,7 +2390,7 @@ _NextGroup:
VI_MERGE_VAR(vd, va, 0, gaRegs[vd->getClass()] & gpAllowedMask);
if (static_cast<X86Var*>(op)->isGpb()) {
va->addFlags(static_cast<X86GpVar*>(op)->isGpbLo() ? kX86VarAttrGpbLo : kX86VarAttrGpbHi);
va->orFlags(static_cast<X86GpVar*>(op)->isGpbLo() ? kX86VarAttrGpbLo : kX86VarAttrGpbHi);
if (arch == kArchX86) {
// If a byte register is accessed in 32-bit mode we have to limit
// all allocable registers for that variable to eax/ebx/ecx/edx.
@@ -2401,17 +2428,17 @@ _NextGroup:
if (inReg != kInvalidReg) {
uint32_t mask = IntUtil::mask(inReg);
inRegs.add(c, mask);
inRegs.or_(c, mask);
va->addInRegs(mask);
}
if (outReg != kInvalidReg) {
uint32_t mask = IntUtil::mask(outReg);
outRegs.add(c, mask);
outRegs.or_(c, mask);
va->setOutRegIndex(outReg);
}
va->addFlags(special[i].flags);
va->orFlags(special[i].flags);
}
else {
uint32_t inFlags = kVarAttrInReg;
@@ -2426,7 +2453,7 @@ _NextGroup:
// but there are some exceptions based on the operands' size
// and type.
if (extendedInfo.isMove()) {
uint32_t movSize = extendedInfo.getMoveSize();
uint32_t movSize = extendedInfo.getWriteSize();
uint32_t varSize = vd->getSize();
// Exception - If the source operand is a memory location
@@ -2461,7 +2488,7 @@ _NextGroup:
combinedFlags = inFlags;
}
// Imul.
else if (code == kX86InstIdImul && opCount == 3) {
else if (instId == kX86InstIdImul && opCount == 3) {
combinedFlags = outFlags;
}
}
@@ -2470,13 +2497,13 @@ _NextGroup:
combinedFlags = inFlags;
// Idiv is a special instruction, never handled here.
ASMJIT_ASSERT(code != kX86InstIdIdiv);
ASMJIT_ASSERT(instId != kX86InstIdIdiv);
// Xchg/Xadd/Imul.
if (extendedInfo.isXchg() || (code == kX86InstIdImul && opCount == 3 && i == 1))
if (extendedInfo.isXchg() || (instId == kX86InstIdImul && opCount == 3 && i == 1))
combinedFlags = inFlags | outFlags;
}
va->addFlags(combinedFlags);
va->orFlags(combinedFlags);
}
}
else if (op->isMem()) {
@@ -2488,7 +2515,7 @@ _NextGroup:
if (!vd->isStack()) {
VI_MERGE_VAR(vd, va, 0, gaRegs[vd->getClass()] & gpAllowedMask);
if (m->getMemType() == kMemTypeBaseIndex) {
va->addFlags(kVarAttrInReg);
va->orFlags(kVarAttrInReg);
}
else {
uint32_t inFlags = kVarAttrInMem;
@@ -2503,7 +2530,7 @@ _NextGroup:
// as if it's just move to the register. It's just a bit
// simpler as there are no special cases.
if (extendedInfo.isMove()) {
uint32_t movSize = IntUtil::iMax<uint32_t>(extendedInfo.getMoveSize(), m->getSize());
uint32_t movSize = IntUtil::iMax<uint32_t>(extendedInfo.getWriteSize(), m->getSize());
uint32_t varSize = vd->getSize();
if (movSize >= varSize)
@@ -2523,7 +2550,7 @@ _NextGroup:
combinedFlags = inFlags | outFlags;
}
va->addFlags(combinedFlags);
va->orFlags(combinedFlags);
}
}
}
@@ -2533,7 +2560,7 @@ _NextGroup:
vd = compiler->getVdById(m->getIndex());
VI_MERGE_VAR(vd, va, 0, gaRegs[kX86RegClassGp] & gpAllowedMask);
va->andAllocableRegs(indexMask);
va->addFlags(kVarAttrInReg);
va->orFlags(kVarAttrInReg);
}
}
}
@@ -2543,7 +2570,7 @@ _NextGroup:
// Handle instructions which result in zeros/ones or nop if used with the
// same destination and source operand.
if (vaCount == 1 && opCount >= 2 && opList[0].isVar() && opList[1].isVar() && !node->hasMemOp())
X86Context_prepareSingleVarInst(code, &vaTmpList[0]);
X86Context_prepareSingleVarInst(instId, &vaTmpList[0]);
}
VI_END(node_);
@@ -2577,7 +2604,7 @@ _NextGroup:
// backward jump. This behavior can be overridden by using
// `kInstOptionTaken` when the instruction is created.
if (!jNode->isTaken() && opCount == 1 && jTargetFlowId <= flowId) {
jNode->addFlags(kNodeFlagIsTaken);
jNode->orFlags(kNodeFlagIsTaken);
}
}
else if (jNext->isFetched()) {
@@ -2623,18 +2650,18 @@ _NextGroup:
if (arg.hasRegIndex()) {
if (x86VarTypeToClass(aType) == vd->getClass()) {
va->addFlags(kVarAttrOutReg);
va->orFlags(kVarAttrOutReg);
va->setOutRegIndex(arg.getRegIndex());
}
else {
va->addFlags(kVarAttrOutConv);
va->orFlags(kVarAttrOutConv);
}
}
else {
if ((x86VarTypeToClass(aType) == vd->getClass()) ||
(vType == kX86VarTypeXmmSs && aType == kVarTypeFp32) ||
(vType == kX86VarTypeXmmSd && aType == kVarTypeFp64)) {
va->addFlags(kVarAttrOutMem);
va->orFlags(kVarAttrOutMem);
}
else {
// TODO: [COMPILER] Not implemented.
@@ -2678,8 +2705,8 @@ _NextGroup:
// TODO: [COMPILER] Fix RetNode fetch.
VI_MERGE_VAR(vd, va, 0, 0);
va->setInRegs(i == 0 ? IntUtil::mask(kX86RegIndexAx) : IntUtil::mask(kX86RegIndexDx));
va->addFlags(kVarAttrInReg);
inRegs.add(retClass, va->getInRegs());
va->orFlags(kVarAttrInReg);
inRegs.or_(retClass, va->getInRegs());
}
}
}
@@ -2719,7 +2746,7 @@ _NextGroup:
vd = compiler->getVdById(target->getId());
VI_MERGE_VAR(vd, va, 0, 0);
va->addFlags(kVarAttrInReg | kVarAttrInCall);
va->orFlags(kVarAttrInReg | kVarAttrInCall);
if (va->getInRegs() == 0)
va->addAllocableRegs(gpAllocableMask);
}
@@ -2731,12 +2758,12 @@ _NextGroup:
if (!vd->isStack()) {
VI_MERGE_VAR(vd, va, 0, 0);
if (m->getMemType() == kMemTypeBaseIndex) {
va->addFlags(kVarAttrInReg | kVarAttrInCall);
va->orFlags(kVarAttrInReg | kVarAttrInCall);
if (va->getInRegs() == 0)
va->addAllocableRegs(gpAllocableMask);
}
else {
va->addFlags(kVarAttrInMem | kVarAttrInCall);
va->orFlags(kVarAttrInMem | kVarAttrInCall);
}
}
}
@@ -2746,7 +2773,7 @@ _NextGroup:
vd = compiler->getVdById(m->getIndex());
VI_MERGE_VAR(vd, va, 0, 0);
va->addFlags(kVarAttrInReg | kVarAttrInCall);
va->orFlags(kVarAttrInReg | kVarAttrInCall);
if ((va->getInRegs() & ~indexMask) == 0)
va->andAllocableRegs(gpAllocableMask & indexMask);
}
@@ -2769,10 +2796,10 @@ _NextGroup:
if (vd->getClass() == argClass) {
va->addInRegs(IntUtil::mask(arg.getRegIndex()));
va->addFlags(kVarAttrInReg | kVarAttrInArg);
va->orFlags(kVarAttrInReg | kVarAttrInArg);
}
else {
va->addFlags(kVarAttrInConv | kVarAttrInArg);
va->orFlags(kVarAttrInConv | kVarAttrInArg);
}
}
// If this is a stack-based argument we insert SArgNode instead of
@@ -2803,18 +2830,18 @@ _NextGroup:
if (vd->getClass() == retClass) {
va->setOutRegIndex(ret.getRegIndex());
va->addFlags(kVarAttrOutReg | kVarAttrOutRet);
va->orFlags(kVarAttrOutReg | kVarAttrOutRet);
}
else {
va->addFlags(kVarAttrOutConv | kVarAttrOutRet);
va->orFlags(kVarAttrOutConv | kVarAttrOutRet);
}
}
}
// Init clobbered.
clobberedRegs.set(kX86RegClassGp , IntUtil::bits(_regCount.getGp()) & (~decl->getPreserved(kX86RegClassGp )));
clobberedRegs.set(kX86RegClassFp , IntUtil::bits(_regCount.getFp()));
clobberedRegs.set(kX86RegClassMm , IntUtil::bits(_regCount.getMm()) & (~decl->getPreserved(kX86RegClassMm )));
clobberedRegs.set(kX86RegClassK , IntUtil::bits(_regCount.getK()) & (~decl->getPreserved(kX86RegClassK )));
clobberedRegs.set(kX86RegClassXyz, IntUtil::bits(_regCount.getXyz()) & (~decl->getPreserved(kX86RegClassXyz)));
VI_END(node_);
@@ -2860,7 +2887,7 @@ Error X86Context::annotate() {
if (node_->getComment() == NULL) {
if (node_->getType() == kNodeTypeInst) {
InstNode* node = static_cast<InstNode*>(node_);
X86Context_annotateInstruction(this, sb, node->getCode(), node->getOpList(), node->getOpCount());
X86Context_annotateInstruction(this, sb, node->getInstId(), node->getOpList(), node->getOpCount());
node_->setComment(static_cast<char*>(sa.dup(sb.getData(), sb.getLength() + 1)));
maxLen = IntUtil::iMax<uint32_t>(maxLen, static_cast<uint32_t>(sb.getLength()));
@@ -2962,7 +2989,7 @@ protected:
//! Variable map.
X86VarMap* _map;
//! VarAttr list (per register class).
VarAttr* _vaList[4];
VarAttr* _vaList[_kX86RegClassManagedCount];
//! Count of all VarAttr's.
uint32_t _vaCount;
@@ -2990,8 +3017,8 @@ ASMJIT_INLINE void X86BaseAlloc::init(Node* node, X86VarMap* map) {
{
VarAttr* va = map->getVaList();
_vaList[kX86RegClassGp ] = va;
_vaList[kX86RegClassFp ] = va + map->getVaStart(kX86RegClassFp );
_vaList[kX86RegClassMm ] = va + map->getVaStart(kX86RegClassMm );
_vaList[kX86RegClassK ] = va + map->getVaStart(kX86RegClassK );
_vaList[kX86RegClassXyz] = va + map->getVaStart(kX86RegClassXyz);
}
@@ -3223,8 +3250,8 @@ ASMJIT_INLINE Error X86VarAlloc::run(Node* node_) {
cleanup();
// Update clobbered mask.
_context->_clobberedRegs.add(_willAlloc);
_context->_clobberedRegs.add(map->_clobberedRegs);
_context->_clobberedRegs.or_(_willAlloc);
_context->_clobberedRegs.or_(map->_clobberedRegs);
// Unuse.
unuseAfter<kX86RegClassGp >();
@@ -3245,7 +3272,7 @@ ASMJIT_INLINE void X86VarAlloc::init(Node* node, X86VarMap* map) {
// add more registers when assigning registers to variables that don't need
// any specific register.
_willAlloc = map->_inRegs;
_willAlloc.add(map->_outRegs);
_willAlloc.or_(map->_outRegs);
_willSpill.reset();
}
@@ -3308,7 +3335,7 @@ ASMJIT_INLINE void X86VarAlloc::plan() {
if ((mandatoryRegs | allocableRegs) & regMask) {
va->setOutRegIndex(regIndex);
va->addFlags(kVarAttrAllocOutDone);
va->orFlags(kVarAttrAllocOutDone);
if (mandatoryRegs & regMask) {
// Case 'a' - 'willAlloc' contains initially all inRegs from all VarAttr's.
@@ -3329,7 +3356,7 @@ ASMJIT_INLINE void X86VarAlloc::plan() {
else {
if ((mandatoryRegs | allocableRegs) & regMask) {
va->setInRegIndex(regIndex);
va->addFlags(kVarAttrAllocInDone);
va->orFlags(kVarAttrAllocInDone);
if (mandatoryRegs & regMask) {
// Case 'a' - 'willAlloc' contains initially all inRegs from all VarAttr's.
@@ -3377,7 +3404,7 @@ ASMJIT_INLINE void X86VarAlloc::plan() {
}
else {
ASMJIT_TLOG("[RA-PLAN ] Done\n");
va->addFlags(kVarAttrAllocInDone);
va->orFlags(kVarAttrAllocInDone);
addVaDone(C);
continue;
}
@@ -3563,12 +3590,12 @@ ASMJIT_INLINE void X86VarAlloc::alloc() {
VarAttr* bVa = bVd->getVa();
_context->swapGp(aVd, bVd);
aVa->addFlags(kVarAttrAllocInDone);
aVa->orFlags(kVarAttrAllocInDone);
addVaDone(C);
// Doublehit, two registers allocated by a single swap.
if (bVa != NULL && bVa->getInRegIndex() == aIndex) {
bVa->addFlags(kVarAttrAllocInDone);
bVa->orFlags(kVarAttrAllocInDone);
addVaDone(C);
}
@@ -3579,7 +3606,7 @@ ASMJIT_INLINE void X86VarAlloc::alloc() {
else if (aIndex != kInvalidReg) {
_context->move<C>(aVd, bIndex);
aVa->addFlags(kVarAttrAllocInDone);
aVa->orFlags(kVarAttrAllocInDone);
addVaDone(C);
didWork = true;
@@ -3588,7 +3615,7 @@ ASMJIT_INLINE void X86VarAlloc::alloc() {
else {
_context->alloc<C>(aVd, bIndex);
aVa->addFlags(kVarAttrAllocInDone);
aVa->orFlags(kVarAttrAllocInDone);
addVaDone(C);
didWork = true;
@@ -3613,7 +3640,7 @@ ASMJIT_INLINE void X86VarAlloc::alloc() {
_context->attach<C>(vd, regIndex, false);
}
va->addFlags(kVarAttrAllocOutDone);
va->orFlags(kVarAttrAllocOutDone);
addVaDone(C);
}
}
@@ -3730,7 +3757,7 @@ ASMJIT_INLINE void X86VarAlloc::modified() {
uint32_t regMask = IntUtil::mask(regIndex);
vd->setModified(true);
_context->_x86State._modified.add(C, regMask);
_context->_x86State._modified.or_(C, regMask);
}
}
}
@@ -3972,7 +3999,7 @@ ASMJIT_INLINE void X86CallAlloc::plan() {
// is not clobbered (i.e. it will survive function call).
if ((regMask & inRegs) != 0 || ((regMask & ~clobbered) != 0 && (vaFlags & kVarAttrUnuse) == 0)) {
va->setInRegIndex(regIndex);
va->addFlags(kVarAttrAllocInDone);
va->orFlags(kVarAttrAllocInDone);
addVaDone(C);
}
else {
@@ -3985,7 +4012,7 @@ ASMJIT_INLINE void X86CallAlloc::plan() {
willFree |= regMask;
}
else {
va->addFlags(kVarAttrAllocInDone);
va->orFlags(kVarAttrAllocInDone);
addVaDone(C);
}
}
@@ -4131,12 +4158,12 @@ ASMJIT_INLINE void X86CallAlloc::alloc() {
if (C == kX86RegClassGp) {
_context->swapGp(aVd, bVd);
aVa->addFlags(kVarAttrAllocInDone);
aVa->orFlags(kVarAttrAllocInDone);
addVaDone(C);
// Doublehit, two registers allocated by a single swap.
if (bVa != NULL && bVa->getInRegIndex() == aIndex) {
bVa->addFlags(kVarAttrAllocInDone);
bVa->orFlags(kVarAttrAllocInDone);
addVaDone(C);
}
@@ -4147,7 +4174,7 @@ ASMJIT_INLINE void X86CallAlloc::alloc() {
else if (aIndex != kInvalidReg) {
_context->move<C>(aVd, bIndex);
aVa->addFlags(kVarAttrAllocInDone);
aVa->orFlags(kVarAttrAllocInDone);
addVaDone(C);
didWork = true;
@@ -4156,7 +4183,7 @@ ASMJIT_INLINE void X86CallAlloc::alloc() {
else {
_context->alloc<C>(aVd, bIndex);
aVa->addFlags(kVarAttrAllocInDone);
aVa->orFlags(kVarAttrAllocInDone);
addVaDone(C);
didWork = true;
@@ -4227,7 +4254,7 @@ ASMJIT_INLINE void X86CallAlloc::duplicate() {
for (uint32_t dupIndex = 0; inRegs != 0; dupIndex++, inRegs >>= 1) {
if (inRegs & 0x1) {
_context->emitMove(vd, dupIndex, regIndex, "Duplicate");
_context->_clobberedRegs.add(C, IntUtil::mask(dupIndex));
_context->_clobberedRegs.or_(C, IntUtil::mask(dupIndex));
}
}
}
@@ -4467,8 +4494,8 @@ static Error X86Context_initFunc(X86Context* self, X86FuncNode* func) {
// Setup "Save-Restore" registers.
func->_saveRestoreRegs.set(kX86RegClassGp , clobberedRegs.get(kX86RegClassGp ) & decl->getPreserved(kX86RegClassGp ));
func->_saveRestoreRegs.set(kX86RegClassFp , 0);
func->_saveRestoreRegs.set(kX86RegClassMm , clobberedRegs.get(kX86RegClassMm ) & decl->getPreserved(kX86RegClassMm ));
func->_saveRestoreRegs.set(kX86RegClassK , 0);
func->_saveRestoreRegs.set(kX86RegClassXyz, clobberedRegs.get(kX86RegClassXyz) & decl->getPreserved(kX86RegClassXyz));
ASMJIT_ASSERT(!func->_saveRestoreRegs.has(kX86RegClassGp, IntUtil::mask(kX86RegIndexSp)));
@@ -4540,7 +4567,7 @@ static Error X86Context_initFunc(X86Context* self, X86FuncNode* func) {
// from '_saveRestoreRegs' in case that it is preserved.
fRegMask = IntUtil::mask(fRegIndex);
if ((fRegMask & decl->getPreserved(kX86RegClassGp)) != 0) {
func->_saveRestoreRegs.del(kX86RegClassGp, fRegMask);
func->_saveRestoreRegs.andNot(kX86RegClassGp, fRegMask);
func->_isStackFrameRegPreserved = true;
}
@@ -4556,7 +4583,7 @@ static Error X86Context_initFunc(X86Context* self, X86FuncNode* func) {
else
stackFrameCopyRegs = IntUtil::keepNOnesFromRight(stackFrameCopyRegs, IntUtil::iMin<uint32_t>(maxRegs, 2));
func->_saveRestoreRegs.add(kX86RegClassGp, stackFrameCopyRegs & decl->getPreserved(kX86RegClassGp));
func->_saveRestoreRegs.or_(kX86RegClassGp, stackFrameCopyRegs & decl->getPreserved(kX86RegClassGp));
IntUtil::indexNOnesFromRight(func->_stackFrameCopyGpIndex, stackFrameCopyRegs, maxRegs);
}
}
@@ -5096,7 +5123,7 @@ _NextGroup:
}
next = node_->getNext();
node_->addFlags(kNodeFlagIsTranslated);
node_->orFlags(kNodeFlagIsTranslated);
ASMJIT_TSEC({
X86Context_traceNode(this, node_);
@@ -5143,7 +5170,7 @@ _NextGroup:
VarData* vd = va->getVd();
if (!liveness->getBit(vd->getContextId()))
va->addFlags(kVarAttrUnuse);
va->orFlags(kVarAttrUnuse);
}
}
}
@@ -5321,7 +5348,7 @@ _NextGroup:
for (;;) {
Node* next = node_->getNext();
node_->addFlags(kNodeFlagIsScheduled);
node_->orFlags(kNodeFlagIsScheduled);
// Shouldn't happen here, investigate if hit.
ASMJIT_ASSERT(node_ != stop);
@@ -5508,7 +5535,7 @@ static ASMJIT_INLINE Error X86Context_serialize(X86Context* self, X86Assembler*
case kNodeTypeInst: {
InstNode* node = static_cast<InstNode*>(node_);
uint32_t code = node->getCode();
uint32_t instId = node->getInstId();
uint32_t opCount = node->getOpCount();
const Operand* opList = node->getOpList();
@@ -5517,9 +5544,10 @@ static ASMJIT_INLINE Error X86Context_serialize(X86Context* self, X86Assembler*
const Operand* o0 = &noOperand;
const Operand* o1 = &noOperand;
const Operand* o2 = &noOperand;
const Operand* o3 = &noOperand;
if (node->isSpecial()) {
switch (code) {
switch (instId) {
case kX86InstIdCpuid:
break;
@@ -5632,6 +5660,19 @@ static ASMJIT_INLINE Error X86Context_serialize(X86Context* self, X86Assembler*
case kX86InstIdRepneScasB: case kX86InstIdRepneScasD: case kX86InstIdRepneScasQ: case kX86InstIdRepneScasW:
break;
case kX86InstIdXrstor:
case kX86InstIdXrstor64:
case kX86InstIdXsave:
case kX86InstIdXsave64:
case kX86InstIdXsaveopt:
case kX86InstIdXsaveopt64:
o0 = &opList[0];
break;
case kX86InstIdXgetbv:
case kX86InstIdXsetbv:
break;
default:
ASMJIT_ASSERT(!"Reached");
}
@@ -5640,10 +5681,11 @@ static ASMJIT_INLINE Error X86Context_serialize(X86Context* self, X86Assembler*
if (opCount > 0) o0 = &opList[0];
if (opCount > 1) o1 = &opList[1];
if (opCount > 2) o2 = &opList[2];
if (opCount > 3) o3 = &opList[3];
}
// We use this form, because it is the main one.
assembler->emit(code, *o0, *o1, *o2);
// Should call _emit() directly as 4 operand form is the main form.
assembler->emit(instId, *o0, *o1, *o2, *o3);
break;
}

View File

@@ -150,8 +150,8 @@ struct X86Context : public Context {
vd->setModified(modified);
_x86State.getListByClass(C)[regIndex] = vd;
_x86State._occupied.add(C, regMask);
_x86State._modified.add(C, static_cast<uint32_t>(modified) << regIndex);
_x86State._occupied.or_(C, regMask);
_x86State._modified.or_(C, static_cast<uint32_t>(modified) << regIndex);
ASMJIT_X86_CHECK_STATE
}
@@ -174,8 +174,8 @@ struct X86Context : public Context {
vd->setModified(false);
_x86State.getListByClass(C)[regIndex] = NULL;
_x86State._occupied.del(C, regMask);
_x86State._modified.del(C, regMask);
_x86State._occupied.andNot(C, regMask);
_x86State._modified.andNot(C, regMask);
ASMJIT_X86_CHECK_STATE
}
@@ -244,7 +244,7 @@ struct X86Context : public Context {
emitSave(vd, regIndex, "Save");
vd->setModified(false);
_x86State._modified.del(C, regMask);
_x86State._modified.andNot(C, regMask);
ASMJIT_X86_CHECK_STATE
}
@@ -381,7 +381,7 @@ struct X86Context : public Context {
uint32_t regMask = IntUtil::mask(regIndex);
vd->setModified(true);
_x86State._modified.add(C, regMask);
_x86State._modified.or_(C, regMask);
ASMJIT_X86_CHECK_STATE
}

View File

@@ -88,20 +88,29 @@ _Skip:
// in 64-bit mode not allows to use inline assembler, so we need intrinsic and
// we need also asm version.
union X86XCR {
uint64_t value;
struct {
uint32_t eax;
uint32_t edx;
};
};
// callCpuId() and detectCpuInfo() for x86 and x64 platforms begins here.
#if defined(ASMJIT_HOST_X86) || defined(ASMJIT_HOST_X64)
void X86CpuUtil::callCpuId(uint32_t inEax, uint32_t inEcx, X86CpuId* outResult) {
void X86CpuUtil::callCpuId(uint32_t inEax, uint32_t inEcx, X86CpuId* result) {
#if defined(_MSC_VER)
// 2009-02-05: Thanks to Mike Tajmajer for supporting VC7.1 compiler.
// ASMJIT_HOST_X64 is here only for readibility, only VS2005 can compile 64-bit code.
# if _MSC_VER >= 1400 || defined(ASMJIT_HOST_X64)
// Done by intrinsics.
__cpuidex(reinterpret_cast<int*>(outResult->i), inEax, inEcx);
__cpuidex(reinterpret_cast<int*>(result->i), inEax, inEcx);
# else // _MSC_VER < 1400
uint32_t cpuid_eax = inEax;
uint32_t cpuid_ecx = inCax;
uint32_t* cpuid_out = outResult->i;
uint32_t* cpuid_out = result->i;
__asm {
mov eax, cpuid_eax
@@ -119,18 +128,50 @@ void X86CpuUtil::callCpuId(uint32_t inEax, uint32_t inEcx, X86CpuId* outResult)
// Note, patched to preserve ebx/rbx register which is used by GCC.
# if defined(ASMJIT_HOST_X86)
# define __myCpuId(inEax, inEcx, outEax, outEbx, outEcx, outEdx) \
asm ("mov %%ebx, %%edi\n" \
__asm__ __volatile__( \
"mov %%ebx, %%edi\n" \
"cpuid\n" \
"xchg %%edi, %%ebx\n" \
: "=a" (outEax), "=D" (outEbx), "=c" (outEcx), "=d" (outEdx) : "a" (inEax), "c" (inEcx))
: "=a" (outEax), "=D" (outEbx), "=c" (outEcx), "=d" (outEdx) \
: "a" (inEax), "c" (inEcx))
# else
# define __myCpuId(inEax, inEcx, outEax, outEbx, outEcx, outEdx) \
asm ("mov %%rbx, %%rdi\n" \
__asm__ __volatile__( \
"mov %%rbx, %%rdi\n" \
"cpuid\n" \
"xchg %%rdi, %%rbx\n" \
: "=a" (outEax), "=D" (outEbx), "=c" (outEcx), "=d" (outEdx) : "a" (inEax), "c" (inEcx))
: "=a" (outEax), "=D" (outEbx), "=c" (outEcx), "=d" (outEdx) \
: "a" (inEax), "c" (inEcx))
# endif
__myCpuId(inEax, inEcx, outResult->eax, outResult->ebx, outResult->ecx, outResult->edx);
__myCpuId(inEax, inEcx, result->eax, result->ebx, result->ecx, result->edx);
#endif // COMPILER
}
static void callXGetBV(uint32_t inEcx, X86XCR* result) {
#if defined(_MSC_VER)
# if (_MSC_FULL_VER >= 160040219) // 2010SP1+
result->value = _xgetbv(inEcx);
# else
result->value = 0;
# endif
#elif defined(__GNUC__)
unsigned int eax, edx;
# if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
__asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(inEcx));
# else
__asm__ __volatile__(".byte 0x0F, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(inEcx));
# endif
result->eax = eax;
result->edx = edx;
#else
result->value = 0;
#endif // COMPILER
}
@@ -138,7 +179,11 @@ void X86CpuUtil::detect(X86CpuInfo* cpuInfo) {
X86CpuId regs;
uint32_t i;
uint32_t maxId;
uint32_t maxBaseId;
bool maybeMPX = false;
X86XCR xcr0;
xcr0.value = 0;
// Clear everything except the '_size' member.
::memset(reinterpret_cast<uint8_t*>(cpuInfo) + sizeof(uint32_t),
@@ -148,14 +193,13 @@ void X86CpuUtil::detect(X86CpuInfo* cpuInfo) {
cpuInfo->_hwThreadsCount = CpuInfo::detectHwThreadsCount();
// --------------------------------------------------------------------------
// [CPUID EAX=0x00000000]
// [CPUID EAX=0x0]
// --------------------------------------------------------------------------
// Get vendor string/id.
callCpuId(0, 0, &regs);
maxId = regs.eax;
callCpuId(0x0, 0x0, &regs);
maxBaseId = regs.eax;
::memcpy(cpuInfo->_vendorString, &regs.ebx, 4);
::memcpy(cpuInfo->_vendorString + 4, &regs.edx, 4);
::memcpy(cpuInfo->_vendorString + 8, &regs.ecx, 4);
@@ -168,11 +212,12 @@ void X86CpuUtil::detect(X86CpuInfo* cpuInfo) {
}
// --------------------------------------------------------------------------
// [CPUID EAX=0x00000001]
// [CPUID EAX=0x1]
// --------------------------------------------------------------------------
// Get feature flags in ecx/edx and family/model in eax.
callCpuId(1, 0, &regs);
if (maxBaseId >= 0x1) {
// Get feature flags in ECX/EDX and family/model in EAX.
callCpuId(0x1, 0x0, &regs);
// Fill family and model fields.
cpuInfo->_family = (regs.eax >> 8) & 0x0F;
@@ -190,57 +235,111 @@ void X86CpuUtil::detect(X86CpuInfo* cpuInfo) {
cpuInfo->_flushCacheLineSize = ((regs.ebx >> 8) & 0xFF) * 8;
cpuInfo->_maxLogicalProcessors = ((regs.ebx >> 16) & 0xFF);
if (regs.ecx & 0x00000001U) cpuInfo->addFeature(kX86CpuFeatureSse3);
if (regs.ecx & 0x00000002U) cpuInfo->addFeature(kX86CpuFeaturePclmulqdq);
if (regs.ecx & 0x00000008U) cpuInfo->addFeature(kX86CpuFeatureMonitorMWait);
if (regs.ecx & 0x00000200U) cpuInfo->addFeature(kX86CpuFeatureSsse3);
if (regs.ecx & 0x00002000U) cpuInfo->addFeature(kX86CpuFeatureCmpXchg16B);
if (regs.ecx & 0x00080000U) cpuInfo->addFeature(kX86CpuFeatureSse41);
if (regs.ecx & 0x00100000U) cpuInfo->addFeature(kX86CpuFeatureSse42);
if (regs.ecx & 0x00400000U) cpuInfo->addFeature(kX86CpuFeatureMovbe);
if (regs.ecx & 0x00800000U) cpuInfo->addFeature(kX86CpuFeaturePopcnt);
if (regs.ecx & 0x02000000U) cpuInfo->addFeature(kX86CpuFeatureAesni);
if (regs.ecx & 0x40000000U) cpuInfo->addFeature(kX86CpuFeatureRdrand);
if (regs.ecx & 0x00000001U) cpuInfo->addFeature(kX86CpuFeatureSSE3);
if (regs.ecx & 0x00000002U) cpuInfo->addFeature(kX86CpuFeaturePCLMULQDQ);
if (regs.ecx & 0x00000008U) cpuInfo->addFeature(kX86CpuFeatureMONITOR);
if (regs.ecx & 0x00000200U) cpuInfo->addFeature(kX86CpuFeatureSSSE3);
if (regs.ecx & 0x00002000U) cpuInfo->addFeature(kX86CpuFeatureCMPXCHG16B);
if (regs.ecx & 0x00080000U) cpuInfo->addFeature(kX86CpuFeatureSSE4_1);
if (regs.ecx & 0x00100000U) cpuInfo->addFeature(kX86CpuFeatureSSE4_2);
if (regs.ecx & 0x00400000U) cpuInfo->addFeature(kX86CpuFeatureMOVBE);
if (regs.ecx & 0x00800000U) cpuInfo->addFeature(kX86CpuFeaturePOPCNT);
if (regs.ecx & 0x02000000U) cpuInfo->addFeature(kX86CpuFeatureAESNI);
if (regs.ecx & 0x04000000U) cpuInfo->addFeature(kX86CpuFeatureXSave);
if (regs.ecx & 0x08000000U) cpuInfo->addFeature(kX86CpuFeatureXSaveOS);
if (regs.ecx & 0x40000000U) cpuInfo->addFeature(kX86CpuFeatureRDRAND);
if (regs.edx & 0x00000010U) cpuInfo->addFeature(kX86CpuFeatureRdtsc);
if (regs.edx & 0x00000100U) cpuInfo->addFeature(kX86CpuFeatureCmpXchg8B);
if (regs.edx & 0x00008000U) cpuInfo->addFeature(kX86CpuFeatureCmov);
if (regs.edx & 0x00800000U) cpuInfo->addFeature(kX86CpuFeatureMmx);
if (regs.edx & 0x01000000U) cpuInfo->addFeature(kX86CpuFeatureFxsr);
if (regs.edx & 0x02000000U) cpuInfo->addFeature(kX86CpuFeatureSse).addFeature(kX86CpuFeatureMmxExt);
if (regs.edx & 0x04000000U) cpuInfo->addFeature(kX86CpuFeatureSse).addFeature(kX86CpuFeatureSse2);
if (regs.edx & 0x10000000U) cpuInfo->addFeature(kX86CpuFeatureMultithreading);
if (regs.edx & 0x00000010U) cpuInfo->addFeature(kX86CpuFeatureRDTSC);
if (regs.edx & 0x00000100U) cpuInfo->addFeature(kX86CpuFeatureCMPXCHG8B);
if (regs.edx & 0x00008000U) cpuInfo->addFeature(kX86CpuFeatureCMOV);
if (regs.edx & 0x00080000U) cpuInfo->addFeature(kX86CpuFeatureCLFLUSH);
if (regs.edx & 0x00800000U) cpuInfo->addFeature(kX86CpuFeatureMMX);
if (regs.edx & 0x01000000U) cpuInfo->addFeature(kX86CpuFeatureFXSR);
if (regs.edx & 0x02000000U) cpuInfo->addFeature(kX86CpuFeatureSSE).addFeature(kX86CpuFeatureMMX2);
if (regs.edx & 0x04000000U) cpuInfo->addFeature(kX86CpuFeatureSSE).addFeature(kX86CpuFeatureSSE2);
if (regs.edx & 0x10000000U) cpuInfo->addFeature(kX86CpuFeatureMT);
if (cpuInfo->_vendorId == kCpuVendorAmd && (regs.edx & 0x10000000U)) {
// AMD sets Multithreading to ON if it has more cores.
if (cpuInfo->_hwThreadsCount == 1)
// AMD sets Multithreading to ON if it has two or more cores.
if (cpuInfo->_hwThreadsCount == 1 && cpuInfo->_vendorId == kCpuVendorAmd && (regs.edx & 0x10000000U)) {
cpuInfo->_hwThreadsCount = 2;
}
// Detect AVX.
if (regs.ecx & 0x10000000U) {
cpuInfo->addFeature(kX86CpuFeatureAvx);
if (regs.ecx & 0x00000800U) cpuInfo->addFeature(kX86CpuFeatureXop);
if (regs.ecx & 0x00004000U) cpuInfo->addFeature(kX86CpuFeatureFma3);
if (regs.ecx & 0x00010000U) cpuInfo->addFeature(kX86CpuFeatureFma4);
if (regs.ecx & 0x20000000U) cpuInfo->addFeature(kX86CpuFeatureF16C);
// Get the content of XCR0 if supported by CPU and enabled by OS.
if ((regs.ecx & 0x0C000000U) == 0x0C000000U) {
callXGetBV(0, &xcr0);
}
// Detect AVX+.
if (regs.ecx & 0x10000000U) {
// - XCR0[2:1] == 11b
// XMM & YMM states are enabled by OS.
if ((xcr0.eax & 0x00000006U) == 0x00000006U) {
cpuInfo->addFeature(kX86CpuFeatureAVX);
if (regs.ecx & 0x00000800U) cpuInfo->addFeature(kX86CpuFeatureXOP);
if (regs.ecx & 0x00004000U) cpuInfo->addFeature(kX86CpuFeatureFMA3);
if (regs.ecx & 0x00010000U) cpuInfo->addFeature(kX86CpuFeatureFMA4);
if (regs.ecx & 0x20000000U) cpuInfo->addFeature(kX86CpuFeatureF16C);
}
}
}
// --------------------------------------------------------------------------
// [CPUID EAX=0x7 ECX=0x0]
// --------------------------------------------------------------------------
// Detect new features if the processor supports CPUID-07.
if (maxId >= 7) {
callCpuId(7, 0, &regs);
if (maxBaseId >= 0x7) {
callCpuId(0x7, 0x0, &regs);
if (regs.ebx & 0x00000001) cpuInfo->addFeature(kX86CpuFeatureFsGsBase);
if (regs.ebx & 0x00000008) cpuInfo->addFeature(kX86CpuFeatureBmi);
if (regs.ebx & 0x00000010) cpuInfo->addFeature(kX86CpuFeatureHle);
if (regs.ebx & 0x00000100) cpuInfo->addFeature(kX86CpuFeatureBmi2);
if (regs.ebx & 0x00000200) cpuInfo->addFeature(kX86CpuFeatureRepMovsbStosbExt);
if (regs.ebx & 0x00000800) cpuInfo->addFeature(kX86CpuFeatureRtm);
if (regs.ebx & 0x00000001U) cpuInfo->addFeature(kX86CpuFeatureFSGSBase);
if (regs.ebx & 0x00000008U) cpuInfo->addFeature(kX86CpuFeatureBMI);
if (regs.ebx & 0x00000010U) cpuInfo->addFeature(kX86CpuFeatureHLE);
if (regs.ebx & 0x00000100U) cpuInfo->addFeature(kX86CpuFeatureBMI2);
if (regs.ebx & 0x00000200U) cpuInfo->addFeature(kX86CpuFeatureMOVSBSTOSBOpt);
if (regs.ebx & 0x00000800U) cpuInfo->addFeature(kX86CpuFeatureRTM);
if (regs.ebx & 0x00004000U) maybeMPX = true;
if (regs.ebx & 0x00040000U) cpuInfo->addFeature(kX86CpuFeatureRDSEED);
if (regs.ebx & 0x00080000U) cpuInfo->addFeature(kX86CpuFeatureADX);
if (regs.ebx & 0x00800000U) cpuInfo->addFeature(kX86CpuFeatureCLFLUSHOpt);
if (regs.ebx & 0x20000000U) cpuInfo->addFeature(kX86CpuFeatureSHA);
// AVX2 depends on AVX.
if (cpuInfo->hasFeature(kX86CpuFeatureAvx)) {
if (regs.ebx & 0x00000020) cpuInfo->addFeature(kX86CpuFeatureAvx2);
if (regs.ecx & 0x00000001U) cpuInfo->addFeature(kX86CpuFeaturePREFETCHWT1);
// Detect AVX2.
if (cpuInfo->hasFeature(kX86CpuFeatureAVX)) {
if (regs.ebx & 0x00000020U) cpuInfo->addFeature(kX86CpuFeatureAVX2);
}
// Detect AVX-512+.
if (regs.ebx & 0x00010000U) {
// - XCR0[2:1] == 11b
// XMM & YMM states are enabled by OS.
// - XCR0[7:5] == 111b
// Upper 256-bit of ZMM0-XMM15 and ZMM16-ZMM31 state are enabled by OS.
if ((xcr0.eax & 0x00000076U) == 0x00000076U) {
cpuInfo->addFeature(kX86CpuFeatureAVX512F);
if (regs.ebx & 0x00020000U) cpuInfo->addFeature(kX86CpuFeatureAVX512DQ);
if (regs.ebx & 0x04000000U) cpuInfo->addFeature(kX86CpuFeatureAVX512PF);
if (regs.ebx & 0x08000000U) cpuInfo->addFeature(kX86CpuFeatureAVX512ER);
if (regs.ebx & 0x10000000U) cpuInfo->addFeature(kX86CpuFeatureAVX512CD);
if (regs.ebx & 0x40000000U) cpuInfo->addFeature(kX86CpuFeatureAVX512BW);
if (regs.ebx & 0x80000000U) cpuInfo->addFeature(kX86CpuFeatureAVX512VL);
}
}
}
// --------------------------------------------------------------------------
// [CPUID EAX=0xD, ECX=0x0]
// --------------------------------------------------------------------------
if (maxBaseId >= 0xD && maybeMPX) {
callCpuId(0xD, 0x0, &regs);
// Both the CPUID result and XCR0 bits have to be set to support MPX.
if (((regs.eax & xcr0.eax) & 0x00000018U) == 0x00000018U) {
cpuInfo->addFeature(kX86CpuFeatureMPX);
}
}
@@ -250,28 +349,28 @@ void X86CpuUtil::detect(X86CpuInfo* cpuInfo) {
// Calling cpuid with 0x80000000 as the in argument gets the number of valid
// extended IDs.
callCpuId(0x80000000, 0, &regs);
callCpuId(0x80000000, 0x0, &regs);
uint32_t maxExtId = IntUtil::iMin<uint32_t>(regs.eax, 0x80000004);
uint32_t* brand = reinterpret_cast<uint32_t*>(cpuInfo->_brandString);
for (i = 0x80000001; i <= maxExtId; i++) {
callCpuId(i, 0, &regs);
callCpuId(i, 0x0, &regs);
switch (i) {
case 0x80000001:
if (regs.ecx & 0x00000001U) cpuInfo->addFeature(kX86CpuFeatureLahfSahf);
if (regs.ecx & 0x00000020U) cpuInfo->addFeature(kX86CpuFeatureLzcnt);
if (regs.ecx & 0x00000040U) cpuInfo->addFeature(kX86CpuFeatureSse4A);
if (regs.ecx & 0x00000080U) cpuInfo->addFeature(kX86CpuFeatureMsse);
if (regs.ecx & 0x00000100U) cpuInfo->addFeature(kX86CpuFeaturePrefetch);
if (regs.ecx & 0x00000020U) cpuInfo->addFeature(kX86CpuFeatureLZCNT);
if (regs.ecx & 0x00000040U) cpuInfo->addFeature(kX86CpuFeatureSSE4A);
if (regs.ecx & 0x00000080U) cpuInfo->addFeature(kX86CpuFeatureMSSE);
if (regs.ecx & 0x00000100U) cpuInfo->addFeature(kX86CpuFeaturePREFETCH);
if (regs.edx & 0x00100000U) cpuInfo->addFeature(kX86CpuFeatureExecuteDisableBit);
if (regs.edx & 0x00200000U) cpuInfo->addFeature(kX86CpuFeatureFfxsr);
if (regs.edx & 0x00400000U) cpuInfo->addFeature(kX86CpuFeatureMmxExt);
if (regs.edx & 0x08000000U) cpuInfo->addFeature(kX86CpuFeatureRdtscp);
if (regs.edx & 0x40000000U) cpuInfo->addFeature(kX86CpuFeature3dNowExt).addFeature(kX86CpuFeatureMmxExt);
if (regs.edx & 0x80000000U) cpuInfo->addFeature(kX86CpuFeature3dNow);
if (regs.edx & 0x00100000U) cpuInfo->addFeature(kX86CpuFeatureNX);
if (regs.edx & 0x00200000U) cpuInfo->addFeature(kX86CpuFeatureFXSROpt);
if (regs.edx & 0x00400000U) cpuInfo->addFeature(kX86CpuFeatureMMX2);
if (regs.edx & 0x08000000U) cpuInfo->addFeature(kX86CpuFeatureRDTSCP);
if (regs.edx & 0x40000000U) cpuInfo->addFeature(kX86CpuFeature3DNOW2).addFeature(kX86CpuFeatureMMX2);
if (regs.edx & 0x80000000U) cpuInfo->addFeature(kX86CpuFeature3DNOW);
break;
case 0x80000002:

View File

@@ -31,92 +31,123 @@ struct X86CpuInfo;
//! X86 CPU features.
ASMJIT_ENUM(kX86CpuFeature) {
//! Cpu has Not-Execute-Bit.
kX86CpuFeatureNX = 0,
//! Cpu has multithreading.
kX86CpuFeatureMultithreading = 1,
//! Cpu has execute disable bit.
kX86CpuFeatureExecuteDisableBit,
kX86CpuFeatureMT,
//! Cpu has RDTSC.
kX86CpuFeatureRdtsc,
kX86CpuFeatureRDTSC,
//! Cpu has RDTSCP.
kX86CpuFeatureRdtscp,
kX86CpuFeatureRDTSCP,
//! Cpu has CMOV.
kX86CpuFeatureCmov,
kX86CpuFeatureCMOV,
//! Cpu has CMPXCHG8B.
kX86CpuFeatureCmpXchg8B,
//! Cpu has CMPXCHG16B (x64).
kX86CpuFeatureCmpXchg16B,
kX86CpuFeatureCMPXCHG8B,
//! Cpu has CMPXCHG16B (X64).
kX86CpuFeatureCMPXCHG16B,
//! Cpu has CLFLUSH.
kX86CpuFeatureClflush,
kX86CpuFeatureCLFLUSH,
//! Cpu has CLFLUSH (Optimized).
kX86CpuFeatureCLFLUSHOpt,
//! Cpu has PREFETCH.
kX86CpuFeaturePrefetch,
kX86CpuFeaturePREFETCH,
//! Cpu has PREFETCHWT1.
kX86CpuFeaturePREFETCHWT1,
//! Cpu has LAHF/SAHF.
kX86CpuFeatureLahfSahf,
//! Cpu has FXSAVE/FXRSTOR.
kX86CpuFeatureFxsr,
//! Cpu has FXSAVE/FXRSTOR optimizations.
kX86CpuFeatureFfxsr,
kX86CpuFeatureFXSR,
//! Cpu has FXSAVE/FXRSTOR (Optimized).
kX86CpuFeatureFXSROpt,
//! Cpu has MMX.
kX86CpuFeatureMmx,
kX86CpuFeatureMMX,
//! Cpu has extended MMX.
kX86CpuFeatureMmxExt,
kX86CpuFeatureMMX2,
//! Cpu has 3dNow!
kX86CpuFeature3dNow,
kX86CpuFeature3DNOW,
//! Cpu has enhanced 3dNow!
kX86CpuFeature3dNowExt,
kX86CpuFeature3DNOW2,
//! Cpu has SSE.
kX86CpuFeatureSse,
kX86CpuFeatureSSE,
//! Cpu has SSE2.
kX86CpuFeatureSse2,
kX86CpuFeatureSSE2,
//! Cpu has SSE3.
kX86CpuFeatureSse3,
//! Cpu has Supplemental SSE3 (SSSE3).
kX86CpuFeatureSsse3,
kX86CpuFeatureSSE3,
//! Cpu has SSSE3.
kX86CpuFeatureSSSE3,
//! Cpu has SSE4.A.
kX86CpuFeatureSse4A,
kX86CpuFeatureSSE4A,
//! Cpu has SSE4.1.
kX86CpuFeatureSse41,
kX86CpuFeatureSSE4_1,
//! Cpu has SSE4.2.
kX86CpuFeatureSse42,
kX86CpuFeatureSSE4_2,
//! Cpu has Misaligned SSE (MSSE).
kX86CpuFeatureMsse,
kX86CpuFeatureMSSE,
//! Cpu has MONITOR and MWAIT.
kX86CpuFeatureMonitorMWait,
kX86CpuFeatureMONITOR,
//! Cpu has MOVBE.
kX86CpuFeatureMovbe,
kX86CpuFeatureMOVBE,
//! Cpu has POPCNT.
kX86CpuFeaturePopcnt,
kX86CpuFeaturePOPCNT,
//! Cpu has LZCNT.
kX86CpuFeatureLzcnt,
kX86CpuFeatureLZCNT,
//! Cpu has AESNI.
kX86CpuFeatureAesni,
kX86CpuFeatureAESNI,
//! Cpu has PCLMULQDQ.
kX86CpuFeaturePclmulqdq,
kX86CpuFeaturePCLMULQDQ,
//! Cpu has RDRAND.
kX86CpuFeatureRdrand,
kX86CpuFeatureRDRAND,
//! Cpu has RDSEED.
kX86CpuFeatureRDSEED,
//! Cpu has SHA-1 and SHA-256.
kX86CpuFeatureSHA,
//! Cpu has XSAVE support - XSAVE/XRSTOR, XSETBV/XGETBV, and XCR0.
kX86CpuFeatureXSave,
//! OS has enabled XSAVE, you can call XGETBV to get value of XCR0.
kX86CpuFeatureXSaveOS,
//! Cpu has AVX.
kX86CpuFeatureAvx,
kX86CpuFeatureAVX,
//! Cpu has AVX2.
kX86CpuFeatureAvx2,
kX86CpuFeatureAVX2,
//! Cpu has F16C.
kX86CpuFeatureF16C,
//! Cpu has FMA3.
kX86CpuFeatureFma3,
kX86CpuFeatureFMA3,
//! Cpu has FMA4.
kX86CpuFeatureFma4,
kX86CpuFeatureFMA4,
//! Cpu has XOP.
kX86CpuFeatureXop,
kX86CpuFeatureXOP,
//! Cpu has BMI.
kX86CpuFeatureBmi,
kX86CpuFeatureBMI,
//! Cpu has BMI2.
kX86CpuFeatureBmi2,
kX86CpuFeatureBMI2,
//! Cpu has HLE.
kX86CpuFeatureHle,
kX86CpuFeatureHLE,
//! Cpu has RTM.
kX86CpuFeatureRtm,
kX86CpuFeatureRTM,
//! Cpu has ADX.
kX86CpuFeatureADX,
//! Cpu has MPX (Memory Protection Extensions).
kX86CpuFeatureMPX,
//! Cpu has FSGSBASE.
kX86CpuFeatureFsGsBase,
//! Cpu has enhanced REP MOVSB/STOSB.
kX86CpuFeatureRepMovsbStosbExt,
kX86CpuFeatureFSGSBase,
//! Cpu has optimized REP MOVSB/STOSB.
kX86CpuFeatureMOVSBSTOSBOpt,
//! Cpu has AVX-512F (Foundation).
kX86CpuFeatureAVX512F,
//! Cpu has AVX-512CD (Conflict Detection).
kX86CpuFeatureAVX512CD,
//! Cpu has AVX-512PF (Prefetch Instructions).
kX86CpuFeatureAVX512PF,
//! Cpu has AVX-512ER (Exponential and Reciprocal Instructions).
kX86CpuFeatureAVX512ER,
//! Cpu has AVX-512DQ (DWord/QWord).
kX86CpuFeatureAVX512DQ,
//! Cpu has AVX-512BW (Byte/Word).
kX86CpuFeatureAVX512BW,
//! Cpu has AVX-512VL (Vector Length Extensions).
kX86CpuFeatureAVX512VL,
//! Count of X86/X64 Cpu features.
kX86CpuFeatureCount

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -20,165 +20,262 @@
namespace asmjit {
// Prevent static initialization.
//
// Remap all classes to POD structs so they can be statically initialized
// without calling a constructor. Compiler will store these in data section.
struct X86GpReg { Operand::VRegOp data; };
struct X86FpReg { Operand::VRegOp data; };
struct X86MmReg { Operand::VRegOp data; };
struct X86XmmReg { Operand::VRegOp data; };
struct X86YmmReg { Operand::VRegOp data; };
struct X86SegReg { Operand::VRegOp data; };
namespace x86 {
// ============================================================================
// [asmjit::x86::Registers]
// ============================================================================
#define REG(_Class_, _Name_, _Type_, _Index_, _Size_) \
const _Class_ _Name_ = {{ \
#define REG(_Type_, _Index_, _Size_) {{ \
kOperandTypeReg, _Size_, { ((_Type_) << 8) + _Index_ }, kInvalidValue, {{ kInvalidVar, 0 }} \
}}
REG(X86GpReg, noGpReg, kInvalidReg, kInvalidReg, 0);
const X86RegData x86RegData = {
// RIP.
REG(kX86RegTypeRip, 0, 0),
// NpGp.
REG(kInvalidReg, kInvalidReg, 0),
REG(X86GpReg, al, kX86RegTypeGpbLo, kX86RegIndexAx, 1);
REG(X86GpReg, cl, kX86RegTypeGpbLo, kX86RegIndexCx, 1);
REG(X86GpReg, dl, kX86RegTypeGpbLo, kX86RegIndexDx, 1);
REG(X86GpReg, bl, kX86RegTypeGpbLo, kX86RegIndexBx, 1);
REG(X86GpReg, spl, kX86RegTypeGpbLo, kX86RegIndexSp, 1);
REG(X86GpReg, bpl, kX86RegTypeGpbLo, kX86RegIndexBp, 1);
REG(X86GpReg, sil, kX86RegTypeGpbLo, kX86RegIndexSi, 1);
REG(X86GpReg, dil, kX86RegTypeGpbLo, kX86RegIndexDi, 1);
REG(X86GpReg, r8b, kX86RegTypeGpbLo, 8, 1);
REG(X86GpReg, r9b, kX86RegTypeGpbLo, 9, 1);
REG(X86GpReg, r10b, kX86RegTypeGpbLo, 10, 1);
REG(X86GpReg, r11b, kX86RegTypeGpbLo, 11, 1);
REG(X86GpReg, r12b, kX86RegTypeGpbLo, 12, 1);
REG(X86GpReg, r13b, kX86RegTypeGpbLo, 13, 1);
REG(X86GpReg, r14b, kX86RegTypeGpbLo, 14, 1);
REG(X86GpReg, r15b, kX86RegTypeGpbLo, 15, 1);
// Segments.
{
REG(kX86RegTypeSeg, 0, 2), // Default.
REG(kX86RegTypeSeg, 1, 2), // ES.
REG(kX86RegTypeSeg, 2, 2), // CS.
REG(kX86RegTypeSeg, 3, 2), // SS.
REG(kX86RegTypeSeg, 4, 2), // DS.
REG(kX86RegTypeSeg, 5, 2), // FS.
REG(kX86RegTypeSeg, 6, 2) // GS.
},
REG(X86GpReg, ah, kX86RegTypeGpbHi, kX86RegIndexAx, 1);
REG(X86GpReg, ch, kX86RegTypeGpbHi, kX86RegIndexCx, 1);
REG(X86GpReg, dh, kX86RegTypeGpbHi, kX86RegIndexDx, 1);
REG(X86GpReg, bh, kX86RegTypeGpbHi, kX86RegIndexBx, 1);
// GpbLo.
{
REG(kX86RegTypeGpbLo, 0, 1),
REG(kX86RegTypeGpbLo, 1, 1),
REG(kX86RegTypeGpbLo, 2, 1),
REG(kX86RegTypeGpbLo, 3, 1),
REG(kX86RegTypeGpbLo, 4, 1),
REG(kX86RegTypeGpbLo, 5, 1),
REG(kX86RegTypeGpbLo, 6, 1),
REG(kX86RegTypeGpbLo, 7, 1),
REG(kX86RegTypeGpbLo, 8, 1),
REG(kX86RegTypeGpbLo, 9, 1),
REG(kX86RegTypeGpbLo, 10, 1),
REG(kX86RegTypeGpbLo, 11, 1),
REG(kX86RegTypeGpbLo, 12, 1),
REG(kX86RegTypeGpbLo, 13, 1),
REG(kX86RegTypeGpbLo, 14, 1),
REG(kX86RegTypeGpbLo, 15, 1)
},
REG(X86GpReg, ax, kX86RegTypeGpw, kX86RegIndexAx, 2);
REG(X86GpReg, cx, kX86RegTypeGpw, kX86RegIndexCx, 2);
REG(X86GpReg, dx, kX86RegTypeGpw, kX86RegIndexDx, 2);
REG(X86GpReg, bx, kX86RegTypeGpw, kX86RegIndexBx, 2);
REG(X86GpReg, sp, kX86RegTypeGpw, kX86RegIndexSp, 2);
REG(X86GpReg, bp, kX86RegTypeGpw, kX86RegIndexBp, 2);
REG(X86GpReg, si, kX86RegTypeGpw, kX86RegIndexSi, 2);
REG(X86GpReg, di, kX86RegTypeGpw, kX86RegIndexDi, 2);
REG(X86GpReg, r8w, kX86RegTypeGpw, 8, 2);
REG(X86GpReg, r9w, kX86RegTypeGpw, 9, 2);
REG(X86GpReg, r10w, kX86RegTypeGpw, 10, 2);
REG(X86GpReg, r11w, kX86RegTypeGpw, 11, 2);
REG(X86GpReg, r12w, kX86RegTypeGpw, 12, 2);
REG(X86GpReg, r13w, kX86RegTypeGpw, 13, 2);
REG(X86GpReg, r14w, kX86RegTypeGpw, 14, 2);
REG(X86GpReg, r15w, kX86RegTypeGpw, 15, 2);
// GpbHi.
{
REG(kX86RegTypeGpbHi, 0, 1),
REG(kX86RegTypeGpbHi, 1, 1),
REG(kX86RegTypeGpbHi, 2, 1),
REG(kX86RegTypeGpbHi, 3, 1)
},
REG(X86GpReg, eax, kX86RegTypeGpd, kX86RegIndexAx, 4);
REG(X86GpReg, ecx, kX86RegTypeGpd, kX86RegIndexCx, 4);
REG(X86GpReg, edx, kX86RegTypeGpd, kX86RegIndexDx, 4);
REG(X86GpReg, ebx, kX86RegTypeGpd, kX86RegIndexBx, 4);
REG(X86GpReg, esp, kX86RegTypeGpd, kX86RegIndexSp, 4);
REG(X86GpReg, ebp, kX86RegTypeGpd, kX86RegIndexBp, 4);
REG(X86GpReg, esi, kX86RegTypeGpd, kX86RegIndexSi, 4);
REG(X86GpReg, edi, kX86RegTypeGpd, kX86RegIndexDi, 4);
REG(X86GpReg, r8d, kX86RegTypeGpd, 8, 4);
REG(X86GpReg, r9d, kX86RegTypeGpd, 9, 4);
REG(X86GpReg, r10d, kX86RegTypeGpd, 10, 4);
REG(X86GpReg, r11d, kX86RegTypeGpd, 11, 4);
REG(X86GpReg, r12d, kX86RegTypeGpd, 12, 4);
REG(X86GpReg, r13d, kX86RegTypeGpd, 13, 4);
REG(X86GpReg, r14d, kX86RegTypeGpd, 14, 4);
REG(X86GpReg, r15d, kX86RegTypeGpd, 15, 4);
// Gpw.
{
REG(kX86RegTypeGpw, 0, 2),
REG(kX86RegTypeGpw, 1, 2),
REG(kX86RegTypeGpw, 2, 2),
REG(kX86RegTypeGpw, 3, 2),
REG(kX86RegTypeGpw, 4, 2),
REG(kX86RegTypeGpw, 5, 2),
REG(kX86RegTypeGpw, 6, 2),
REG(kX86RegTypeGpw, 7, 2),
REG(kX86RegTypeGpw, 8, 2),
REG(kX86RegTypeGpw, 9, 2),
REG(kX86RegTypeGpw, 10, 2),
REG(kX86RegTypeGpw, 11, 2),
REG(kX86RegTypeGpw, 12, 2),
REG(kX86RegTypeGpw, 13, 2),
REG(kX86RegTypeGpw, 14, 2),
REG(kX86RegTypeGpw, 15, 2)
},
REG(X86GpReg, rax, kX86RegTypeGpq, kX86RegIndexAx, 8);
REG(X86GpReg, rcx, kX86RegTypeGpq, kX86RegIndexCx, 8);
REG(X86GpReg, rdx, kX86RegTypeGpq, kX86RegIndexDx, 8);
REG(X86GpReg, rbx, kX86RegTypeGpq, kX86RegIndexBx, 8);
REG(X86GpReg, rsp, kX86RegTypeGpq, kX86RegIndexSp, 8);
REG(X86GpReg, rbp, kX86RegTypeGpq, kX86RegIndexBp, 8);
REG(X86GpReg, rsi, kX86RegTypeGpq, kX86RegIndexSi, 8);
REG(X86GpReg, rdi, kX86RegTypeGpq, kX86RegIndexDi, 8);
REG(X86GpReg, r8, kX86RegTypeGpq, 8, 8);
REG(X86GpReg, r9, kX86RegTypeGpq, 9, 8);
REG(X86GpReg, r10, kX86RegTypeGpq, 10, 8);
REG(X86GpReg, r11, kX86RegTypeGpq, 11, 8);
REG(X86GpReg, r12, kX86RegTypeGpq, 12, 8);
REG(X86GpReg, r13, kX86RegTypeGpq, 13, 8);
REG(X86GpReg, r14, kX86RegTypeGpq, 14, 8);
REG(X86GpReg, r15, kX86RegTypeGpq, 15, 8);
// Gpd.
{
REG(kX86RegTypeGpd, 0, 4),
REG(kX86RegTypeGpd, 1, 4),
REG(kX86RegTypeGpd, 2, 4),
REG(kX86RegTypeGpd, 3, 4),
REG(kX86RegTypeGpd, 4, 4),
REG(kX86RegTypeGpd, 5, 4),
REG(kX86RegTypeGpd, 6, 4),
REG(kX86RegTypeGpd, 7, 4),
REG(kX86RegTypeGpd, 8, 4),
REG(kX86RegTypeGpd, 9, 4),
REG(kX86RegTypeGpd, 10, 4),
REG(kX86RegTypeGpd, 11, 4),
REG(kX86RegTypeGpd, 12, 4),
REG(kX86RegTypeGpd, 13, 4),
REG(kX86RegTypeGpd, 14, 4),
REG(kX86RegTypeGpd, 15, 4)
},
REG(X86FpReg, fp0, kX86RegTypeFp, 0, 10);
REG(X86FpReg, fp1, kX86RegTypeFp, 1, 10);
REG(X86FpReg, fp2, kX86RegTypeFp, 2, 10);
REG(X86FpReg, fp3, kX86RegTypeFp, 3, 10);
REG(X86FpReg, fp4, kX86RegTypeFp, 4, 10);
REG(X86FpReg, fp5, kX86RegTypeFp, 5, 10);
REG(X86FpReg, fp6, kX86RegTypeFp, 6, 10);
REG(X86FpReg, fp7, kX86RegTypeFp, 7, 10);
// Gpq.
{
REG(kX86RegTypeGpq, 0, 8),
REG(kX86RegTypeGpq, 1, 8),
REG(kX86RegTypeGpq, 2, 8),
REG(kX86RegTypeGpq, 3, 8),
REG(kX86RegTypeGpq, 4, 8),
REG(kX86RegTypeGpq, 5, 8),
REG(kX86RegTypeGpq, 6, 8),
REG(kX86RegTypeGpq, 7, 8),
REG(kX86RegTypeGpq, 8, 8),
REG(kX86RegTypeGpq, 9, 8),
REG(kX86RegTypeGpq, 10, 8),
REG(kX86RegTypeGpq, 11, 8),
REG(kX86RegTypeGpq, 12, 8),
REG(kX86RegTypeGpq, 13, 8),
REG(kX86RegTypeGpq, 14, 8),
REG(kX86RegTypeGpq, 15, 8)
},
REG(X86MmReg, mm0, kX86RegTypeMm, 0, 8);
REG(X86MmReg, mm1, kX86RegTypeMm, 1, 8);
REG(X86MmReg, mm2, kX86RegTypeMm, 2, 8);
REG(X86MmReg, mm3, kX86RegTypeMm, 3, 8);
REG(X86MmReg, mm4, kX86RegTypeMm, 4, 8);
REG(X86MmReg, mm5, kX86RegTypeMm, 5, 8);
REG(X86MmReg, mm6, kX86RegTypeMm, 6, 8);
REG(X86MmReg, mm7, kX86RegTypeMm, 7, 8);
// Fp.
{
REG(kX86RegTypeFp, 0, 10),
REG(kX86RegTypeFp, 1, 10),
REG(kX86RegTypeFp, 2, 10),
REG(kX86RegTypeFp, 3, 10),
REG(kX86RegTypeFp, 4, 10),
REG(kX86RegTypeFp, 5, 10),
REG(kX86RegTypeFp, 6, 10),
REG(kX86RegTypeFp, 7, 10)
},
REG(X86XmmReg, xmm0, kX86RegTypeXmm, 0, 16);
REG(X86XmmReg, xmm1, kX86RegTypeXmm, 1, 16);
REG(X86XmmReg, xmm2, kX86RegTypeXmm, 2, 16);
REG(X86XmmReg, xmm3, kX86RegTypeXmm, 3, 16);
REG(X86XmmReg, xmm4, kX86RegTypeXmm, 4, 16);
REG(X86XmmReg, xmm5, kX86RegTypeXmm, 5, 16);
REG(X86XmmReg, xmm6, kX86RegTypeXmm, 6, 16);
REG(X86XmmReg, xmm7, kX86RegTypeXmm, 7, 16);
REG(X86XmmReg, xmm8, kX86RegTypeXmm, 8, 16);
REG(X86XmmReg, xmm9, kX86RegTypeXmm, 9, 16);
REG(X86XmmReg, xmm10, kX86RegTypeXmm, 10, 16);
REG(X86XmmReg, xmm11, kX86RegTypeXmm, 11, 16);
REG(X86XmmReg, xmm12, kX86RegTypeXmm, 12, 16);
REG(X86XmmReg, xmm13, kX86RegTypeXmm, 13, 16);
REG(X86XmmReg, xmm14, kX86RegTypeXmm, 14, 16);
REG(X86XmmReg, xmm15, kX86RegTypeXmm, 15, 16);
// Mm.
{
REG(kX86RegTypeMm, 0, 8),
REG(kX86RegTypeMm, 1, 8),
REG(kX86RegTypeMm, 2, 8),
REG(kX86RegTypeMm, 3, 8),
REG(kX86RegTypeMm, 4, 8),
REG(kX86RegTypeMm, 5, 8),
REG(kX86RegTypeMm, 6, 8),
REG(kX86RegTypeMm, 7, 8)
},
REG(X86YmmReg, ymm0, kX86RegTypeYmm, 0, 32);
REG(X86YmmReg, ymm1, kX86RegTypeYmm, 1, 32);
REG(X86YmmReg, ymm2, kX86RegTypeYmm, 2, 32);
REG(X86YmmReg, ymm3, kX86RegTypeYmm, 3, 32);
REG(X86YmmReg, ymm4, kX86RegTypeYmm, 4, 32);
REG(X86YmmReg, ymm5, kX86RegTypeYmm, 5, 32);
REG(X86YmmReg, ymm6, kX86RegTypeYmm, 6, 32);
REG(X86YmmReg, ymm7, kX86RegTypeYmm, 7, 32);
REG(X86YmmReg, ymm8, kX86RegTypeYmm, 8, 32);
REG(X86YmmReg, ymm9, kX86RegTypeYmm, 9, 32);
REG(X86YmmReg, ymm10, kX86RegTypeYmm, 10, 32);
REG(X86YmmReg, ymm11, kX86RegTypeYmm, 11, 32);
REG(X86YmmReg, ymm12, kX86RegTypeYmm, 12, 32);
REG(X86YmmReg, ymm13, kX86RegTypeYmm, 13, 32);
REG(X86YmmReg, ymm14, kX86RegTypeYmm, 14, 32);
REG(X86YmmReg, ymm15, kX86RegTypeYmm, 15, 32);
// K.
{
REG(kX86RegTypeK, 0, 8),
REG(kX86RegTypeK, 1, 8),
REG(kX86RegTypeK, 2, 8),
REG(kX86RegTypeK, 3, 8),
REG(kX86RegTypeK, 4, 8),
REG(kX86RegTypeK, 5, 8),
REG(kX86RegTypeK, 6, 8),
REG(kX86RegTypeK, 7, 8)
},
REG(X86SegReg, cs, kX86RegTypeSeg, kX86SegCs, 2);
REG(X86SegReg, ss, kX86RegTypeSeg, kX86SegSs, 2);
REG(X86SegReg, ds, kX86RegTypeSeg, kX86SegDs, 2);
REG(X86SegReg, es, kX86RegTypeSeg, kX86SegEs, 2);
REG(X86SegReg, fs, kX86RegTypeSeg, kX86SegFs, 2);
REG(X86SegReg, gs, kX86RegTypeSeg, kX86SegGs, 2);
// Xmm.
{
REG(kX86RegTypeXmm, 0, 16),
REG(kX86RegTypeXmm, 1, 16),
REG(kX86RegTypeXmm, 2, 16),
REG(kX86RegTypeXmm, 3, 16),
REG(kX86RegTypeXmm, 4, 16),
REG(kX86RegTypeXmm, 5, 16),
REG(kX86RegTypeXmm, 6, 16),
REG(kX86RegTypeXmm, 7, 16),
REG(kX86RegTypeXmm, 8, 16),
REG(kX86RegTypeXmm, 9, 16),
REG(kX86RegTypeXmm, 10, 16),
REG(kX86RegTypeXmm, 11, 16),
REG(kX86RegTypeXmm, 12, 16),
REG(kX86RegTypeXmm, 13, 16),
REG(kX86RegTypeXmm, 14, 16),
REG(kX86RegTypeXmm, 15, 16),
REG(kX86RegTypeXmm, 16, 16),
REG(kX86RegTypeXmm, 17, 16),
REG(kX86RegTypeXmm, 18, 16),
REG(kX86RegTypeXmm, 19, 16),
REG(kX86RegTypeXmm, 20, 16),
REG(kX86RegTypeXmm, 21, 16),
REG(kX86RegTypeXmm, 22, 16),
REG(kX86RegTypeXmm, 23, 16),
REG(kX86RegTypeXmm, 24, 16),
REG(kX86RegTypeXmm, 25, 16),
REG(kX86RegTypeXmm, 26, 16),
REG(kX86RegTypeXmm, 27, 16),
REG(kX86RegTypeXmm, 28, 16),
REG(kX86RegTypeXmm, 29, 16),
REG(kX86RegTypeXmm, 30, 16),
REG(kX86RegTypeXmm, 31, 16)
},
// Ymm.
{
REG(kX86RegTypeYmm, 0, 32),
REG(kX86RegTypeYmm, 1, 32),
REG(kX86RegTypeYmm, 2, 32),
REG(kX86RegTypeYmm, 3, 32),
REG(kX86RegTypeYmm, 4, 32),
REG(kX86RegTypeYmm, 5, 32),
REG(kX86RegTypeYmm, 6, 32),
REG(kX86RegTypeYmm, 7, 32),
REG(kX86RegTypeYmm, 8, 32),
REG(kX86RegTypeYmm, 9, 32),
REG(kX86RegTypeYmm, 10, 32),
REG(kX86RegTypeYmm, 11, 32),
REG(kX86RegTypeYmm, 12, 32),
REG(kX86RegTypeYmm, 13, 32),
REG(kX86RegTypeYmm, 14, 32),
REG(kX86RegTypeYmm, 15, 32),
REG(kX86RegTypeYmm, 16, 32),
REG(kX86RegTypeYmm, 17, 32),
REG(kX86RegTypeYmm, 18, 32),
REG(kX86RegTypeYmm, 19, 32),
REG(kX86RegTypeYmm, 20, 32),
REG(kX86RegTypeYmm, 21, 32),
REG(kX86RegTypeYmm, 22, 32),
REG(kX86RegTypeYmm, 23, 32),
REG(kX86RegTypeYmm, 24, 32),
REG(kX86RegTypeYmm, 25, 32),
REG(kX86RegTypeYmm, 26, 32),
REG(kX86RegTypeYmm, 27, 32),
REG(kX86RegTypeYmm, 28, 32),
REG(kX86RegTypeYmm, 29, 32),
REG(kX86RegTypeYmm, 30, 32),
REG(kX86RegTypeYmm, 31, 32)
},
// Zmm.
{
REG(kX86RegTypeZmm, 0, 64),
REG(kX86RegTypeZmm, 1, 64),
REG(kX86RegTypeZmm, 2, 64),
REG(kX86RegTypeZmm, 3, 64),
REG(kX86RegTypeZmm, 4, 64),
REG(kX86RegTypeZmm, 5, 64),
REG(kX86RegTypeZmm, 6, 64),
REG(kX86RegTypeZmm, 7, 64),
REG(kX86RegTypeZmm, 8, 64),
REG(kX86RegTypeZmm, 9, 64),
REG(kX86RegTypeZmm, 10, 64),
REG(kX86RegTypeZmm, 11, 64),
REG(kX86RegTypeZmm, 12, 64),
REG(kX86RegTypeZmm, 13, 64),
REG(kX86RegTypeZmm, 14, 64),
REG(kX86RegTypeZmm, 15, 64),
REG(kX86RegTypeZmm, 16, 64),
REG(kX86RegTypeZmm, 17, 64),
REG(kX86RegTypeZmm, 18, 64),
REG(kX86RegTypeZmm, 19, 64),
REG(kX86RegTypeZmm, 20, 64),
REG(kX86RegTypeZmm, 21, 64),
REG(kX86RegTypeZmm, 22, 64),
REG(kX86RegTypeZmm, 23, 64),
REG(kX86RegTypeZmm, 24, 64),
REG(kX86RegTypeZmm, 25, 64),
REG(kX86RegTypeZmm, 26, 64),
REG(kX86RegTypeZmm, 27, 64),
REG(kX86RegTypeZmm, 28, 64),
REG(kX86RegTypeZmm, 29, 64),
REG(kX86RegTypeZmm, 30, 64),
REG(kX86RegTypeZmm, 31, 64)
}
};
#undef REG
} // x86 namespace
} // asmjit namespace
// [Api-End]

View File

@@ -76,7 +76,7 @@ Error X86Scheduler::run(Node* start, Node* stop) {
Node* next = node_->getNext();
ASMJIT_ASSERT(node_->getType() == kNodeTypeInst);
printf(" %s\n", X86Util::getInstInfo(static_cast<InstNode*>(node_)->getCode()).getInstName());
printf(" %s\n", X86Util::getInstInfo(static_cast<InstNode*>(node_)->getInstId()).getInstName());
node_ = next;
}

View File

@@ -12,17 +12,23 @@ var fs = require("fs");
// [Utilities]
// ----------------------------------------------------------------------------
var upFirst = function(s) {
function upFirst(s) {
if (!s)
return s;
return s[0].toUpperCase() + s.substr(1);
};
}
var trimLeft = function(s) {
function trimLeft(s) {
return s.replace(/^\s+/, "");
}
var inject = function(s, start, end, code) {
function padLeft(s, n) {
while (s.length < n)
s += " ";
return s;
}
function inject(s, start, end, code) {
var iStart = s.indexOf(start);
var iEnd = s.indexOf(end);
@@ -33,7 +39,7 @@ var inject = function(s, start, end, code) {
throw new Error("Couldn't locate end mark.");
return s.substr(0, iStart + start.length) + code + s.substr(iEnd);
};
}
// ----------------------------------------------------------------------------
// [Database]
@@ -172,66 +178,100 @@ var generate = function(fileName, arch) {
var code = "";
var disclaimer = "// Automatically generated, do not edit.\n";
var instCount = 0;
var sizeof_X86InstInfo = 8;
var sizeof_X86InstExtendedInfo = 24;
// Create database.
var db = new Database();
var re = new RegExp(
"INST\\(([A-Za-z0-9_]+)\\s*," + // [01] Inst-Code.
"\\s*\\\"([A-Za-z0-9_ ]*)\\\"\\s*," + // [02] Inst-Name.
"([^,]+)," + // [03] Inst-Group.
"([^,]+)," + // [04] Inst-Flags.
"([^,]+)," + // [05] Move-Size.
"([^,]+)," + // [06] Operand-Flags[0].
"([^,]+)," + // [07] Operand-Flags[1].
"([^,]+)," + // [08] Operand-Flags[2].
"([^,]+)," + // [09] Operand-Flags[3].
"\\s*E\\(([A-Z_]+)\\)\\s*," + // [10] EFLAGS.
"(.{17}[^,]*)," + // [11] OpCode[0].
"(.{17}[^\\)]*)\\)", // [12] OpCode[1].
"INST\\(([A-Za-z0-9_]+)\\s*," + // [01] Id.
"\\s*\\\"([A-Za-z0-9_ ]*)\\\"\\s*," + // [02] Name.
"(.{20}[^,]*)," + // [03] Opcode[0].
"(.{20}[^,]*)," + // [04] Opcode[1].
"([^,]+)," + // [05] Encoding.
"([^,]+)," + // [06] IFLAGS.
"\\s*EF\\(([A-Z_]+)\\)\\s*," + // [07] EFLAGS.
"([^,]+)," + // [08] Write-Index.
"([^,]+)," + // [09] Write-Size.
"([^,]+)," + // [10] Operand-Flags[0].
"([^,]+)," + // [11] Operand-Flags[1].
"([^,]+)," + // [12] Operand-Flags[2].
"([^,]+)," + // [13] Operand-Flags[3].
"([^\\)]+)\\)", // [14] Operand-Flags[4].
"g");
var i, k, m;
var srcForm = "";
while (m = re.exec(data)) {
// Extract instruction ID and Name.
var id = m[1];
var name = m[2];
// Extract data that goes to the secondary table (ExtendedInfo).
var instGroup = trimLeft(m[3]);
var instFlags = trimLeft(m[4]);
var moveSize = trimLeft(m[5]);
var opFlags0 = trimLeft(m[6]);
var opFlags1 = trimLeft(m[7]);
var opFlags2 = trimLeft(m[8]);
var opFlags3 = trimLeft(m[9]);
var eflags = m[10];
var opCode1 = trimLeft(m[12]);
// Extract data that goes to the secondary table (X86InstExtendedInfo).
var opcode0 = trimLeft(m[3]);
var opcode1 = trimLeft(m[4]);
var encoding = trimLeft(m[5]);
var iflags = trimLeft(m[6]);
var eflags = m[7];
var writeIndex = trimLeft(m[8]);
var writeSize = trimLeft(m[9]);
var oflags0 = trimLeft(m[10]);
var oflags1 = trimLeft(m[11]);
var oflags2 = trimLeft(m[12]);
var oflags3 = trimLeft(m[13]);
var oflags4 = trimLeft(m[14]);
// Generate EFlags-In and EFlags-Out.
var eflagsIn = decToHex(getEFlagsMask(eflags, "RX" ), 2);
var eflagsOut = decToHex(getEFlagsMask(eflags, "WXU"), 2);
var extData = "" +
instGroup + ", " +
moveSize + ", " +
var extData =
encoding + ", " +
writeIndex + ", " +
writeSize + ", " +
eflagsIn + ", " +
eflagsOut + ", " +
instFlags + ", " +
"{ " + opFlags0 + ", " + opFlags1 + ", " + opFlags2 + ", " + opFlags3 + ", U }, " +
opCode1;
"0" + ", " +
"{ " + oflags0 + ", " + oflags1 + ", " + oflags2 + ", " + oflags3 + ", " + oflags4 + " }, " +
iflags + ", " +
opcode1;
srcForm += " INST(" +
padLeft(id, 27) + ", " +
padLeft('"' + name + '"', 19) + ", " +
opcode0 + ", " +
opcode1 + ", " +
encoding + ", " +
iflags + ", " +
"EF(" + eflags + "), " +
writeIndex + ", " +
writeSize + ", " +
oflags0 + ", " +
oflags1 + ", " +
oflags2 + ", " +
oflags3 + ", " +
oflags4 + "),\n";
db.add(name, id, extData);
instCount++;
}
// fs.writeFileSync("srcform.cpp", srcForm, "utf8");
db.index();
console.log("Number of instructions: " + db.instNames.array.length);
console.log("Instruction names size: " + db.instNames.getSize());
console.log("Extended-info length : " + db.extendedData.length);
var instDataSize = instCount * sizeof_X86InstInfo + db.extendedData.length * sizeof_X86InstExtendedInfo;
console.log("Number of Instructions : " + instCount);
console.log("Number of ExtInfo Rows : " + db.extendedData.length);
console.log("Instructions' Data Size: " + instDataSize);
console.log("Instructions' Names Size: " + db.instNames.getSize());
// Generate InstName[] string.
code += disclaimer;
code += "#if !defined(ASMJIT_DISABLE_INST_NAMES)\n";
code += "#if !defined(ASMJIT_DISABLE_NAMES)\n";
code += "const char _" + arch + "InstName[] =\n";
for (var k in db.instMap) {
for (k in db.instMap) {
var inst = db.instMap[k];
code += " \"" + k + "\\0\"\n";
}
@@ -248,7 +288,7 @@ var generate = function(fileName, arch) {
code += disclaimer;
code += "static const uint16_t _" + arch + "InstAlphaIndex[26] = {\n";
for (var i = 0; i < db.instAlpha.length; i++) {
for (i = 0; i < db.instAlpha.length; i++) {
var id = db.instAlpha[i];
code += " " + (id === undefined ? "0xFFFF" : id);
if (i !== db.instAlpha.length - 1)
@@ -260,18 +300,18 @@ var generate = function(fileName, arch) {
// Generate NameIndex.
code += disclaimer;
code += "enum k" + Arch + "InstData_NameIndex {\n";
for (var k in db.instMap) {
for (k in db.instMap) {
var inst = db.instMap[k];
code += " " + inst.id + "_NameIndex = " + inst.nameIndex + ",\n";
}
code = code.substr(0, code.length - 2) + "\n};\n";
code += "#endif // !ASMJIT_DISABLE_INST_NAMES\n"
code += "#endif // !ASMJIT_DISABLE_NAMES\n"
code += "\n";
// Generate ExtendedInfo.
code += disclaimer;
code += "const " + Arch + "InstExtendedInfo _" + arch + "InstExtendedInfo[] = {\n";
for (var i = 0; i < db.extendedData.length; i++) {
for (i = 0; i < db.extendedData.length; i++) {
code += " { " + db.extendedData[i] + " }";
if (i !== db.extendedData.length - 1)
code += ",";
@@ -282,7 +322,7 @@ var generate = function(fileName, arch) {
code += disclaimer;
code += "enum k" + Arch + "InstData_ExtendedIndex {\n";
for (var k in db.instMap) {
for (k in db.instMap) {
var inst = db.instMap[k];
code += " " + inst.id + "_ExtendedIndex = " + inst.extendedIndex + ",\n";
}