[Bug] Use mremap() to allocate a dual-mapped region on NetBSD

In addition, always enable dual mapping in JitAllocator when RWX
pages cannot be allocated, because otherwise the allocator would be
unable to allocate memory for JIT code execution.

New CI runners to test FreeBSD, NetBSD, and OpenBSD are also
provided.
kobalicek
2023-02-23 00:28:27 +01:00
parent 8c31a8f34f
commit 9d33c892f7
5 changed files with 4082 additions and 3827 deletions
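
Before the diffs, a minimal illustration of the W^X restriction the commit message refers to. This is a hedged sketch in plain POSIX, not code from this commit; the exact errno and the set of affected systems vary:

#include <sys/mman.h>
#include <cstdio>

int main() {
  // On a hardened system (e.g. OpenBSD, or NetBSD with PaX MPROTECT enabled),
  // requesting pages that are simultaneously writable and executable fails,
  // which is why JitAllocator has to fall back to dual mapping there.
  void* p = mmap(nullptr, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  std::printf("RWX mmap %s\n", p == MAP_FAILED ? "failed" : "succeeded");
  if (p != MAP_FAILED)
    munmap(p, 4096);
  return 0;
}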

View File

@@ -149,3 +149,93 @@ jobs:
- name: "Test" - name: "Test"
run: python build-actions/action.py --step=test run: python build-actions/action.py --step=test
build-vm:
strategy:
fail-fast: false
matrix:
include:
- { host: "macos-12", os: "freebsd", osver: "13.1", cc: "clang", arch: "x86-64", build_type: "Release", defs: "ASMJIT_TEST=ON" }
- { host: "macos-12", os: "openbsd", osver: "7.2" , cc: "clang", arch: "x86-64", build_type: "Release", defs: "ASMJIT_TEST=ON" }
name: "${{matrix.os}}-${{matrix.osver}} (${{matrix.cc}}, ${{matrix.arch}}, ${{matrix.build_type}})"
runs-on: ${{matrix.host}}
steps:
- name: "Checkout"
uses: actions/checkout@v3
with:
path: "source"
- name: "Checkout Build Actions"
uses: actions/checkout@v3
with:
repository: build-actions/build-actions
path: "build-actions"
- name: Build & Test in VM
uses: cross-platform-actions/action@master
with:
operating_system: ${{matrix.os}}
architecture: ${{matrix.arch}}
version: ${{matrix.osver}}
shell: bash
run: |
set -e
PATH="/usr/sbin:/usr/pkg/sbin:/usr/pkg/bin:$PATH:$(pwd)/build-actions"
export PATH
sh ./build-actions/install-python3.sh
python3 build-actions/action.py \
--step=all \
--compiler=${{matrix.cc}} \
--architecture=${{matrix.arch}} \
--source-dir=source \
--config=source/.github/workflows/build-config.json \
--build-type=${{matrix.build_type}} \
--build-defs=${{matrix.defs}}
build-netbsd:
strategy:
fail-fast: false
matrix:
include:
- { title: "netbsd", host: "macos-12", os: "netbsd", cc: "clang", arch: "x86_64", build_type: "Release", defs: "ASMJIT_TEST=ON" }
name: "${{matrix.title}} (${{matrix.cc}}, ${{matrix.arch}}, ${{matrix.build_type}})"
runs-on: ${{matrix.host}}
steps:
- name: "Checkout"
uses: actions/checkout@v3
with:
path: "source"
- name: "Checkout Build Actions"
uses: actions/checkout@v3
with:
repository: build-actions/build-actions
path: "build-actions"
- name: Build & Test (VM)
uses: vmactions/netbsd-vm@v0
with:
mem: 6144
usesh: true
copyback: false
run: |
set -e
PATH="/usr/sbin:/usr/pkg/sbin:/usr/pkg/bin:$PATH:$(pwd)/build-actions"
export PATH
bash ./build-actions/install-python3.sh
python3 ./build-actions/action.py \
--step=all \
--compiler=${{matrix.cc}} \
--architecture=${{matrix.arch}} \
--source-dir=source \
--config=source/.github/workflows/build-config.json \
--build-type=${{matrix.build_type}} \
--build-defs=${{matrix.defs}}

View File

@@ -430,6 +430,15 @@ static inline JitAllocatorPrivateImpl* JitAllocatorImpl_new(const JitAllocator::
   if (ASMJIT_UNLIKELY(!p))
     return nullptr;
 
+  VirtMem::HardenedRuntimeInfo hardenedRtInfo = VirtMem::hardenedRuntimeInfo();
+  if (Support::test(hardenedRtInfo.flags, VirtMem::HardenedRuntimeFlags::kEnabled)) {
+    // If we are running within a hardened environment (mapping RWX is not allowed), we have to use dual mapping
+    // or other runtime capabilities such as the Apple-specific MAP_JIT. There is no point in not enabling these,
+    // as otherwise the allocation would fail and JitAllocator would not be able to allocate memory.
+    if (!Support::test(hardenedRtInfo.flags, VirtMem::HardenedRuntimeFlags::kMapJit))
+      options |= JitAllocatorOptions::kUseDualMapping;
+  }
+
   JitAllocatorPool* pools = reinterpret_cast<JitAllocatorPool*>((uint8_t*)p + sizeof(JitAllocatorPrivateImpl));
   JitAllocatorPrivateImpl* impl = new(p) JitAllocatorPrivateImpl(pools, poolCount);
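
To see which path a given system takes, the query used above can be called directly. A small sketch using only names that appear in this diff (hardenedRuntimeInfo(), HardenedRuntimeFlags::kEnabled/kMapJit, Support::test); treat it as illustrative rather than canonical API usage:

#include <asmjit/core.h>
#include <cstdio>

using namespace asmjit;

int main() {
  VirtMem::HardenedRuntimeInfo rt = VirtMem::hardenedRuntimeInfo();
  bool hardened = Support::test(rt.flags, VirtMem::HardenedRuntimeFlags::kEnabled);
  bool mapJit = Support::test(rt.flags, VirtMem::HardenedRuntimeFlags::kMapJit);

  // Mirrors the new logic above: hardened && !mapJit means JitAllocator
  // will force JitAllocatorOptions::kUseDualMapping on its own.
  std::printf("hardened=%d map_jit=%d\n", int(hardened), int(mapJit));
  return 0;
}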

View File

@@ -26,6 +26,10 @@ enum class JitAllocatorOptions : uint32_t {
   //! The first buffer has read and execute permissions and the second buffer has read+write permissions.
   //!
   //! See \ref VirtMem::allocDualMapping() for more details about this feature.
+  //!
+  //! \remarks Dual mapping is automatically turned on by \ref JitAllocator when a hardened runtime enforces a
+  //! `W^X` policy, so specifying this flag essentially forces the use of dual-mapped pages even when RWX pages
+  //! could be allocated and dual mapping is not necessary.
   kUseDualMapping = 0x00000001u,
 
   //! Enables the use of multiple pools with increasing granularity instead of a single pool. This flag would enable

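As the remark says, the flag can still be passed explicitly to force dual mapping even where RWX pages are available. A hedged sketch; CreateParams and the two-pointer alloc() signature match the asmjit API of this period but may differ in other versions:

#include <asmjit/core.h>

using namespace asmjit;

int main() {
  JitAllocator::CreateParams params {};
  params.options = JitAllocatorOptions::kUseDualMapping; // force dual-mapped RX/RW views

  JitAllocator allocator(&params);

  void* rx = nullptr; // read+execute view
  void* rw = nullptr; // read+write view of the same physical pages
  if (allocator.alloc(&rx, &rw, 256) != kErrorOk)
    return 1;

  // Write machine code through `rw`, execute through `rx`; W^X is preserved
  // because neither view is ever writable and executable at the same time.
  allocator.release(rx);
  return 0;
}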
View File

@@ -42,9 +42,8 @@
   #if !defined(MAP_ANONYMOUS)
     #define MAP_ANONYMOUS MAP_ANON
   #endif
-#endif
-
-#include <atomic>
+
+  #define ASMJIT_DUAL_MAPPING_ANON_FD
 
   #if defined(__APPLE__) || defined(__BIONIC__)
     #define ASMJIT_VM_SHM_DETECT 0
@@ -53,7 +52,7 @@
   #endif
 
   // Android NDK doesn't provide `shm_open()` and `shm_unlink()`.
-  #if !defined(_WIN32) && !defined(__BIONIC__)
+  #if !defined(__BIONIC__)
     #define ASMJIT_HAS_SHM_OPEN_AND_UNLINK
   #endif
@@ -61,12 +60,20 @@
     #define ASMJIT_HAS_PTHREAD_JIT_WRITE_PROTECT_NP
   #endif
+
+  #if defined(__NetBSD__) && defined(MAP_REMAPDUP) && defined(PROT_MPROTECT)
+    #undef ASMJIT_DUAL_MAPPING_ANON_FD
+    #define ASMJIT_DUAL_MAPPING_REMAPDUP
+  #endif
+#endif
+
+#include <atomic>
 
 ASMJIT_BEGIN_SUB_NAMESPACE(VirtMem)
 
 // Virtual Memory Utilities
 // ========================
 
-static const MemoryFlags dualMappingFilter[2] = {
+static const constexpr MemoryFlags dualMappingFilter[2] = {
   MemoryFlags::kAccessWrite | MemoryFlags::kMMapMaxAccessWrite,
   MemoryFlags::kAccessExecute | MemoryFlags::kMMapMaxAccessExecute
 };
@@ -217,19 +224,8 @@ Error releaseDualMapping(DualMapping* dm, size_t size) noexcept {
 #if !defined(_WIN32)
 
-static void getVMInfo(Info& vmInfo) noexcept {
-  uint32_t pageSize = uint32_t(::getpagesize());
-  vmInfo.pageSize = pageSize;
-  vmInfo.pageGranularity = Support::max<uint32_t>(pageSize, 65536);
-}
-
-#if !defined(SHM_ANON)
-static const char* getTmpDir() noexcept {
-  const char* tmpDir = getenv("TMPDIR");
-  return tmpDir ? tmpDir : "/tmp";
-}
-#endif
+// Virtual Memory [Posix] - Utilities
+// ==================================
 
 // Translates libc errors specific to VirtualMemory mapping to `asmjit::Error`.
 static Error asmjitErrorFromErrno(int e) noexcept {
@@ -254,16 +250,61 @@ static Error asmjitErrorFromErrno(int e) noexcept {
   }
 }
 
+static void getVMInfo(Info& vmInfo) noexcept {
+  uint32_t pageSize = uint32_t(::getpagesize());
+  vmInfo.pageSize = pageSize;
+  vmInfo.pageGranularity = Support::max<uint32_t>(pageSize, 65536);
+}
+
+#if defined(__APPLE__) && TARGET_OS_OSX
+static int getOSXVersion() noexcept {
+  // MAP_JIT flag required to run unsigned JIT code is only supported by kernel version 10.14+ (Mojave).
+  static std::atomic<int> globalVersion;
+
+  int ver = globalVersion.load();
+  if (!ver) {
+    struct utsname osname {};
+    uname(&osname);
+    ver = atoi(osname.release);
+    globalVersion.store(ver);
+  }
+  return ver;
+}
+#endif // __APPLE__ && TARGET_OS_OSX
+
+// Returns `mmap()` protection flags from \ref MemoryFlags.
+static int mmProtFromMemoryFlags(MemoryFlags memoryFlags) noexcept {
+  int protection = 0;
+  if (Support::test(memoryFlags, MemoryFlags::kAccessRead)) protection |= PROT_READ;
+  if (Support::test(memoryFlags, MemoryFlags::kAccessWrite)) protection |= PROT_READ | PROT_WRITE;
+  if (Support::test(memoryFlags, MemoryFlags::kAccessExecute)) protection |= PROT_READ | PROT_EXEC;
+  return protection;
+}
+
+// Virtual Memory [Posix] - Anonymous Memory
+// =========================================
+
+#if defined(ASMJIT_DUAL_MAPPING_ANON_FD)
+
 // Some operating systems don't allow /dev/shm to be executable. On Linux this happens when /dev/shm is mounted with
 // 'noexec', which is enforced by systemd. Other operating systems like MacOS also restrict executable permissions
 // regarding /dev/shm, so we use a runtime detection before attempting to allocate executable memory. Sometimes we
-// don't need the detection as we know it would always result in `ShmStrategy::kTmpDir`.
-enum class ShmStrategy : uint32_t {
+// don't need the detection as we know it would always result in `AnonymousMemStrategy::kTmpDir`.
+enum class AnonymousMemStrategy : uint32_t {
   kUnknown = 0,
   kDevShm = 1,
   kTmpDir = 2
 };
 
+#if !defined(SHM_ANON)
+static const char* getTmpDir() noexcept {
+  const char* tmpDir = getenv("TMPDIR");
+  return tmpDir ? tmpDir : "/tmp";
+}
+#endif
+
 class AnonymousMemory {
 public:
   enum FileType : uint32_t {
@@ -398,31 +439,54 @@ public:
   }
 };
 
-// Returns `mmap()` protection flags from \ref MemoryFlags.
-static int mmProtFromMemoryFlags(MemoryFlags memoryFlags) noexcept {
-  int protection = 0;
-  if (Support::test(memoryFlags, MemoryFlags::kAccessRead)) protection |= PROT_READ;
-  if (Support::test(memoryFlags, MemoryFlags::kAccessWrite)) protection |= PROT_READ | PROT_WRITE;
-  if (Support::test(memoryFlags, MemoryFlags::kAccessExecute)) protection |= PROT_READ | PROT_EXEC;
-  return protection;
-}
+#if ASMJIT_VM_SHM_DETECT
+static Error detectAnonMemStrategy(AnonymousMemStrategy* strategyOut) noexcept {
+  AnonymousMemory anonMem;
+  Info vmInfo = info();
+
+  ASMJIT_PROPAGATE(anonMem.open(false));
+  ASMJIT_PROPAGATE(anonMem.allocate(vmInfo.pageSize));
+
+  void* ptr = mmap(nullptr, vmInfo.pageSize, PROT_READ | PROT_EXEC, MAP_SHARED, anonMem.fd(), 0);
+  if (ptr == MAP_FAILED) {
+    int e = errno;
+    if (e == EINVAL) {
+      *strategyOut = AnonymousMemStrategy::kTmpDir;
+      return kErrorOk;
+    }
+    return DebugUtils::errored(asmjitErrorFromErrno(e));
+  }
+  else {
+    munmap(ptr, vmInfo.pageSize);
+    *strategyOut = AnonymousMemStrategy::kDevShm;
+    return kErrorOk;
+  }
+}
+#endif
 
-#if defined(__APPLE__) && TARGET_OS_OSX
-static int getOSXVersion() noexcept {
-  // MAP_JIT flag required to run unsigned JIT code is only supported by kernel version 10.14+ (Mojave).
-  static std::atomic<int> globalVersion;
-
-  int ver = globalVersion.load();
-  if (!ver) {
-    struct utsname osname {};
-    uname(&osname);
-    ver = atoi(osname.release);
-    globalVersion.store(ver);
-  }
-  return ver;
-}
-#endif // __APPLE__ && TARGET_OS_OSX
+static Error getAnonMemStrategy(AnonymousMemStrategy* strategyOut) noexcept {
+#if ASMJIT_VM_SHM_DETECT
+  // Initially don't assume anything. It has to be tested whether '/dev/shm' was mounted with 'noexec' flag or not.
+  static std::atomic<uint32_t> globalShmStrategy;
+
+  AnonymousMemStrategy strategy = static_cast<AnonymousMemStrategy>(globalShmStrategy.load());
+  if (strategy == AnonymousMemStrategy::kUnknown) {
+    ASMJIT_PROPAGATE(detectAnonMemStrategy(&strategy));
+    globalShmStrategy.store(static_cast<uint32_t>(strategy));
+  }
+
+  *strategyOut = strategy;
+  return kErrorOk;
+#else
+  *strategyOut = AnonymousMemStrategy::kTmpDir;
+  return kErrorOk;
+#endif
+}
+
+#endif // ASMJIT_DUAL_MAPPING_ANON_FD
+
+// Virtual Memory [Posix] - Hardened Runtime & MAP_JIT
+// ===================================================
 
 // Detects whether the current process is hardened, which means that pages that have WRITE and EXECUTABLE flags
 // cannot be normally allocated. On OSX + AArch64 such allocation requires MAP_JIT flag, other platforms don't
@@ -505,50 +569,6 @@ static inline int mmMaxProtFromMemoryFlags(MemoryFlags memoryFlags) noexcept {
 #endif
 }
 
-#if ASMJIT_VM_SHM_DETECT
-static Error detectShmStrategy(ShmStrategy* strategyOut) noexcept {
-  AnonymousMemory anonMem;
-  Info vmInfo = info();
-
-  ASMJIT_PROPAGATE(anonMem.open(false));
-  ASMJIT_PROPAGATE(anonMem.allocate(vmInfo.pageSize));
-
-  void* ptr = mmap(nullptr, vmInfo.pageSize, PROT_READ | PROT_EXEC, MAP_SHARED, anonMem.fd(), 0);
-  if (ptr == MAP_FAILED) {
-    int e = errno;
-    if (e == EINVAL) {
-      *strategyOut = ShmStrategy::kTmpDir;
-      return kErrorOk;
-    }
-    return DebugUtils::errored(asmjitErrorFromErrno(e));
-  }
-  else {
-    munmap(ptr, vmInfo.pageSize);
-    *strategyOut = ShmStrategy::kDevShm;
-    return kErrorOk;
-  }
-}
-#endif
-
-static Error getShmStrategy(ShmStrategy* strategyOut) noexcept {
-#if ASMJIT_VM_SHM_DETECT
-  // Initially don't assume anything. It has to be tested whether '/dev/shm' was mounted with 'noexec' flag or not.
-  static std::atomic<uint32_t> globalShmStrategy;
-
-  ShmStrategy strategy = static_cast<ShmStrategy>(globalShmStrategy.load());
-  if (strategy == ShmStrategy::kUnknown) {
-    ASMJIT_PROPAGATE(detectShmStrategy(&strategy));
-    globalShmStrategy.store(static_cast<uint32_t>(strategy));
-  }
-
-  *strategyOut = strategy;
-  return kErrorOk;
-#else
-  *strategyOut = ShmStrategy::kTmpDir;
-  return kErrorOk;
-#endif
-}
-
 static HardenedRuntimeFlags getHardenedRuntimeFlags() noexcept {
   HardenedRuntimeFlags flags = HardenedRuntimeFlags::kNone;
@@ -593,6 +613,53 @@ Error protect(void* p, size_t size, MemoryFlags memoryFlags) noexcept {
   return DebugUtils::errored(kErrorInvalidArgument);
 }
 
+// Virtual Memory [Posix] - Dual Mapping
+// =====================================
+
+#if defined(ASMJIT_DUAL_MAPPING_REMAPDUP)
+static void unmapDualMapping(DualMapping* dm, size_t size) noexcept {
+  if (dm->rw)
+    munmap(dm->rw, size);
+  if (dm->rx)
+    munmap(dm->rx, size);
+}
+
+static Error allocDualMappingUsingRemapdup(DualMapping* dmOut, size_t size, MemoryFlags memoryFlags) noexcept {
+  DualMapping dm {};
+
+  dm.rw = mmap(NULL, size, PROT_MPROTECT(mmProtFromMemoryFlags(memoryFlags)), MAP_ANONYMOUS, -1, 0);
+  if (dm.rw == MAP_FAILED) {
+    return DebugUtils::errored(asmjitErrorFromErrno(errno));
+  }
+
+  dm.rx = mremap(dm.rw, size, NULL, size, MAP_REMAPDUP);
+  if (dm.rx == MAP_FAILED) {
+    int e = errno;
+    unmapDualMapping(&dm, size);
+    return DebugUtils::errored(asmjitErrorFromErrno(e));
+  }
+
+  MemoryFlags rxAccessFlags = memoryFlags & ~dualMappingFilter[0];
+  MemoryFlags rwAccessFlags = memoryFlags & ~dualMappingFilter[1];
+
+  if (mprotect(dm.rw, size, mmProtFromMemoryFlags(rwAccessFlags)) != 0) {
+    int e = errno;
+    unmapDualMapping(&dm, size);
+    return DebugUtils::errored(asmjitErrorFromErrno(e));
+  }
+
+  if (mprotect(dm.rx, size, mmProtFromMemoryFlags(rxAccessFlags)) != 0) {
+    int e = errno;
+    unmapDualMapping(&dm, size);
+    return DebugUtils::errored(asmjitErrorFromErrno(e));
+  }
+
+  *dmOut = dm;
+  return kErrorOk;
+}
+#endif
 
 Error allocDualMapping(DualMapping* dm, size_t size, MemoryFlags memoryFlags) noexcept {
   dm->rx = nullptr;
   dm->rw = nullptr;
@@ -600,11 +667,14 @@ Error allocDualMapping(DualMapping* dm, size_t size, MemoryFlags memoryFlags) no
if (off_t(size) <= 0) if (off_t(size) <= 0)
return DebugUtils::errored(size == 0 ? kErrorInvalidArgument : kErrorTooLarge); return DebugUtils::errored(size == 0 ? kErrorInvalidArgument : kErrorTooLarge);
#if defined(ASMJIT_DUAL_MAPPING_REMAPDUP)
return allocDualMappingUsingRemapdup(dm, size, memoryFlags);
#elif defined(ASMJIT_DUAL_MAPPING_ANON_FD)
bool preferTmpOverDevShm = Support::test(memoryFlags, MemoryFlags::kMappingPreferTmp); bool preferTmpOverDevShm = Support::test(memoryFlags, MemoryFlags::kMappingPreferTmp);
if (!preferTmpOverDevShm) { if (!preferTmpOverDevShm) {
ShmStrategy strategy; AnonymousMemStrategy strategy;
ASMJIT_PROPAGATE(getShmStrategy(&strategy)); ASMJIT_PROPAGATE(getAnonMemStrategy(&strategy));
preferTmpOverDevShm = (strategy == ShmStrategy::kTmpDir); preferTmpOverDevShm = (strategy == AnonymousMemStrategy::kTmpDir);
} }
AnonymousMemory anonMem; AnonymousMemory anonMem;
@@ -629,6 +699,9 @@ Error allocDualMapping(DualMapping* dm, size_t size, MemoryFlags memoryFlags) no
dm->rx = ptr[0]; dm->rx = ptr[0];
dm->rw = ptr[1]; dm->rw = ptr[1];
return kErrorOk; return kErrorOk;
#else
#error "[asmjit] VirtMem::allocDualMapping() has no implementation"
#endif
} }
Error releaseDualMapping(DualMapping* dm, size_t size) noexcept { Error releaseDualMapping(DualMapping* dm, size_t size) noexcept {
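
On platforms without MAP_REMAPDUP, the retained ASMJIT_DUAL_MAPPING_ANON_FD path maps one anonymous file twice. A simplified sketch of that idea using shm_open(); asmjit additionally handles SHM_ANON, temporary-file fallback, and the /dev/shm 'noexec' detection shown above:

#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>

// Sketch: dual-map an anonymous shared-memory file (the anon-fd strategy).
bool makeDualMappingFd(size_t size, void** rwOut, void** rxOut) {
  char name[64];
  std::snprintf(name, sizeof(name), "/dualmap-example-%d", int(getpid()));

  int fd = shm_open(name, O_RDWR | O_CREAT | O_EXCL, 0600);
  if (fd < 0)
    return false;
  shm_unlink(name); // keep the descriptor, drop the name immediately

  if (ftruncate(fd, off_t(size)) != 0) {
    close(fd);
    return false;
  }

  // Two views of the same file: one writable, one executable.
  void* rw = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  void* rx = mmap(nullptr, size, PROT_READ | PROT_EXEC, MAP_SHARED, fd, 0);
  close(fd); // the mappings keep the file alive

  if (rw == MAP_FAILED || rx == MAP_FAILED) {
    if (rw != MAP_FAILED) munmap(rw, size);
    if (rx != MAP_FAILED) munmap(rx, size);
    return false;
  }

  *rwOut = rw;
  *rxOut = rx;
  return true;
}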

View File

@@ -983,9 +983,8 @@ static void generateSseSequence(BaseEmitter& emitter, InstForm form, bool emitPr
 // Generates a long sequence of AVX instructions.
 template<typename Emitter>
-static void generateAvxSequenceInternal(
+static void generateAvxSequenceInternalRegOnly(
   Emitter& cc,
-  InstForm form,
   const x86::Gp& gp,
   const x86::Vec& vecA, const x86::Vec& vecB, const x86::Vec& vecC, const x86::Vec& vecD) {
@@ -1001,7 +1000,6 @@ static void generateAvxSequenceInternal(
   x86::Ymm ymmA = vecA.ymm();
   x86::Ymm ymmB = vecB.ymm();
   x86::Ymm ymmC = vecC.ymm();
-  x86::Ymm ymmD = vecD.ymm();
 
   cc.xor_(gpd, gpd);
   cc.vxorps(xmmA, xmmA, xmmA);
@@ -1009,7 +1007,6 @@ static void generateAvxSequenceInternal(
   cc.vxorps(xmmC, xmmC, xmmC);
   cc.vxorps(xmmD, xmmD, xmmD);
 
-  if (form == InstForm::kReg) {
     cc.vaddpd(xmmA, xmmB, xmmC);
     cc.vaddpd(ymmA, ymmB, ymmC);
     cc.vaddps(xmmA, xmmB, xmmC);
@@ -1577,13 +1574,40 @@ static void generateAvxSequenceInternal(
   cc.vfnmsub231sd(xmmA, xmmB, xmmC);
   cc.vfnmsub231ss(xmmA, xmmB, xmmC);
 }
-  else {
+
+// Generates a long sequence of AVX instructions.
+template<typename Emitter>
+static void generateAvxSequenceInternalRegMem(
+  Emitter& cc,
+  const x86::Gp& gp,
+  const x86::Vec& vecA, const x86::Vec& vecB, const x86::Vec& vecC, const x86::Vec& vecD) {
+
+  x86::Gp gpd = gp.r32();
+  x86::Gp gpq = gp.r64();
+  x86::Gp gpz = cc.is32Bit() ? gpd : gpq;
+
+  x86::Xmm xmmA = vecA.xmm();
+  x86::Xmm xmmB = vecB.xmm();
+  x86::Xmm xmmC = vecC.xmm();
+  x86::Xmm xmmD = vecD.xmm();
+
+  x86::Ymm ymmA = vecA.ymm();
+  x86::Ymm ymmB = vecB.ymm();
+  x86::Ymm ymmC = vecC.ymm();
+  x86::Ymm ymmD = vecD.ymm();
+
   x86::Mem m = x86::ptr(gpz);
   x86::Mem m128 = x86::xmmword_ptr(gpz);
   x86::Mem m256 = x86::xmmword_ptr(gpz);
   x86::Mem vx_ptr = x86::ptr(gpz, xmmD);
   x86::Mem vy_ptr = x86::ptr(gpz, ymmD);
 
+  cc.xor_(gpd, gpd);
+  cc.vxorps(xmmA, xmmA, xmmA);
+  cc.vxorps(xmmB, xmmB, xmmB);
+  cc.vxorps(xmmC, xmmC, xmmC);
+  cc.vxorps(xmmD, xmmD, xmmD);
+
   cc.vaddpd(xmmA, xmmB, m);
   cc.vaddpd(ymmA, ymmB, m);
   cc.vaddps(xmmA, xmmB, m);
@@ -2087,6 +2111,19 @@ static void generateAvxSequenceInternal(
   cc.vpunpcklwd(ymmA, ymmB, m);
   cc.vpxor(ymmA, ymmB, m);
 }
 
+// Generates a long sequence of AVX instructions.
+template<typename Emitter>
+static void generateAvxSequenceInternal(
+  Emitter& cc,
+  InstForm form,
+  const x86::Gp& gp,
+  const x86::Vec& vecA, const x86::Vec& vecB, const x86::Vec& vecC, const x86::Vec& vecD) {
+
+  if (form == InstForm::kReg)
+    generateAvxSequenceInternalRegOnly(cc, gp, vecA, vecB, vecC, vecD);
+  else
+    generateAvxSequenceInternalRegMem(cc, gp, vecA, vecB, vecC, vecD);
+}
 
 static void generateAvxSequence(BaseEmitter& emitter, InstForm form, bool emitPrologEpilog) {
@@ -2153,9 +2190,8 @@ static void generateAvxSequence(BaseEmitter& emitter, InstForm form, bool emitPr
 // Generates a long sequence of AVX512 instructions.
 template<typename Emitter>
-static void generateAvx512SequenceInternal(
+static void generateAvx512SequenceInternalRegOnly(
   Emitter& cc,
-  InstForm form,
   const x86::Gp& gp,
   const x86::KReg& kA, const x86::KReg& kB, const x86::KReg& kC,
   const x86::Vec& vecA, const x86::Vec& vecB, const x86::Vec& vecC, const x86::Vec& vecD) {
@@ -2172,12 +2208,10 @@ static void generateAvx512SequenceInternal(
   x86::Ymm ymmA = vecA.ymm();
   x86::Ymm ymmB = vecB.ymm();
   x86::Ymm ymmC = vecC.ymm();
-  x86::Ymm ymmD = vecD.ymm();
 
   x86::Zmm zmmA = vecA.zmm();
   x86::Zmm zmmB = vecB.zmm();
   x86::Zmm zmmC = vecC.zmm();
-  x86::Zmm zmmD = vecD.zmm();
 
   cc.xor_(gpd, gpd);
   cc.vxorps(xmmA, xmmA, xmmA);
@@ -2185,7 +2219,6 @@ static void generateAvx512SequenceInternal(
   cc.vxorps(xmmC, xmmC, xmmC);
   cc.vxorps(xmmD, xmmD, xmmD);
 
-  if (form == InstForm::kReg) {
     cc.kaddb(kA, kB, kC);
     cc.kaddd(kA, kB, kC);
     cc.kaddq(kA, kB, kC);
@@ -3505,7 +3538,33 @@ static void generateAvx512SequenceInternal(
   cc.evex().vxorps(ymmA, ymmB, ymmC);
   cc.evex().vxorps(zmmA, zmmB, zmmC);
 }
-  else {
+
+template<typename Emitter>
+static void generateAvx512SequenceInternalRegMem(
+  Emitter& cc,
+  const x86::Gp& gp,
+  const x86::KReg& kA, const x86::KReg& kB, const x86::KReg& kC,
+  const x86::Vec& vecA, const x86::Vec& vecB, const x86::Vec& vecC, const x86::Vec& vecD) {
+
+  DebugUtils::unused(kC);
+
+  x86::Gp gpd = gp.r32();
+  x86::Gp gpq = gp.r64();
+  x86::Gp gpz = cc.is32Bit() ? gpd : gpq;
+
+  x86::Xmm xmmA = vecA.xmm();
+  x86::Xmm xmmB = vecB.xmm();
+  x86::Xmm xmmC = vecC.xmm();
+  x86::Xmm xmmD = vecD.xmm();
+
+  x86::Ymm ymmA = vecA.ymm();
+  x86::Ymm ymmB = vecB.ymm();
+  x86::Ymm ymmD = vecD.ymm();
+
+  x86::Zmm zmmA = vecA.zmm();
+  x86::Zmm zmmB = vecB.zmm();
+  x86::Zmm zmmD = vecD.zmm();
+
   x86::Mem m = x86::ptr(gpz);
   x86::Mem m32 = x86::dword_ptr(gpz);
   x86::Mem m64 = x86::qword_ptr(gpz);
@@ -3516,6 +3575,12 @@ static void generateAvx512SequenceInternal(
   x86::Mem vy_ptr = x86::ptr(gpz, ymmD);
   x86::Mem vz_ptr = x86::ptr(gpz, zmmD);
 
+  cc.xor_(gpd, gpd);
+  cc.vxorps(xmmA, xmmA, xmmA);
+  cc.vxorps(xmmB, xmmB, xmmB);
+  cc.vxorps(xmmC, xmmC, xmmC);
+  cc.vxorps(xmmD, xmmD, xmmD);
+
   cc.kmovb(kA, m);
   cc.kmovb(m, kB);
   cc.kmovd(kA, m);
@@ -4839,6 +4904,20 @@ static void generateAvx512SequenceInternal(
   cc.evex().vxorps(ymmA, ymmB, m);
   cc.evex().vxorps(zmmA, zmmB, m);
 }
 
+// Generates a long sequence of AVX512 instructions.
+template<typename Emitter>
+static void generateAvx512SequenceInternal(
+  Emitter& cc,
+  InstForm form,
+  const x86::Gp& gp,
+  const x86::KReg& kA, const x86::KReg& kB, const x86::KReg& kC,
+  const x86::Vec& vecA, const x86::Vec& vecB, const x86::Vec& vecC, const x86::Vec& vecD) {
+
+  if (form == InstForm::kReg)
+    generateAvx512SequenceInternalRegOnly(cc, gp, kA, kB, kC, vecA, vecB, vecC, vecD);
+  else
+    generateAvx512SequenceInternalRegMem(cc, gp, kA, kB, kC, vecA, vecB, vecC, vecD);
+}
 
 static void generateAvx512Sequence(BaseEmitter& emitter, InstForm form, bool emitPrologEpilog) {