diff --git a/.github/workflows/build-config.json b/.github/workflows/build-config.json index 23a9686..473b1aa 100644 --- a/.github/workflows/build-config.json +++ b/.github/workflows/build-config.json @@ -12,7 +12,7 @@ ], "tests": [ - { "optional": true, "cmd": ["asmjit_test_unit", "--quick"] }, + { "optional": true, "cmd": ["asmjit_test_runner", "--quick"] }, { "optional": true, "cmd": ["asmjit_test_environment"] }, { "optional": true, "cmd": ["asmjit_test_assembler"] }, { "optional": true, "cmd": ["asmjit_test_assembler", "--validate"] }, diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1903172..9c41ef7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -18,10 +18,10 @@ jobs: steps: - name: "Checkout" - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: "Setup node.js" - uses: actions/setup-node@v4 + uses: actions/setup-node@v5 with: node-version: "*" @@ -104,22 +104,25 @@ jobs: - { title: "linux" , host: "ubuntu-24.04" , arch: "x64" , cc: "clang-19", conf: "Release", defs: "ASMJIT_TEST=1" } - { title: "linux" , host: "ubuntu-24.04-arm", arch: "arm64" , cc: "clang-19", conf: "Debug" , defs: "ASMJIT_TEST=1" } - { title: "linux" , host: "ubuntu-24.04-arm", arch: "arm64" , cc: "clang-19", conf: "Release", defs: "ASMJIT_TEST=1" } - - { title: "macos" , host: "macos-13" , arch: "x64" , cc: "gcc-14" , conf: "Debug" , defs: "ASMJIT_TEST=1" } - - { title: "macos" , host: "macos-13" , arch: "x64" , cc: "gcc-14" , conf: "Release", defs: "ASMJIT_TEST=1" } - - { title: "macos" , host: "macos-13" , arch: "x64" , cc: "clang" , conf: "Debug" , defs: "ASMJIT_TEST=1" } - - { title: "macos" , host: "macos-13" , arch: "x64" , cc: "clang" , conf: "Release", defs: "ASMJIT_TEST=1" } - - { title: "macos" , host: "macos-14" , arch: "arm64" , cc: "clang" , conf: "Debug" , defs: "ASMJIT_TEST=1" } - - { title: "macos" , host: "macos-14" , arch: "arm64" , cc: "clang" , conf: "Release", defs: "ASMJIT_TEST=1" } + - { title: "linux" , host: "ubuntu-24.04" , arch: "x86" , cc: "clang-20", conf: "Debug" , defs: "ASMJIT_TEST=1" } + - { title: "linux" , host: "ubuntu-24.04" , arch: "x86" , cc: "clang-20", conf: "Release", defs: "ASMJIT_TEST=1" } + - { title: "linux" , host: "ubuntu-24.04" , arch: "x64" , cc: "clang-20", conf: "Debug" , defs: "ASMJIT_TEST=1" } + - { title: "linux" , host: "ubuntu-24.04" , arch: "x64" , cc: "clang-20", conf: "Release", defs: "ASMJIT_TEST=1" } + - { title: "linux" , host: "ubuntu-24.04-arm", arch: "arm64" , cc: "clang-20", conf: "Debug" , defs: "ASMJIT_TEST=1" } + - { title: "linux" , host: "ubuntu-24.04-arm", arch: "arm64" , cc: "clang-20", conf: "Release", defs: "ASMJIT_TEST=1" } + - { title: "macos" , host: "macos-15-intel" , arch: "x64" , cc: "gcc-14" , conf: "Debug" , defs: "ASMJIT_TEST=1" } + - { title: "macos" , host: "macos-15-intel" , arch: "x64" , cc: "gcc-14" , conf: "Release", defs: "ASMJIT_TEST=1" } + - { title: "macos" , host: "macos-15-intel" , arch: "x64" , cc: "clang" , conf: "Debug" , defs: "ASMJIT_TEST=1" } + - { title: "macos" , host: "macos-15-intel" , arch: "x64" , cc: "clang" , conf: "Release", defs: "ASMJIT_TEST=1" } + - { title: "macos" , host: "macos-15" , arch: "arm64" , cc: "clang" , conf: "Debug" , defs: "ASMJIT_TEST=1" } + - { title: "macos" , host: "macos-15" , arch: "arm64" , cc: "clang" , conf: "Release", defs: "ASMJIT_TEST=1" } - { title: "windows" , host: "windows-2022" , arch: "x86" , cc: "vs2022" , conf: "Debug" , defs: "ASMJIT_TEST=1" } - { title: "windows" , host: "windows-2022" , arch: 
"x86" , cc: "vs2022" , conf: "Release", defs: "ASMJIT_TEST=1" } - { title: "windows" , host: "windows-2022" , arch: "x64" , cc: "vs2022" , conf: "Debug" , defs: "ASMJIT_TEST=1" } - { title: "windows" , host: "windows-2022" , arch: "x64" , cc: "vs2022" , conf: "Release", defs: "ASMJIT_TEST=1" } - { title: "windows" , host: "windows-11-arm" , arch: "arm64" , cc: "vs2022" , conf: "Debug" , defs: "ASMJIT_TEST=1" } - { title: "windows" , host: "windows-11-arm" , arch: "arm64" , cc: "vs2022" , conf: "Release", defs: "ASMJIT_TEST=1" } - - # Cross compiled, cannot run tests (Windows/UWP). - { title: "windows/uwp" , host: "windows-2022" , arch: "x64" , cc: "vs2022" , conf: "Release", defs: "ASMJIT_TEST=0,CMAKE_SYSTEM_NAME=WindowsStore,CMAKE_SYSTEM_VERSION=10.0,CMAKE_CXX_FLAGS=-D_WIN32_WINNT=0x0A00" } - - { title: "freebsd" , host: "ubuntu-latest" , arch: "x64" , cc: "clang" , conf: "Release", vm: "freebsd", vm_ver: "14.2", defs: "ASMJIT_TEST=1" } - { title: "freebsd" , host: "ubuntu-latest" , arch: "arm64" , cc: "clang" , conf: "Release", vm: "freebsd", vm_ver: "14.2", defs: "ASMJIT_TEST=1" } - { title: "netbsd" , host: "ubuntu-latest" , arch: "x64" , cc: "clang" , conf: "Release", vm: "netbsd" , vm_ver: "10.1", defs: "ASMJIT_TEST=1" } @@ -135,18 +138,18 @@ jobs: steps: - name: "Checkout" - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: path: "source" - name: "Checkout Build Actions" - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: repository: build-actions/build-actions path: "build-actions" - name: "Python" - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: "3.x" diff --git a/CMakeLists.txt b/CMakeLists.txt index d948218..064b255 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,10 +1,18 @@ -cmake_minimum_required(VERSION 3.19 FATAL_ERROR) +# AsmJit +# ====== + +# To consume asmjit as a dependency, use asmjit::asmjit alias. + +cmake_minimum_required(VERSION 3.24 FATAL_ERROR) # Don't create a project if it was already created by another CMakeLists.txt. This makes # it possible to support both add_subdirectory() and include() ways of using AsmJit as a # dependency. if (NOT CMAKE_PROJECT_NAME OR "${CMAKE_PROJECT_NAME}" STREQUAL "asmjit") - project(asmjit CXX) + project(asmjit + LANGUAGES CXX + DESCRIPTION "Low-latency machine code generation" + HOMEPAGE_URL "https://asmjit.com") endif() include(CheckCXXCompilerFlag) @@ -194,7 +202,7 @@ function(asmjit_detect_sanitizers out) set(${out} "${_out_array}" PARENT_SCOPE) endfunction() -function(asmjit_add_target target target_type) +function(asmjit_addapp target target_type) set(single_val "") set(multi_val SOURCES LIBRARIES CFLAGS CFLAGS_DBG CFLAGS_REL) cmake_parse_arguments("X" "" "${single_val}" "${multi_val}" ${ARGN}) @@ -227,47 +235,41 @@ set(ASMJIT_INCLUDE_DIR "${ASMJIT_INCLUDE_DIRS}") if (NOT ASMJIT_NO_CUSTOM_FLAGS) if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" OR "x${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "xMSVC") - list(APPEND ASMJIT_PRIVATE_CFLAGS - -MP # [+] Multi-Process Compilation. - -GF # [+] Eliminate duplicate strings. - -Zc:__cplusplus # [+] Conforming __cplusplus definition. - -Zc:inline # [+] Remove unreferenced COMDAT. - -Zc:strictStrings # [+] Strict const qualification of string literals. - -Zc:threadSafeInit- # [-] Thread-safe statics. - -W4) # [+] Warning level 4. + list(APPEND ASMJIT_PRIVATE_CFLAGS -W4) # [+] Warning level 4. - list(APPEND ASMJIT_PRIVATE_CFLAGS_DBG - -GS) # [+] Buffer security-check. 
+ list(APPEND ASMJIT_PRIVATE_CFLAGS -MP) # [+] Multi-Process Compilation. + list(APPEND ASMJIT_PRIVATE_CFLAGS -GF) # [+] Eliminate duplicate strings. + list(APPEND ASMJIT_PRIVATE_CFLAGS -Zc:__cplusplus) # [+] Conforming __cplusplus definition. + list(APPEND ASMJIT_PRIVATE_CFLAGS -Zc:inline) # [+] Remove unreferenced COMDAT. + list(APPEND ASMJIT_PRIVATE_CFLAGS -Zc:strictStrings) # [+] Strict const qualification of string literals. + list(APPEND ASMJIT_PRIVATE_CFLAGS -Zc:threadSafeInit-) # [-] Thread-safe statics. - list(APPEND ASMJIT_PRIVATE_CFLAGS_REL - -GS- # [-] Buffer security-check. - -O2 # [+] Favor speed over size. - -Oi) # [+] Generate intrinsic functions. - elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "^(GNU|Clang|AppleClang)$") - list(APPEND ASMJIT_PRIVATE_CFLAGS -Wall -Wextra -Wconversion) - list(APPEND ASMJIT_PRIVATE_CFLAGS -fno-math-errno) - list(APPEND ASMJIT_PRIVATE_CFLAGS_REL -O2) + list(APPEND ASMJIT_PRIVATE_CFLAGS_DBG -GS) # [+] Buffer security-check. + list(APPEND ASMJIT_PRIVATE_CFLAGS_REL -GS-) # [-] Buffer security-check. + list(APPEND ASMJIT_PRIVATE_CFLAGS_REL -O2) # [+] Favor speed over size. + list(APPEND ASMJIT_PRIVATE_CFLAGS_REL -Oi) # [+] Generate intrinsic functions. + elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU|Clang") + list(APPEND ASMJIT_PRIVATE_CFLAGS -Wall -Wextra -Wconversion) # [+] Add baseline warnings that can be used safely even with system headers. + asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -Wdouble-promotion) # [+] Warn about double promotions. + asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -Wduplicated-cond) # [+] Warn about duplicate conditions. + asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -Wduplicated-branches) # [+] Warn about duplicate branches. + asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -Wlogical-op) # [+] Warn about suspicious uses of logical operators in expressions. + asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -Wlogical-not-parentheses) # [+] Warn about logical not used on the left hand side operand of a comparison. + asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -Wrestrict) - # We would like also '-Wzero-as-null-pointer-constant' but it would warn when it comes to system headers. - asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS - -Wdouble-promotion - -Wduplicated-cond - -Wduplicated-branches - -Wlogical-op - -Wrestrict - ) + list(APPEND ASMJIT_PRIVATE_CFLAGS -fno-math-errno) # [-] Disable math functions setting errno (performance reasons). + list(APPEND ASMJIT_PRIVATE_CFLAGS -fno-threadsafe-statics) # [-] Don't add guards when initializing statics (we don't need it). + list(APPEND ASMJIT_PRIVATE_CFLAGS_REL -O2) # [+] Compiling with -O2 in release mode is what we generally want. + list(APPEND ASMJIT_PRIVATE_CFLAGS_REL -fmerge-all-constants) # [+] We don't need unique address per constant (merging improves library size). # -fno-semantic-interposition is not available on apple - the compiler issues a warning, which is not detected. - if (APPLE) - asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -fno-threadsafe-statics) - else() - asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -fno-threadsafe-statics -fno-semantic-interposition) + if (NOT APPLE) + asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -fno-semantic-interposition) endif() - # The following flags can save few bytes in the resulting binary. - asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS_REL - -fmerge-all-constants # Merge all constants even if it violates ISO C++. - -fno-enforce-eh-specs) # Don't enforce termination if noexcept function throws. 
+ if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "iOS") + asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS_REL -fno-enforce-eh-specs) # [-] Don't enforce termination if noexcept function throws. + endif() endif() endif() @@ -547,9 +549,10 @@ set(ASMJIT_SRC_LIST asmjit/x86/x86rapass_p.h asmjit/ujit/ujitbase.h + asmjit/ujit/unicompiler.h asmjit/ujit/unicompiler_a64.cpp asmjit/ujit/unicompiler_x86.cpp - asmjit/ujit/unicompiler.h + asmjit/ujit/unicompiler_utils_p.h asmjit/ujit/uniop.h asmjit/ujit/vecconsttable.cpp asmjit/ujit/vecconsttable.h @@ -592,12 +595,12 @@ message(" ASMJIT_PRIVATE_CFLAGS_REL=${ASMJIT_PRIVATE_CFLAGS_REL}") if (NOT ASMJIT_EMBED) # Add AsmJit target. - asmjit_add_target(asmjit "${ASMJIT_TARGET_TYPE}" - SOURCES ${ASMJIT_SRC} - LIBRARIES ${ASMJIT_DEPS} - CFLAGS ${ASMJIT_PRIVATE_CFLAGS} - CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} - CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + asmjit_addapp(asmjit "${ASMJIT_TARGET_TYPE}" + SOURCES ${ASMJIT_SRC} + LIBRARIES ${ASMJIT_DEPS} + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) target_compile_options(asmjit INTERFACE ${ASMJIT_CFLAGS}) target_include_directories(asmjit BEFORE INTERFACE @@ -632,60 +635,46 @@ if (NOT ASMJIT_EMBED) enable_testing() # Special target that always uses embedded AsmJit. - asmjit_add_target(asmjit_test_unit TEST - SOURCES ${ASMJIT_SRC} - test/asmjit_test_unit.cpp - test/broken.cpp - test/broken.h - LIBRARIES ${ASMJIT_DEPS} - CFLAGS ${ASMJIT_PRIVATE_CFLAGS} - -DASMJIT_TEST - -DASMJIT_STATIC - CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} - CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) - target_include_directories(asmjit_test_unit BEFORE PRIVATE ${ASMJIT_INCLUDE_DIRS}) + asmjit_addapp(asmjit_test_runner TEST + SOURCES ${ASMJIT_SRC} + testing/tests/asmjit_test_runner.cpp + testing/tests/broken.cpp + testing/tests/broken.h + LIBRARIES ${ASMJIT_DEPS} + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} + -DASMJIT_TEST + -DASMJIT_STATIC + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + target_include_directories(asmjit_test_runner BEFORE PRIVATE ${ASMJIT_INCLUDE_DIRS}) - asmjit_add_target(asmjit_test_assembler TEST - SOURCES test/asmjit_test_assembler.cpp - test/asmjit_test_assembler.h - test/asmjit_test_assembler_a64.cpp - test/asmjit_test_assembler_x64.cpp - test/asmjit_test_assembler_x86.cpp - LIBRARIES asmjit::asmjit - CFLAGS ${ASMJIT_PRIVATE_CFLAGS} - CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} - CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + asmjit_addapp(asmjit_test_assembler TEST + SOURCES testing/tests/asmjit_test_assembler.cpp + testing/tests/asmjit_test_assembler.h + testing/tests/asmjit_test_assembler_a64.cpp + testing/tests/asmjit_test_assembler_x64.cpp + testing/tests/asmjit_test_assembler_x86.cpp + LIBRARIES asmjit::asmjit + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) - asmjit_add_target(asmjit_bench_codegen EXECUTABLE - SOURCES test/asmjit_bench_codegen.cpp - test/asmjit_bench_codegen_a64.cpp - test/asmjit_bench_codegen_x86.cpp - SOURCES test/asmjit_bench_codegen.h - LIBRARIES asmjit::asmjit - CFLAGS ${ASMJIT_PRIVATE_CFLAGS} - CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} - CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) - - foreach(_target asmjit_bench_overhead - asmjit_bench_regalloc - asmjit_test_environment - asmjit_test_emitters - asmjit_test_x86_sections) - asmjit_add_target(${_target} TEST - SOURCES test/${_target}.cpp - LIBRARIES asmjit::asmjit - CFLAGS ${ASMJIT_PRIVATE_CFLAGS} - 
CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} - CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + foreach(app asmjit_test_environment asmjit_test_emitters asmjit_test_x86_sections) + asmjit_addapp(${app} TEST + SOURCES testing/tests/${app}.cpp + LIBRARIES asmjit::asmjit + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) endforeach() if (NOT ASMJIT_NO_INTROSPECTION) - asmjit_add_target(asmjit_test_instinfo TEST - SOURCES test/asmjit_test_instinfo.cpp - LIBRARIES asmjit::asmjit - CFLAGS ${ASMJIT_PRIVATE_CFLAGS} - CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} - CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + asmjit_addapp(asmjit_test_instinfo TEST + SOURCES testing/tests/asmjit_test_instinfo.cpp + LIBRARIES asmjit::asmjit + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) endif() if (NOT (ASMJIT_NO_BUILDER OR ASMJIT_NO_COMPILER)) @@ -727,28 +716,48 @@ if (NOT ASMJIT_EMBED) endif() endif() - set_property(SOURCE test/asmjit_test_unicompiler_avx2fma.cpp APPEND PROPERTY COMPILE_OPTIONS ${ASMJIT_AVX2FMA_CFLAGS}) - - asmjit_add_target(asmjit_test_compiler TEST - SOURCES test/asmjit_test_compiler.cpp - test/asmjit_test_compiler.h - test/asmjit_test_compiler_a64.cpp - test/asmjit_test_compiler_x86.cpp - LIBRARIES asmjit::asmjit - CFLAGS ${ASMJIT_PRIVATE_CFLAGS} ${ASMJIT_SSE2_CFLAGS} - CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} - CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) - - asmjit_add_target(asmjit_test_unicompiler TEST - SOURCES test/asmjit_test_unicompiler.cpp - test/asmjit_test_unicompiler_sse2.cpp - test/asmjit_test_unicompiler_avx2fma.cpp - test/broken.cpp - LIBRARIES asmjit::asmjit - CFLAGS ${ASMJIT_PRIVATE_CFLAGS} ${ASMJIT_SSE2_CFLAGS} - CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} - CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + asmjit_addapp(asmjit_test_compiler TEST + SOURCES testing/tests/asmjit_test_compiler.cpp + testing/tests/asmjit_test_compiler.h + testing/tests/asmjit_test_compiler_a64.cpp + testing/tests/asmjit_test_compiler_x86.cpp + LIBRARIES asmjit::asmjit + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} ${ASMJIT_SSE2_CFLAGS} + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) endif() + if (NOT ASMJIT_NO_UJIT) + asmjit_addapp(asmjit_test_unicompiler TEST + SOURCES testing/tests/asmjit_test_unicompiler.cpp + testing/tests/asmjit_test_unicompiler_sse2.cpp + testing/tests/asmjit_test_unicompiler_avx2fma.cpp + testing/tests/broken.cpp + LIBRARIES asmjit::asmjit + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} ${ASMJIT_SSE2_CFLAGS} + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + set_property(SOURCE testing/tests/asmjit_test_unicompiler_avx2fma.cpp APPEND PROPERTY COMPILE_OPTIONS ${ASMJIT_AVX2FMA_CFLAGS}) + endif() + + asmjit_addapp(asmjit_bench_codegen EXECUTABLE + SOURCES testing/bench/asmjit_bench_codegen.cpp + testing/bench/asmjit_bench_codegen_a64.cpp + testing/bench/asmjit_bench_codegen_x86.cpp + SOURCES testing/bench/asmjit_bench_codegen.h + LIBRARIES asmjit::asmjit + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + + foreach(app asmjit_bench_overhead asmjit_bench_regalloc) + asmjit_addapp(${app} TEST + SOURCES testing/bench/${app}.cpp + LIBRARIES asmjit::asmjit + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + endforeach() + endif() endif() diff --git a/LICENSE.md b/LICENSE.md index e01395c..7818b21 
100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,4 +1,4 @@ -Copyright (c) 2008-2025 The AsmJit Authors +Copyright (c) 2008-2025 Petr Kobalicek This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages diff --git a/README.md b/README.md index 805930d..4f416b3 100644 --- a/README.md +++ b/README.md @@ -17,10 +17,12 @@ Project Organization * **src** - Source code * **asmjit** - Source code and headers (always point include path in here) * **core** - Core API, backend independent except relocations - * **arm** - ARM specific API, used only by ARM and AArch64 backends + * **arm** - ARM specific API, designed to be common for both AArch32 and AArch64 + * **a64** - AArch64 specific API, used only by AArch64 backends * **x86** - X86 specific API, used only by X86 and X64 backends + * **ujit** - Universal JIT API * **test** - Unit and integration tests (don't embed in your project) - * **tools** - Tools used for configuring, documenting, and generating files + * **tools** - Tools used to regenerate generated files (instruction DB, enum strings) Roadmap ------- @@ -38,6 +40,11 @@ Documentation * [Documentation Index](https://asmjit.com/doc/index.html) * [Build Instructions](https://asmjit.com/doc/group__asmjit__build.html) (includes [CMake Integration](https://asmjit.com/doc/group__asmjit__build.html#cmake_integration)) +Development & Testing +--------------------- + + * Basic configure scripts that invoke cmake are provided in the project root. + Breaking Changes ---------------- diff --git a/configure.sh b/configure.sh new file mode 100755 index 0000000..659d6dc --- /dev/null +++ b/configure.sh @@ -0,0 +1,11 @@ +#!/bin/sh + +BUILD_OPTIONS="-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DASMJIT_TEST=1" + +echo "== [Configuring Build - Debug] ==" +eval cmake . -B build/Debug -DCMAKE_BUILD_TYPE=Debug ${BUILD_OPTIONS} "$@" +echo "" + +echo "== [Configuring Build - Release] ==" +eval cmake . -B build/Release -DCMAKE_BUILD_TYPE=Release ${BUILD_OPTIONS} "$@" +echo "" diff --git a/configure_sanitizers.sh b/configure_sanitizers.sh new file mode 100755 index 0000000..2634bcd --- /dev/null +++ b/configure_sanitizers.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +BUILD_OPTIONS="-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DASMJIT_TEST=1" + +echo "== [Configuring Build - Release_ASAN] ==" +eval cmake . -B build/Release_ASAN ${BUILD_OPTIONS} -DCMAKE_BUILD_TYPE=Release -DASMJIT_SANITIZE=address "$@" +echo "" + +echo "== [Configuring Build - Release_MSAN] ==" +eval cmake . -B build/Release_MSAN ${BUILD_OPTIONS} -DCMAKE_BUILD_TYPE=Release -DASMJIT_SANITIZE=memory "$@" +echo "" + +echo "== [Configuring Build - Release_UBSAN] ==" +eval cmake . -B build/Release_UBSAN ${BUILD_OPTIONS} -DCMAKE_BUILD_TYPE=Release -DASMJIT_SANITIZE=undefined "$@" +echo "" diff --git a/configure_vs2022_x64.bat b/configure_vs2022_x64.bat new file mode 100644 index 0000000..9bbadd7 --- /dev/null +++ b/configure_vs2022_x64.bat @@ -0,0 +1,2 @@ +@echo off +cmake . -B build_x64 -G"Visual Studio 17" -A x64 -DASMJIT_TEST=ON diff --git a/configure_vs2022_x86.bat b/configure_vs2022_x86.bat new file mode 100644 index 0000000..8c123a0 --- /dev/null +++ b/configure_vs2022_x86.bat @@ -0,0 +1,2 @@ +@echo off +cmake .
-B build_x86 -G"Visual Studio 17" -A Win32 -DASMJIT_TEST=ON diff --git a/db/isa_x86.json b/db/isa_x86.json index 657ec7c..b4bdaa8 100644 --- a/db/isa_x86.json +++ b/db/isa_x86.json @@ -3633,22 +3633,22 @@ {"apx": "and{nf} W:r8, R:r8/m8, imm8" , "op": "[VM ] EVEX.ND=1.LLZ.NP.MAP4.WIG 80 /4 ib" , "io": "OF=0 SF=W ZF=W AF=U PF=W CF=0"}, {"apx": "and{nf} W:rv, R:rv/mv, imms8" , "op": "[VM ] EVEX.ND=1.LLZ.Pv.MAP4.Wv 83 /4 ib" , "io": "OF=0 SF=W ZF=W AF=U PF=W CF=0"}, {"apx": "and{nf} W:rv, R:rv/mv, immv" , "op": "[VM ] EVEX.ND=1.LLZ.Pv.MAP4.Wv 81 /4 iv" , "io": "OF=0 SF=W ZF=W AF=U PF=W CF=0"}, - {"apx": "cmovb W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, - {"apx": "cmovbe W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, - {"apx": "cmovl W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, - {"apx": "cmovle W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, - {"apx": "cmovnb W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, - {"apx": "cmovnbe W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, - {"apx": "cmovnl W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, - {"apx": "cmovnle W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, - {"apx": "cmovno W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, - {"apx": "cmovnp W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, - {"apx": "cmovns W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, - {"apx": "cmovnz W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 45 /r" , "io": "ZF=R"}, - {"apx": "cmovo W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 40 /r" , "io": "OF=R"}, - {"apx": "cmovp W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, - {"apx": "cmovs W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, - {"apx": "cmovz W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, + {"apx": "cmovb X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, + {"apx": "cmovbe X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, + {"apx": "cmovl X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, + {"apx": "cmovle X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, + {"apx": "cmovnb X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, + {"apx": "cmovnbe X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, + {"apx": "cmovnl X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, + {"apx": "cmovnle X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, + {"apx": "cmovno X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, + {"apx": "cmovnp X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, + {"apx": "cmovns X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, + {"apx": "cmovnz X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 45 /r" , "io": 
"ZF=R"}, + {"apx": "cmovo X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 40 /r" , "io": "OF=R"}, + {"apx": "cmovp X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, + {"apx": "cmovs X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, + {"apx": "cmovz X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, {"any": "crc32 X:r32, R:r8/m8" , "op": "[RM ] EVEX.ND=0.LLZ.NP.MAP4.W0 F0 /r"}, {"any": "crc32 X:r64, R:r8/m8" , "op": "[RM ] EVEX.ND=0.LLZ.NP.MAP4.W1 F0 /r"}, {"any": "crc32 X:r32, R:r16/m16" , "op": "[RM ] EVEX.ND=0.LLZ.66.MAP4.W0 F1 /r"}, @@ -3970,69 +3970,69 @@ {"x64": "ccmpz dfv, R:r8/m8, imm8" , "op": "[M ] EVEX.ND=0.SCC=4.LLZ.NP.MAP4.WIG 80 /7 ib" }, {"x64": "ccmpz dfv, R:rv/mv, imms8" , "op": "[M ] EVEX.ND=0.SCC=4.LLZ.Pv.MAP4.Wv 83 /7 ib" }, {"x64": "ccmpz dfv, R:rv/mv, immv" , "op": "[M ] EVEX.ND=0.SCC=4.LLZ.Pv.MAP4.Wv 81 /7 iv" }, - {"x64": "cfcmovb W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, - {"x64": "cfcmovb W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, - {"x64": "cfcmovb W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, + {"x64": "cfcmovb X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, + {"x64": "cfcmovb X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, + {"x64": "cfcmovb X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, {"x64": "cfcmovb W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, - {"x64": "cfcmovbe W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, - {"x64": "cfcmovbe W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, - {"x64": "cfcmovbe W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, + {"x64": "cfcmovbe X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, + {"x64": "cfcmovbe X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, + {"x64": "cfcmovbe X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, {"x64": "cfcmovbe W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, - {"x64": "cfcmovl W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, - {"x64": "cfcmovl W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, - {"x64": "cfcmovl W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, + {"x64": "cfcmovl X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, + {"x64": "cfcmovl X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, + {"x64": "cfcmovl X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, {"x64": "cfcmovl W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, - {"x64": "cfcmovle W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, - {"x64": "cfcmovle W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, - {"x64": "cfcmovle W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, + {"x64": "cfcmovle X:rv, 
R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, + {"x64": "cfcmovle X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, + {"x64": "cfcmovle X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, {"x64": "cfcmovle W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, - {"x64": "cfcmovnb W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, - {"x64": "cfcmovnb W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, - {"x64": "cfcmovnb W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, + {"x64": "cfcmovnb X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, + {"x64": "cfcmovnb X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, + {"x64": "cfcmovnb X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, {"x64": "cfcmovnb W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, - {"x64": "cfcmovnbe W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, - {"x64": "cfcmovnbe W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, - {"x64": "cfcmovnbe W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, + {"x64": "cfcmovnbe X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, + {"x64": "cfcmovnbe X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, + {"x64": "cfcmovnbe X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, {"x64": "cfcmovnbe W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, - {"x64": "cfcmovnl W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, - {"x64": "cfcmovnl W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, - {"x64": "cfcmovnl W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, + {"x64": "cfcmovnl X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, + {"x64": "cfcmovnl X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, + {"x64": "cfcmovnl X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, {"x64": "cfcmovnl W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, - {"x64": "cfcmovnle W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, - {"x64": "cfcmovnle W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, - {"x64": "cfcmovnle W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, + {"x64": "cfcmovnle X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, + {"x64": "cfcmovnle X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, + {"x64": "cfcmovnle X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, {"x64": "cfcmovnle W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, - {"x64": "cfcmovno W:rv, R:rv/mv" , "op": "[RM ] 
EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, - {"x64": "cfcmovno W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, - {"x64": "cfcmovno W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, + {"x64": "cfcmovno X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, + {"x64": "cfcmovno X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, + {"x64": "cfcmovno X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, {"x64": "cfcmovno W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, - {"x64": "cfcmovnp W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, - {"x64": "cfcmovnp W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, - {"x64": "cfcmovnp W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, + {"x64": "cfcmovnp X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, + {"x64": "cfcmovnp X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, + {"x64": "cfcmovnp X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, {"x64": "cfcmovnp W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, - {"x64": "cfcmovns W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, - {"x64": "cfcmovns W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, - {"x64": "cfcmovns W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, + {"x64": "cfcmovns X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, + {"x64": "cfcmovns X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, + {"x64": "cfcmovns X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, {"x64": "cfcmovns W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, - {"x64": "cfcmovnz W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 45 /r" , "io": "ZF=R"}, - {"x64": "cfcmovnz W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 45 /r" , "io": "ZF=R"}, - {"x64": "cfcmovnz W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 45 /r" , "io": "ZF=R"}, + {"x64": "cfcmovnz X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 45 /r" , "io": "ZF=R"}, + {"x64": "cfcmovnz X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 45 /r" , "io": "ZF=R"}, + {"x64": "cfcmovnz X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 45 /r" , "io": "ZF=R"}, {"x64": "cfcmovnz W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 45 /r" , "io": "ZF=R"}, - {"x64": "cfcmovo W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 40 /r" , "io": "OF=R"}, - {"x64": "cfcmovo W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 40 /r" , "io": "OF=R"}, - {"x64": "cfcmovo W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 40 /r" , "io": "OF=R"}, + {"x64": "cfcmovo X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 40 /r" , "io": "OF=R"}, + {"x64": "cfcmovo X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 40 /r" , "io": "OF=R"}, + {"x64": "cfcmovo X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 40 /r" , "io": "OF=R"}, {"x64": "cfcmovo W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 40 /r" , "io": 
"OF=R"}, - {"x64": "cfcmovp W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, - {"x64": "cfcmovp W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, - {"x64": "cfcmovp W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, + {"x64": "cfcmovp X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, + {"x64": "cfcmovp X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, + {"x64": "cfcmovp X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, {"x64": "cfcmovp W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, - {"x64": "cfcmovs W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, - {"x64": "cfcmovs W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, - {"x64": "cfcmovs W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, + {"x64": "cfcmovs X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, + {"x64": "cfcmovs X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, + {"x64": "cfcmovs X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, {"x64": "cfcmovs W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, - {"x64": "cfcmovz W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, - {"x64": "cfcmovz W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, - {"x64": "cfcmovz W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, + {"x64": "cfcmovz X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, + {"x64": "cfcmovz X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, + {"x64": "cfcmovz X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, {"x64": "cfcmovz W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, {"x64": "ctestb dfv, R:r8/m8, R:r8" , "op": "[MR ] EVEX.ND=0.SCC=2.LLZ.NP.MAP4.WIG 84 /r" }, {"x64": "ctestb dfv, R:rv/mv, R:rv" , "op": "[MR ] EVEX.ND=0.SCC=2.LLZ.Pv.MAP4.Wv 85 /r" }, diff --git a/src/asmjit/arm/a64assembler.cpp b/src/asmjit/arm/a64assembler.cpp index 990a395..82af1f7 100644 --- a/src/asmjit/arm/a64assembler.cpp +++ b/src/asmjit/arm/a64assembler.cpp @@ -4491,14 +4491,14 @@ Case_BaseLdurStur: goto InvalidInstruction; uint32_t x = o0.as().is_gp64(); - uint32_t gpMustBeX = uint32_t(size_op.size() >= 3u - op_data.is_signed); + uint32_t gp_must_be_x = uint32_t(size_op.size() >= 3u - op_data.is_signed); if (op_data.is_signed) { - if (gpMustBeX && !x) + if (gp_must_be_x && !x) goto InvalidInstruction; } else { - if (x != gpMustBeX) + if (x != gp_must_be_x) goto InvalidInstruction; } diff --git a/src/asmjit/arm/a64compiler.h b/src/asmjit/arm/a64compiler.h index 17f82b4..2d09c5b 100644 --- a/src/asmjit/arm/a64compiler.h +++ b/src/asmjit/arm/a64compiler.h @@ -37,75 +37,87 @@ public: //! \name Virtual Registers //! \{ - //! \cond INTERNAL - template - ASMJIT_INLINE_NODEBUG RegT _new_reg_internal(const Type& type) { - RegT reg(Globals::NoInit); - _new_reg(Out{reg}, type, nullptr); - return reg; - } + //! Creates a new general-purpose register with `type_id` type and optional name passed via `args`. + //! + //! \note Using \ref TypeId is too generic. 
In general it's recommended to use \ref new_gp32(), + //! \ref new_gp64(), and \ref new_gpz() or \ref new_gp_ptr(). + template + ASMJIT_INLINE_NODEBUG Gp new_gp(TypeId type_id, Args&&... args) { return new_reg(type_id, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG RegT _new_reg_internal(const Type& type, const char* s) { -#ifndef ASMJIT_NO_LOGGING - RegT reg(Globals::NoInit); - _new_reg(Out{reg}, type, s); - return reg; -#else - Support::maybe_unused(s); - return _new_reg_internal(type); -#endif - } + //! Creates a new vector register with `type_id` type and optional name passed via `args`. + //! + //! \note Using \ref TypeId is too generic. In general it's recommended to use \ref new_vec128(), + //! \ref new_vec_s(), \ref new_vec_d(), \ref new_vec_q(), ... + template + ASMJIT_INLINE_NODEBUG Vec new_vec(TypeId type_id, Args&&... args) { return new_reg(type_id, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG RegT _new_reg_internal(const Type& type, const char* s, Args&&... args) { -#ifndef ASMJIT_NO_LOGGING - RegT reg(Globals::NoInit); - _new_reg_fmt(Out{reg}, type, s, std::forward(args)...); - return reg; -#else - Support::maybe_unused(s, std::forward(args)...); - return _new_reg_internal(type); -#endif - } - //! \endcond + //! Creates a new 32-bit general purpose register mapped to low 32 bits of a full register (on 64-bit targets). + template + ASMJIT_INLINE_NODEBUG Gp new_gp32(Args&&... args) { return new_reg(TypeId::kUInt32, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG RegT new_similar_reg(const RegT& ref, Args&&... args) { - return _new_reg_internal(ref, std::forward(args)...); - } + //! Creates a new 64-bit general purpose register. + template + ASMJIT_INLINE_NODEBUG Gp new_gp64(Args&&... args) { return new_reg(TypeId::kUInt64, std::forward(args)...); } + //! Creates a new 32-bit general purpose register. + //! + //! \note This is a convenience function alias of \ref new_gp32(). template - ASMJIT_INLINE_NODEBUG Reg new_reg(TypeId type_id, Args&&... args) { return _new_reg_internal(type_id, std::forward(args)...); } + ASMJIT_INLINE_NODEBUG Gp new_gpw(Args&&... args) { return new_reg(TypeId::kUIntPtr, std::forward(args)...); } + //! Creates a new 64-bit general purpose register. + //! + //! \note This is a convenience function alias of \ref new_gp64(). template - ASMJIT_INLINE_NODEBUG Gp new_gp(TypeId type_id, Args&&... args) { return _new_reg_internal(type_id, std::forward(args)...); } + ASMJIT_INLINE_NODEBUG Gp new_gpx(Args&&... args) { return new_reg(TypeId::kUIntPtr, std::forward(args)...); } + //! Creates a new 32-bit or 64-bit general purpose register depending on the target register width. + //! + //! \note This is a convenience function, on aarch64 target it always creates a 64-bit general-purpose register. template - ASMJIT_INLINE_NODEBUG Gp new_gp32(Args&&... args) { return _new_reg_internal(TypeId::kUInt32, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG Gp new_gp64(Args&&... args) { return _new_reg_internal(TypeId::kUInt64, std::forward(args)...); } + ASMJIT_INLINE_NODEBUG Gp new_gpz(Args&&... args) { return new_reg(TypeId::kUIntPtr, std::forward(args)...); } + //! Creates a new 32-bit or 64-bit general purpose register depending on the target register width. + //! + //! \note This is a convenience function, on aarch64 target it always creates a 64-bit general-purpose register. template - ASMJIT_INLINE_NODEBUG Gp new_gpw(Args&&... 
args) { return _new_reg_internal(TypeId::kUInt32, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG Gp new_gpx(Args&&... args) { return _new_reg_internal(TypeId::kUInt64, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG Gp new_gpz(Args&&... args) { return _new_reg_internal(TypeId::kUIntPtr, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG Gp new_gp_ptr(Args&&... args) { return _new_reg_internal(TypeId::kUIntPtr, std::forward(args)...); } + ASMJIT_INLINE_NODEBUG Gp new_gp_ptr(Args&&... args) { return new_reg(TypeId::kUIntPtr, std::forward(args)...); } + //! Creates a new 128-bit vector register. template - ASMJIT_INLINE_NODEBUG Vec new_vec(TypeId type_id, Args&&... args) { return _new_reg_internal(type_id, std::forward(args)...); } + ASMJIT_INLINE_NODEBUG Vec new_vec128(Args&&... args) { return new_reg(TypeId::kInt32x4, std::forward(args)...); } + //! Creates a new 128-bit vector register that will be used for scalar 32-bit floating point operation. template - ASMJIT_INLINE_NODEBUG Vec new_vec_s(Args&&... args) { return _new_reg_internal(TypeId::kFloat32, std::forward(args)...); } + ASMJIT_INLINE_NODEBUG Vec new_vec128_f32x1(Args&&... args) { return new_reg(TypeId::kFloat32x1, std::forward(args)...); } + //! Creates a new 128-bit vector register that will be used for scalar 64-bit floating point operation. template - ASMJIT_INLINE_NODEBUG Vec new_vec_d(Args&&... args) { return _new_reg_internal(TypeId::kFloat64, std::forward(args)...); } + ASMJIT_INLINE_NODEBUG Vec new_vec128_f64x1(Args&&... args) { return new_reg(TypeId::kFloat64x1, std::forward(args)...); } + //! Creates a new 128-bit vector register that will be used for packed 32-bit floating point operation. template - ASMJIT_INLINE_NODEBUG Vec new_vec_q(Args&&... args) { return _new_reg_internal(TypeId::kUInt8x16, std::forward(args)...); } + ASMJIT_INLINE_NODEBUG Vec new_vec128_f32x4(Args&&... args) { return new_reg(TypeId::kFloat32x4, std::forward(args)...); } + + //! Creates a new 128-bit vector register that will be used for packed 64-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec128_f64x2(Args&&... args) { return new_reg(TypeId::kFloat64x2, std::forward(args)...); } + + //! Creates a new 32-bit vector register (S). + //! + //! \note This may look like an alias of \ref new_vec128_f32x1(), but it's not. This really creates a 32-bit + //! register, which has a type \ref RegType::kVec32, whereas \ref new_vec128_f32x1() creates a register, + //! which has a type \ref RegType::kVec64 + template + ASMJIT_INLINE_NODEBUG Vec new_vec_s(Args&&... args) { return new_reg(TypeId::kFloat32, std::forward(args)...); } + + //! Alias of \ref new_vec128_f64x1() that matches aarch64 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_vec_d(Args&&... args) { return new_reg(TypeId::kFloat64, std::forward(args)...); } + + //! Alias of \ref new_vec128() that matches aarch64 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_vec_q(Args&&... args) { return new_reg(TypeId::kInt32x4, std::forward(args)...); } //! \} diff --git a/src/asmjit/core.h b/src/asmjit/core.h index 58c1751..7de2aa6 100644 --- a/src/asmjit/core.h +++ b/src/asmjit/core.h @@ -188,19 +188,16 @@ namespace asmjit { //! you can just use the following CMake snippet that integrates AsmJit with your own CMake project: //! //! ```cmake -//! cmake_minimum_required(VERSION 3.30) +//! cmake_minimum_required(VERSION 3.30 FATAL_ERROR) +//! project(app C CXX) //! -//! 
project(asmjit_consumer C CXX) # Both C and CXX are required. -//! set(CMAKE_CXX_STANDARD 17) # C++17 and never is supported. +//! set(ASMJIT_DIR "3rdparty/asmjit") # Location of AsmJit. +//! set(ASMJIT_STATIC TRUE) # Force static build. +//! add_subdirectory("${ASMJIT_DIR}") # Adds AsmJit sub-project to your project. //! -//! set(ASMJIT_DIR "3rdparty/asmjit") # Location of AsmJit. -//! set(ASMJIT_STATIC TRUE) # Force static build. -//! -//! add_subdirectory("${ASMJIT_DIR}") # This adds AsmJit as a part of your project. -//! -//! add_executable(asmjit_consumer asmjit_consumer.cpp) -//! target_link_libraries( -//! asmjit_consumer asmjit::asmjit) # This adds AsmJit as a dependency to your target. +//! add_executable(app asmjit_consumer.cpp) # Adds executable that uses AsmJit. +//! target_link_libraries(app asmjit::asmjit) # Adds AsmJit as a dependency to app. +//! target_compile_features(app PUBLIC cxx_std_17) # Makes C++17 as a requirement. //! ``` //! //! \section build_type Build Type Configuration @@ -2111,8 +2108,53 @@ namespace asmjit { //! \defgroup asmjit_a64 AArch64 Backend //! \brief AArch64 backend. -//! \defgroup asmjit_ujit UJIT -//! \brief Universal JIT - abstracts X86|X86_64 and AArch64 code generation. +//! \defgroup asmjit_ujit Universal JIT +//! \brief Universal JIT abstracts X86, X86_64, and AArch64 code generation. +//! +//! ### Overview +//! +//! Universal JIT (UJIT) is an abstraction that uses AsmJit's Compiler, but provides target independent API that +//! users can use to target multiple target architectures at a time. The goal of Universal JIT is not to provide +//! its own IR. Instead, it translates user calls into target-dependent instructions (or instruction sequences) +//! and allows users to switch to target-specific assembly only where required for extra performance. +//! +//! \warning UJIT is still in an experimental phase, expect minor API breaks in the future. +//! +//! API Overview +//! +//! Compiler: +//! +//! - \ref ujit::UniCompiler - UniCompiler that wraps an existing \ref ujit::BackendCompiler. +//! - \ref ujit::BackendCompiler - alias of a platform-dependent Compiler (\ref x86::Compiler or \ref a64::Compiler). +//! +//! Operands: +//! +//! - \ref ujit::Gp - alias of a platform-dependent general-purpose register (\ref x86::Gp, \ref a64::Gp). +//! - \ref ujit::Vec - alias of a platform-dependent vector register (\ref x86::Vec, \ref a64::Vec). +//! - \ref ujit::Mem - alias of a platform-dependent memory operand (\ref x86::Mem, \ref a64::Mem). +//! +//! Conditions: +//! +//! - \ref ujit::CondCode - alias of a platform-dependent condition code (\ref x86::CondCode, a64::CondCode). +//! - \ref ujit::UniCondition - platform-independent condition representation that can be used with some ujit +//! instructions. +//! +//! Instructions: +//! +//! - \ref ujit::UniOpCond - instruction that can be used by \ref ujit::UniCondition. +//! - \ref ujit::UniOpM - instruction with a single `[mem]` operand. +//! - \ref ujit::UniOpRM - instruction with `[reg, mem]` operands. +//! - \ref ujit::UniOpMR - instruction with `[mem, reg]` operands. +//! - \ref ujit::UniOpRR - instruction with `[reg, reg]` operands. +//! - \ref ujit::UniOpRRR - instruction with `[reg, reg, reg]` operands. +//! - \ref ujit::UniOpVR - instruction with `[vec, reg]` operands. +//! - \ref ujit::UniOpVM - instruction with `[vec, mem]` operands. +//! - \ref ujit::UniOpMV - instruction with `[mem, vec]` operands. +//! - \ref ujit::UniOpVV - instruction with `[vec, vec]` operands. +//! 
- \ref ujit::UniOpVVI - instruction with `[vec, vec, imm]` operands. +//! - \ref ujit::UniOpVVV - instruction with `[vec, vec, vec]` operands. +//! - \ref ujit::UniOpVVVI - instruction with `[vec, vec, vec, imm]` operands. +//! - \ref ujit::UniOpVVVV - instruction with `[vec, vec, vec, vec]` operands. //! \cond INTERNAL //! \defgroup asmjit_ra RA @@ -2122,7 +2164,16 @@ namespace asmjit { } // {asmjit} #include "asmjit-scope-begin.h" +#include "core/api-config.h" +#include "core/archcommons.h" #include "core/archtraits.h" +#include "core/arena.h" +#include "core/arenahash.h" +#include "core/arenalist.h" +#include "core/arenapool.h" +#include "core/arenastring.h" +#include "core/arenatree.h" +#include "core/arenavector.h" #include "core/assembler.h" #include "core/builder.h" #include "core/codebuffer.h" @@ -2149,13 +2200,6 @@ namespace asmjit { #include "core/target.h" #include "core/type.h" #include "core/virtmem.h" -#include "core/arena.h" -#include "core/arenahash.h" -#include "core/arenalist.h" -#include "core/arenapool.h" -#include "core/arenatree.h" -#include "core/arenastring.h" -#include "core/arenavector.h" #include "asmjit-scope-end.h" #endif // ASMJIT_CORE_H_INCLUDED diff --git a/src/asmjit/core/api-build_p.h b/src/asmjit/core/api-build_p.h index 03315fa..af56b64 100644 --- a/src/asmjit/core/api-build_p.h +++ b/src/asmjit/core/api-build_p.h @@ -68,7 +68,7 @@ // Include a unit testing package if this is a `asmjit_test_unit` build. #if defined(ASMJIT_TEST) - #include "../../../test/broken.h" + #include "../../../testing/tests/broken.h" #endif #endif // ASMJIT_CORE_API_BUILD_P_H_INCLUDED diff --git a/src/asmjit/core/api-config.h b/src/asmjit/core/api-config.h index 398fb7f..2d51778 100644 --- a/src/asmjit/core/api-config.h +++ b/src/asmjit/core/api-config.h @@ -16,7 +16,7 @@ #define ASMJIT_LIBRARY_MAKE_VERSION(major, minor, patch) ((major << 16) | (minor << 8) | (patch)) //! AsmJit library version, see \ref ASMJIT_LIBRARY_MAKE_VERSION for a version format reference. -#define ASMJIT_LIBRARY_VERSION ASMJIT_LIBRARY_MAKE_VERSION(1, 18, 1) +#define ASMJIT_LIBRARY_VERSION ASMJIT_LIBRARY_MAKE_VERSION(1, 19, 0) //! \def ASMJIT_ABI_NAMESPACE //! @@ -27,7 +27,7 @@ //! AsmJit default, which makes it possible to use multiple AsmJit libraries within a single project, totally //! controlled by users. This is useful especially in cases in which some of such library comes from third party. #if !defined(ASMJIT_ABI_NAMESPACE) - #define ASMJIT_ABI_NAMESPACE v1_18 + #define ASMJIT_ABI_NAMESPACE v1_19 #endif // !ASMJIT_ABI_NAMESPACE //! \} diff --git a/src/asmjit/core/builder.h b/src/asmjit/core/builder.h index 5e03c16..797d65c 100644 --- a/src/asmjit/core/builder.h +++ b/src/asmjit/core/builder.h @@ -21,8 +21,6 @@ #include "../core/support.h" #include "../core/type.h" -#define ASMJIT_NO_NODE_USERDATA - ASMJIT_BEGIN_NAMESPACE //! 
\addtogroup asmjit_builder diff --git a/src/asmjit/core/compiler.cpp b/src/asmjit/core/compiler.cpp index 11f0f6b..62f79b4 100644 --- a/src/asmjit/core/compiler.cpp +++ b/src/asmjit/core/compiler.cpp @@ -259,7 +259,7 @@ Error BaseCompiler::new_virt_reg(Out out, TypeId type_id, OperandSigna return Error::kOk; } -Error BaseCompiler::_new_reg(Out out, TypeId type_id, const char* name) { +Error BaseCompiler::_new_reg_with_name(Out out, TypeId type_id, const char* name) { OperandSignature reg_signature; out->reset(); @@ -276,7 +276,7 @@ Error BaseCompiler::_new_reg(Out out, TypeId type_id, const char* name) { return Error::kOk; } -Error BaseCompiler::_new_reg(Out out, const Reg& ref, const char* name) { +Error BaseCompiler::_new_reg_with_name(Out out, const Reg& ref, const char* name) { out->reset(); OperandSignature reg_signature; @@ -351,7 +351,7 @@ Error BaseCompiler::_new_reg(Out out, const Reg& ref, const char* name) { return Error::kOk; } -Error BaseCompiler::_new_reg_fmt(Out out, TypeId type_id, const char* fmt, ...) { +Error BaseCompiler::_new_reg_with_vfmt(Out out, TypeId type_id, const char* fmt, ...) { va_list ap; StringTmp<256> sb; @@ -362,7 +362,7 @@ Error BaseCompiler::_new_reg_fmt(Out out, TypeId type_id, const char* fmt, return _new_reg(out, type_id, sb.data()); } -Error BaseCompiler::_new_reg_fmt(Out out, const Reg& ref, const char* fmt, ...) { +Error BaseCompiler::_new_reg_with_vfmt(Out out, const Reg& ref, const char* fmt, ...) { va_list ap; StringTmp<256> sb; diff --git a/src/asmjit/core/compiler.h b/src/asmjit/core/compiler.h index e4cdc8b..bddf749 100644 --- a/src/asmjit/core/compiler.h +++ b/src/asmjit/core/compiler.h @@ -152,26 +152,86 @@ public: //! Creates a new virtual register representing the given `type_id` and `signature`. //! - //! \note This function is public, but it's not generally recommended to be used by AsmJit users, use architecture - //! specific `new_reg()` functionality instead or functions like \ref _new_reg() and \ref _new_reg_fmt(). + //! \note This function is public, but it's not generally recommended to be used by AsmJit users, use `new_reg()`, + //! `new_similar_reg()`, and architecture specific functions like \ref x86::Compiler::new_gp32(), etc... ASMJIT_API Error new_virt_reg(Out out, TypeId type_id, OperandSignature signature, const char* name); + //! \cond INTERNAL + //! Creates a new virtual register of the given `type_id` and stores it to `out` operand. - ASMJIT_API Error _new_reg(Out out, TypeId type_id, const char* name = nullptr); + ASMJIT_API Error _new_reg_with_name(Out out, TypeId type_id, const char* name); + //! Creates a new virtual register compatible with the provided reference register `ref`. - ASMJIT_API Error _new_reg(Out out, const Reg& ref, const char* name = nullptr); + ASMJIT_API Error _new_reg_with_name(Out out, const Reg& ref, const char* name); //! Creates a new virtual register of the given `type_id` and stores it to `out` operand. //! //! \note This version accepts a snprintf() format `fmt` followed by variadic arguments. - ASMJIT_API Error _new_reg_fmt(Out out, TypeId type_id, const char* fmt, ...); - //! \overload - ASMJIT_INLINE Error _new_reg_fmt(Out out, TypeId type_id) { return _new_reg(out, type_id); } + ASMJIT_API Error _new_reg_with_vfmt(Out out, TypeId type_id, const char* fmt, ...); //! Creates a new virtual register compatible with the provided reference register `ref`. //! //! \note This version accepts a snprintf() format `fmt` followed by variadic arguments. 
- ASMJIT_API Error _new_reg_fmt(Out out, const Reg& ref, const char* fmt, ...); + ASMJIT_API Error _new_reg_with_vfmt(Out out, const Reg& ref, const char* fmt, ...); + + template + ASMJIT_INLINE Error _new_reg(Out out, TypeId type_id) { + return _new_reg_with_name(Out(out.value()), type_id, nullptr); + } + + template + ASMJIT_INLINE Error _new_reg(Out out, TypeId type_id, const char* name_or_fmt, Args&&... args) { +#ifndef ASMJIT_NO_LOGGING + if constexpr (sizeof...(args) == 0u) { + return _new_reg_with_name(Out(out.value()), type_id, name_or_fmt); + } + else { + return _new_reg_with_vfmt(Out(out.value()), type_id, name_or_fmt, std::forward(args)...); + } +#else + Support::maybe_unused(name_or_fmt, std::forward(args)...); + return _new_reg_with_name(Out(out.value()), type_id, nullptr); +#endif + } + + template + ASMJIT_INLINE Error _new_reg(Out out, const Reg& ref) { + return _new_reg_with_name(Out(out.value()), ref, nullptr); + } + + template + ASMJIT_INLINE Error _new_reg(Out out, const Reg& ref, const char* name_or_fmt, Args&&... args) { +#ifndef ASMJIT_NO_LOGGING + if constexpr (sizeof...(args) == 0u) { + return _new_reg_with_name(Out(out.value()), ref, name_or_fmt); + } + else { + return _new_reg_with_vfmt(Out(out.value()), ref, name_or_fmt, std::forward(args)...); + } +#else + Support::maybe_unused(name_or_fmt, std::forward(args)...); + return _new_reg_with_name(Out(out.value()), ref, nullptr); +#endif + } + + //! \endcond + + template + ASMJIT_INLINE_NODEBUG RegT new_reg(TypeId type_id, Args&&... args) { + RegT reg(Globals::NoInit); + (void)_new_reg(Out(reg), type_id, std::forward(args)...); + return reg; + } + + //! Creates and returns a new register, which is similar to `ref` in terms of size and type. + //! + //! \note Optionally you can provide a name and format parameters via `args`. + template + ASMJIT_INLINE_NODEBUG RegT new_similar_reg(const RegT& ref, Args&&... args) { + RegT reg(Globals::NoInit); + (void)_new_reg(Out(reg), ref, std::forward(args)...); + return reg; + } //! Tests whether the given `virt_id` is a valid virtual register id. [[nodiscard]] diff --git a/src/asmjit/core/cpuinfo.cpp b/src/asmjit/core/cpuinfo.cpp index edd488d..2133a7d 100644 --- a/src/asmjit/core/cpuinfo.cpp +++ b/src/asmjit/core/cpuinfo.cpp @@ -593,6 +593,98 @@ static ASMJIT_FAVOR_SIZE void detect_x86_cpu(CpuInfo& cpu) noexcept { simplify_cpu_brand(cpu._brand.str); } +static ASMJIT_FAVOR_SIZE CpuHints recalculate_hints(const CpuInfo& cpu_info, const CpuFeatures::X86& features) noexcept { + CpuHints hints {}; + + // Vendor Independent CPU Hints + // ---------------------------- + + if (features.has_avx2()) { + hints |= CpuHints::kVecMaskedOps32 | CpuHints::kVecMaskedOps64; + } + + if (features.has_avx512_bw()) { + hints |= CpuHints::kVecMaskedOps8 | CpuHints::kVecMaskedOps16 | CpuHints::kVecMaskedOps32 | CpuHints::kVecMaskedOps64; + } + + // Select optimization flags based on CPU vendor and micro-architecture. + + // AMD Specific CPU Hints + // ---------------------- + + if (cpu_info.is_vendor("AMD")) { + // Zen 3+ has fast gathers, scalar loads and shuffles are faster on Zen 2 and older CPUs. + if (cpu_info.family_id() >= 0x19u) { + hints |= CpuHints::kVecFastGather; + } + + // Zen 1+ provides low-latency VPMULLD instruction. + if (features.has_avx2()) { + hints |= CpuHints::kVecFastIntMul32; + } + + // Zen 4+ provides low-latency VPMULLQ instruction. + if (features.has_avx512_dq()) { + hints |= CpuHints::kVecFastIntMul64; + } + + // Zen 4+ has fast mask operations (starts with AVX-512). 
+ if (features.has_avx512_f()) { + hints |= CpuHints::kVecMaskedStore; + } + } + + // Intel Specific CPU Hints + // ------------------------ + + if (cpu_info.is_vendor("INTEL")) { + if (features.has_avx2()) { + uint32_t family_id = cpu_info.family_id(); + uint32_t model_id = cpu_info.model_id(); + + // NOTE: We only want to hint fast gathers in cases the CPU is immune to DOWNFALL. The reason is that the + // DOWNFALL mitigation delivered via a micro-code update makes gathers almost useless in a way that scalar + // loads can beat it significantly (in Blend2D case scalar loads can offer up to 50% more performance). + // This table basically picks CPUs that are known to not be affected by DOWNFALL. + if (family_id == 0x06u) { + switch (model_id) { + case 0x8Fu: // Sapphire Rapids. + case 0x96u: // Elkhart Lake. + case 0x97u: // Alder Lake / Catlow. + case 0x9Au: // Alder Lake / Arizona Beach. + case 0x9Cu: // Jasper Lake. + case 0xAAu: // Meteor Lake. + case 0xACu: // Meteor Lake. + case 0xADu: // Granite Rapids. + case 0xAEu: // Granite Rapids. + case 0xAFu: // Sierra Forest. + case 0xBAu: // Raptor Lake. + case 0xB5u: // Arrow Lake. + case 0xB6u: // Grand Ridge. + case 0xB7u: // Raptor Lake / Catlow. + case 0xBDu: // Lunar Lake. + case 0xBEu: // Alder Lake (N). + case 0xBFu: // Raptor Lake. + case 0xC5u: // Arrow Lake. + case 0xC6u: // Arrow Lake. + case 0xCFu: // Emerald Rapids. + case 0xDDu: // Clearwater Forest. + hints |= CpuHints::kVecFastGather; + break; + + default: + break; + } + } + } + + // TODO: It seems masked stores are very expensive on consumer INTEL CPUs. + // hints |= CpuHints::kVecMaskedStore; + } + + return hints; +} + } // {x86} #endif // ASMJIT_ARCH_X86 @@ -2237,6 +2329,15 @@ static ASMJIT_FAVOR_SIZE void detect_arm_cpu(CpuInfo& cpu) noexcept { } #endif +static ASMJIT_FAVOR_SIZE CpuHints recalculate_hints(const CpuInfo& cpu_info, const CpuFeatures::ARM& features) noexcept { + Support::maybe_unused(cpu_info, features); + + // Assume ARM CPUs have fast 32-bit SIMD integer multiplication. 
+ CpuHints hints = CpuHints::kVecFastIntMul32; + + return hints; +} + } // {arm} #endif @@ -2261,8 +2362,9 @@ const CpuInfo& CpuInfo::host() noexcept { #elif ASMJIT_ARCH_ARM arm::detect_arm_cpu(cpu_info_local); #endif - cpu_info_local._hw_thread_count = detect_hw_thread_count(); + cpu_info_local.update_hints(); + cpu_info_global = cpu_info_local; cpu_info_initialized_flag.store(1, std::memory_order_seq_cst); } @@ -2270,4 +2372,15 @@ return cpu_info_global; } +CpuHints CpuInfo::recalculate_hints(const CpuInfo& info, const CpuFeatures& features) noexcept { +#if ASMJIT_ARCH_X86 + return x86::recalculate_hints(info, features.x86()); +#elif ASMJIT_ARCH_ARM + return arm::recalculate_hints(info, features.arm()); +#else + Support::maybe_unused(info, features); + return CpuHints::kNone; +#endif +} + ASMJIT_END_NAMESPACE diff --git a/src/asmjit/core/cpuinfo.h b/src/asmjit/core/cpuinfo.h index 12ea382..6225a8c 100644 --- a/src/asmjit/core/cpuinfo.h +++ b/src/asmjit/core/cpuinfo.h @@ -525,6 +525,59 @@ public: ASMJIT_X86_FEATURE(has_amx_transpose, kAMX_TRANSPOSE) #undef ASMJIT_X86_FEATURE + + ASMJIT_INLINE void remove_avx() noexcept { + remove(kAVX , + kAVX2 , + kAVX_IFMA , + kAVX_NE_CONVERT , + kAVX_VNNI , + kAVX_VNNI_INT16 , + kAVX_VNNI_INT8 , + kF16C , + kFMA , + kFMA4 , + kVAES , + kVPCLMULQDQ , + kXOP); + remove_avx512(); + } + + ASMJIT_INLINE void remove_avx512() noexcept { + remove(kAVX512_BF16 , + kAVX512_BITALG , + kAVX512_BW , + kAVX512_CD , + kAVX512_DQ , + kAVX512_F , + kAVX512_FP16 , + kAVX512_IFMA , + kAVX512_VBMI , + kAVX512_VBMI2 , + kAVX512_VL , + kAVX512_VNNI , + kAVX512_VP2INTERSECT , + kAVX512_VPOPCNTDQ , + kAMX_AVX512); + remove_avx10(); + } + + ASMJIT_INLINE void remove_avx10() noexcept { + remove(kAVX10_1 | kAVX10_2); + } + + ASMJIT_INLINE void remove_amx() noexcept { + remove(kAMX_AVX512 , + kAMX_BF16 , + kAMX_COMPLEX , + kAMX_FP16 , + kAMX_FP8 , + kAMX_INT8 , + kAMX_MOVRS , + kAMX_TF32 , + kAMX_TILE , + kAMX_TRANSPOSE); + } }; //! ARM specific features data. @@ -1104,6 +1157,39 @@ public: //! \} }; +//! Describes micro-architectural hints that can be used for optimization purposes and are not part of \ref CpuFeatures. +enum class CpuHints : uint32_t { + //! No hints. + kNone = 0x0u, + + //! CPU provides fast 8-bit masked loads and stores. + kVecMaskedOps8 = 0x00000001u, + + //! CPU provides fast 16-bit masked loads and stores. + kVecMaskedOps16 = 0x00000002u, + + //! CPU provides fast 32-bit masked loads and stores. + kVecMaskedOps32 = 0x00000004u, + + //! CPU provides fast 64-bit masked loads and stores. + kVecMaskedOps64 = 0x00000008u, + + //! CPU provides low-latency 32-bit multiplication (AMD CPUs). + kVecFastIntMul32 = 0x00000010u, + + //! CPU provides low-latency 64-bit multiplication (AMD CPUs). + kVecFastIntMul64 = 0x00000020u, + + //! CPU provides fast hardware gathers, which are faster than a sequence of loads and inserts. + kVecFastGather = 0x00000040u, + + //! CPU has fast stores with mask. + //! + //! \note This is a hint to the compiler to emit a masked store instead of a sequence having branches. + kVecMaskedStore = 0x00000080u +}; +ASMJIT_DEFINE_ENUM_FLAGS(CpuHints) + //! CPU information. class CpuInfo { public: @@ -1142,6 +1228,9 @@ public: //! CPU features. CpuFeatures _features {}; + //! CPU hints. + CpuHints _hints {}; + //! \} //! \name Construction & Destruction //! \{ @@ -1167,6 +1256,12 @@ [[nodiscard]] ASMJIT_API static const CpuInfo& host() noexcept; + //! Updates CPU hints based on the CPU data and features. 
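A rough sketch of how the hints added above are meant to be queried and kept in sync with features; the helper functions and the gather-vs-scalar decision are illustrative assumptions, only the `CpuInfo`/`CpuHints`/`CpuFeatures` calls come from the patch.

```cpp
// Sketch: querying micro-architectural hints and refreshing them after changing features.
#include <asmjit/core.h>

using namespace asmjit;

static bool prefer_hardware_gathers() {
  const CpuInfo& host = CpuInfo::host();

  // Hints are populated by the CPU detection logic (which calls update_hints()).
  CpuHints hints = host.hints();
  return (hints & CpuHints::kVecFastGather) != CpuHints::kNone;
}

static CpuInfo cpu_without_avx512f() {
  // When features are edited manually, hints must be recalculated to stay in sync.
  CpuInfo cpu = CpuInfo::host();
  cpu.remove_feature(CpuFeatures::X86::kAVX512_F);
  cpu.update_hints();
  return cpu;
}
```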
+ //! + //! \note This function is called automatically by the CPU detection logic. However, if you change the CPU features + //! in your own instance of \ref CpuInfo, CPU hints must be updated too, otherwise they would be out of sync. + ASMJIT_API static CpuHints recalculate_hints(const CpuInfo& info, const CpuFeatures& features) noexcept; + //! \} //! \name Overloaded Operators @@ -1298,6 +1393,16 @@ public: template ASMJIT_INLINE_NODEBUG void remove_feature(Args&&... args) noexcept { return _features.remove(std::forward(args)...); } + //! Returns CPU hints. + [[nodiscard]] + ASMJIT_INLINE_NODEBUG CpuHints hints() const noexcept { return _hints; } + + //! Updates CPU hints based on the CPU data and features. + //! + //! \note This function is called automatically by the CPU detection logic. However, if you change the CPU features + //! in your own instance of \ref CpuInfo, CPU hints must be updated too, otherwise they would be out of sync. + ASMJIT_INLINE void update_hints() noexcept { _hints = recalculate_hints(*this, _features); } + //! \} }; diff --git a/src/asmjit/core/emitter.h b/src/asmjit/core/emitter.h index d604c08..8da9248 100644 --- a/src/asmjit/core/emitter.h +++ b/src/asmjit/core/emitter.h @@ -668,9 +668,9 @@ public: //! \name Sections //! \{ - //! Switches the given `section`. + //! Switches to the given `section`. //! - //! Once switched, everything is added to the given `section`. + //! Once switched, everything is emitted to `section`. ASMJIT_API virtual Error section(Section* section); //! \} diff --git a/src/asmjit/core/jitallocator.cpp b/src/asmjit/core/jitallocator.cpp index e5ac4ce..c7f8e98 100644 --- a/src/asmjit/core/jitallocator.cpp +++ b/src/asmjit/core/jitallocator.cpp @@ -17,7 +17,7 @@ #include "../core/virtmem.h" #if defined(ASMJIT_TEST) -#include "../../../test/asmjit_test_random.h" +#include "../../../testing/commons/random.h" #endif // ASMJIT_TEST ASMJIT_BEGIN_NAMESPACE diff --git a/src/asmjit/core/jitruntime.cpp b/src/asmjit/core/jitruntime.cpp index a22c10e..b4ef077 100644 --- a/src/asmjit/core/jitruntime.cpp +++ b/src/asmjit/core/jitruntime.cpp @@ -15,7 +15,10 @@ JitRuntime::JitRuntime(const JitAllocator::CreateParams* params) noexcept : _allocator(params) { _environment = Environment::host(); _environment.set_object_format(ObjectFormat::kJIT); - _cpu_features = CpuInfo::host().features(); + + const CpuInfo& host_cpu = CpuInfo::host(); + _cpu_features = host_cpu.features(); + _cpu_hints = host_cpu.hints(); } JitRuntime::~JitRuntime() noexcept {} diff --git a/src/asmjit/core/support.h b/src/asmjit/core/support.h index ac28fae..3149ae1 100644 --- a/src/asmjit/core/support.h +++ b/src/asmjit/core/support.h @@ -11,9 +11,7 @@ #if defined(_MSC_VER) #include -#endif - -#if defined(__BMI2__) +#elif defined(__BMI2__) #include #endif diff --git a/src/asmjit/core/target.cpp b/src/asmjit/core/target.cpp index df8dc14..59c6fc7 100644 --- a/src/asmjit/core/target.cpp +++ b/src/asmjit/core/target.cpp @@ -10,7 +10,8 @@ ASMJIT_BEGIN_NAMESPACE Target::Target() noexcept : _environment{}, - _cpu_features{} {} + _cpu_features{}, + _cpu_hints{} {} Target::~Target() noexcept {} ASMJIT_END_NAMESPACE diff --git a/src/asmjit/core/target.h b/src/asmjit/core/target.h index 73228b6..36cbef1 100644 --- a/src/asmjit/core/target.h +++ b/src/asmjit/core/target.h @@ -25,6 +25,8 @@ public: Environment _environment; //! Target CPU features. CpuFeatures _cpu_features; + //! Target CPU hints. + CpuHints _cpu_hints; //! \name Construction & Destruction //! \{ @@ -55,6 +57,10 @@ public: //! 
Returns target CPU features. ASMJIT_INLINE_NODEBUG const CpuFeatures& cpu_features() const noexcept { return _cpu_features; } + [[nodiscard]] + //! Returns target CPU hints. + ASMJIT_INLINE_NODEBUG CpuHints cpu_hints() const noexcept { return _cpu_hints; } + //! \} }; diff --git a/src/asmjit/ujit.h b/src/asmjit/ujit.h index 3ad4bc1..78c1082 100644 --- a/src/asmjit/ujit.h +++ b/src/asmjit/ujit.h @@ -9,6 +9,7 @@ #include "asmjit-scope-begin.h" #include "ujit/ujitbase.h" #include "ujit/unicompiler.h" +#include "ujit/unicondition.h" #include "ujit/uniop.h" #include "ujit/vecconsttable.h" #include "asmjit-scope-end.h" diff --git a/src/asmjit/ujit/ujitbase.h b/src/asmjit/ujit/ujitbase.h index 3dcf3c4..ecc72cb 100644 --- a/src/asmjit/ujit/ujitbase.h +++ b/src/asmjit/ujit/ujitbase.h @@ -14,23 +14,35 @@ #if !defined(ASMJIT_NO_UJIT) +//! \namespace asmjit::ujit +//! \ingroup asmjit_ujit +//! +//! Namespace that provides all UJIT (Universal JIT) functionality. + ASMJIT_BEGIN_SUB_NAMESPACE(ujit) //! \addtogroup asmjit_ujit //! \{ +//! Backend compiler is simply an alias to a `host::Compiler`, which would be used by \ref UniCompiler. using BackendCompiler = host::Compiler; +//! Condition code is simply an alias to a `host::CondCode`. using CondCode = host::CondCode; +//! Target memory operand. using Mem = host::Mem; +//! Target general-purpose register. using Gp = host::Gp; +//! Target vector register. using Vec = host::Vec; #if defined(ASMJIT_UJIT_X86) +static ASMJIT_INLINE_NODEBUG Mem mem_ptr(const Label& label, int32_t disp = 0) noexcept { return x86::ptr(label, disp); } static ASMJIT_INLINE_NODEBUG Mem mem_ptr(const Gp& base, int32_t disp = 0) noexcept { return x86::ptr(base, disp); } static ASMJIT_INLINE_NODEBUG Mem mem_ptr(const Gp& base, const Gp& index, uint32_t shift = 0, int32_t disp = 0) noexcept { return x86::ptr(base, index, shift, disp); } #endif #if defined(ASMJIT_UJIT_AARCH64) +static ASMJIT_INLINE_NODEBUG Mem mem_ptr(const Label& label, int32_t disp = 0) noexcept { return a64::ptr(label, disp); } static ASMJIT_INLINE_NODEBUG Mem mem_ptr(const Gp& base, int32_t disp = 0) noexcept { return a64::ptr(base, disp); } static ASMJIT_INLINE_NODEBUG Mem mem_ptr(const Gp& base, const Gp& index, uint32_t shift = 0) noexcept { return a64::ptr(base, index, a64::lsl(shift)); } #endif @@ -38,6 +50,7 @@ static ASMJIT_INLINE_NODEBUG Mem mem_ptr(const Gp& base, const Gp& index, uint32 // Types & Enums // ------------- +//! Data alignment. enum class Alignment : uint32_t {}; //! The behavior of a floating point scalar operation. @@ -48,6 +61,16 @@ enum class ScalarOpBehavior : uint8_t { kPreservingVec128 }; +//! The behavior of floating point to int conversion. +enum class FloatToIntOutsideRangeBehavior : uint8_t { + //! In case that the floating point is outside of the integer range, the value is the smallest integer value, + //! which would be `0x80`, `0x8000`, `0x80000000`, or `0x8000000000000000` depending on the target integer width. + kSmallestValue, + //! In case that the floating point is outside of the integer range, the resulting integer will be saturated. If + //! the floating point is NaN, the resulting integer value would be zero. + kSaturatedValue +}; + //! The behavior of a floating point min/max instructions when comparing against NaN. enum class FMinFMaxOpBehavior : uint8_t { //! Min and max selects a finite value if one of the compared values is NaN. @@ -68,16 +91,21 @@ enum class FMAddOpBehavior : uint8_t { //! SIMD data width. enum class DataWidth : uint8_t { + //! 8-bit elements. 
k8 = 0, + //! 16-bit elements. k16 = 1, + //! 32-bit elements. k32 = 2, + //! 64-bit elements or 64-bit wide data is used. k64 = 3, + //! 128-bit elements or 128-bit wide data is used. k128 = 4 }; //! Vector register width. enum class VecWidth : uint8_t { - //! 128-bit vector register (baseline, SSE/AVX, NEON, ASIMD, etc...). + //! 128-bit vector register (baseline, SSE/AVX, NEON, etc...). k128 = 0, //! 256-bit vector register (AVX2+). k256 = 1, @@ -89,9 +117,13 @@ enum class VecWidth : uint8_t { //! Broadcast width. enum class Bcst : uint8_t { + //! Broadcast 8-bit elements. k8 = 0, + //! Broadcast 16-bit elements. k16 = 1, + //! Broadcast 32-bit elements. k32 = 2, + //! Broadcast 64-bit elements. k64 = 3, kNA = 0xFE, @@ -104,7 +136,7 @@ static ASMJIT_INLINE OperandSignature signature_of(VecWidth vw) noexcept { RegType reg_type = RegType(uint32_t(RegType::kVec128) + uint32_t(vw)); uint32_t reg_size = 16u << uint32_t(vw); - return OperandSignature::from_reg_type_and_group(reg_type, RegGroup::kVec) | OperandSignature::from_size(reg_size); + return OperandSignature::from_op_type(OperandType::kReg) | OperandSignature::from_reg_type_and_group(reg_type, RegGroup::kVec) | OperandSignature::from_size(reg_size); } static ASMJIT_INLINE TypeId type_id_of(VecWidth vw) noexcept { @@ -143,7 +175,7 @@ static ASMJIT_INLINE Vec clone_vec_as(const Vec& src, VecWidth vw) noexcept { // AsmJit Helpers // ============== -//! Operand array used by SIMD pipeline. +//! Operand array, mostly used for code generation that uses SIMD. //! //! Can hold up to `kMaxSize` registers, however, the number of actual registers is dynamic and depends //! on initialization. @@ -151,12 +183,15 @@ class OpArray { public: using Op = Operand_; + //! Maximum number of active operands `OpArray` can hold. static inline constexpr size_t kMaxSize = 8; //! \name Members //! \{ + //! Number of operands in OpArray size_t _size; + //! Underlying operand array. Operand_ v[kMaxSize]; //! \} @@ -405,6 +440,16 @@ public: ASMJIT_INLINE_NODEBUG OpArray even_odd(size_t from) const noexcept { return OpArray(*this, _size > 1u ? from : size_t(0), 2u, _size); } }; +//! Vector operand array. +//! +//! Used to model SIMD code generation where the code generator can use up to \ref OpArray::kMaxSize registers per +//! `VecArray`. The advantage of `VecArray` is that it allows to parametrize the ideal number of registers at runtime +//! and to use a single code-path to generate advanced SIMD code. +//! +//! In addition, \ref UniCompiler fully understands `VecArray` so it can be passed instead of a regular operand when +//! emitting code, which greatly simplifies designing high-performance SIMD code. +//! +//! \note VecArray is like \ref OpArray, just the whole API works with \ref Vec instead of \ref Operand_. class VecArray : public OpArray { public: //! \name Construction & Destruction @@ -587,7 +632,7 @@ static ASMJIT_INLINE void reset_var_array(T* array, size_t size) noexcept { template static ASMJIT_INLINE void reset_var_struct(T* data, size_t size = sizeof(T)) noexcept { - reset_var_array(reinterpret_cast(data), size / sizeof(asmjit::Reg)); + reset_var_array(reinterpret_cast(data), size / sizeof(Reg)); } static ASMJIT_INLINE_NODEBUG const Operand_& first_op(const Operand_& operand) noexcept { return operand; } @@ -615,10 +660,12 @@ struct Swizzle4 { ASMJIT_INLINE_CONSTEXPR bool operator!=(const Swizzle4& other) const noexcept { return value != other.value; } }; +//! Constructs a backend-independent 2-element vector swizzle parameter. 
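The swizzle helpers defined next pack their lane indices into a single immediate, with the first argument in the most significant byte and the last argument in the least significant byte. A small standalone check of that packing (hypothetical snippet that mirrors the formulas below, not code from the patch):

```cpp
// Mirrors the packing used by ujit::swizzle(b, a) and ujit::swizzle(d, c, b, a).
#include <cstdint>

constexpr uint32_t swizzle2_imm(uint8_t b, uint8_t a) noexcept {
  return (uint32_t(b) << 8) | a;
}

constexpr uint32_t swizzle4_imm(uint8_t d, uint8_t c, uint8_t b, uint8_t a) noexcept {
  return (uint32_t(d) << 24) | (uint32_t(c) << 16) | (uint32_t(b) << 8) | a;
}

static_assert(swizzle2_imm(1, 0) == 0x0100u, "first argument lands in the high byte");
static_assert(swizzle4_imm(3, 2, 1, 0) == 0x03020100u, "lane 0 selector occupies the lowest byte");
static_assert(swizzle4_imm(0, 1, 2, 3) == 0x00010203u, "reversed lane order");
```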
static ASMJIT_INLINE_CONSTEXPR Swizzle2 swizzle(uint8_t b, uint8_t a) noexcept { return Swizzle2{(uint32_t(b) << 8) | a}; } +//! Constructs a backend-independent 4-element vector swizzle parameter. static ASMJIT_INLINE_CONSTEXPR Swizzle4 swizzle(uint8_t d, uint8_t c, uint8_t b, uint8_t a) noexcept { return Swizzle4{(uint32_t(d) << 24) | (uint32_t(c) << 16) | (uint32_t(b) << 8) | a}; } @@ -631,6 +678,9 @@ enum class Perm2x128 : uint32_t { kZero = 8 }; +//! Constructs a backend-independent permutation of 128-bit lanes. +//! +//! \note This is currently only used by AVX2 and AVX-512 backends. static ASMJIT_INLINE_CONSTEXPR uint8_t perm_2x128_imm(Perm2x128 hi, Perm2x128 lo) noexcept { return uint8_t((uint32_t(hi) << 4) | (uint32_t(lo))); } diff --git a/src/asmjit/ujit/unicompiler.h b/src/asmjit/ujit/unicompiler.h index 34126c1..26ee3e4 100644 --- a/src/asmjit/ujit/unicompiler.h +++ b/src/asmjit/ujit/unicompiler.h @@ -17,159 +17,9 @@ ASMJIT_BEGIN_SUB_NAMESPACE(ujit) //! \addtogroup asmjit_ujit //! \{ -//! Condition represents either a condition or an assignment operation that can be checked. -class Condition { -public: - //! \name Members - //! \{ +class UniCondition; - UniOpCond op; - CondCode cond; - Operand a; - Operand b; - - //! \} - - //! \name Construction & Destruction - //! \{ - - ASMJIT_INLINE_NODEBUG Condition(UniOpCond op, CondCode cond, const Operand& a, const Operand& b) noexcept - : op(op), - cond(cond), - a(a), - b(b) {} - - ASMJIT_INLINE_NODEBUG Condition(const Condition& other) noexcept = default; - - //! \} - - //! \name Overloaded Operators - //! \{ - - ASMJIT_INLINE_NODEBUG Condition& operator=(const Condition& other) noexcept = default; - - //! \} -}; - -static ASMJIT_INLINE Condition and_z(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignAnd, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition and_z(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignAnd, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition and_z(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignAnd, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition and_nz(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignAnd, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition and_nz(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignAnd, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition and_nz(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignAnd, CondCode::kNotZero, a, b); } - -static ASMJIT_INLINE Condition or_z(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignOr, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition or_z(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignOr, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition or_z(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignOr, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition or_nz(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignOr, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition or_nz(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignOr, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition or_nz(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignOr, CondCode::kNotZero, a, b); } - -static ASMJIT_INLINE Condition xor_z(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignXor, CondCode::kZero, a, b); 
} -static ASMJIT_INLINE Condition xor_z(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignXor, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition xor_z(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignXor, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition xor_nz(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignXor, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition xor_nz(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignXor, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition xor_nz(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignXor, CondCode::kNotZero, a, b); } - -static ASMJIT_INLINE Condition add_z(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition add_z(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition add_z(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition add_nz(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition add_nz(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition add_nz(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition add_c(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kCarry, a, b); } -static ASMJIT_INLINE Condition add_c(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kCarry, a, b); } -static ASMJIT_INLINE Condition add_c(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kCarry, a, b); } -static ASMJIT_INLINE Condition add_nc(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotCarry, a, b); } -static ASMJIT_INLINE Condition add_nc(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotCarry, a, b); } -static ASMJIT_INLINE Condition add_nc(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotCarry, a, b); } -static ASMJIT_INLINE Condition add_s(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kSign, a, b); } -static ASMJIT_INLINE Condition add_s(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kSign, a, b); } -static ASMJIT_INLINE Condition add_s(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kSign, a, b); } -static ASMJIT_INLINE Condition add_ns(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotSign, a, b); } -static ASMJIT_INLINE Condition add_ns(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotSign, a, b); } -static ASMJIT_INLINE Condition add_ns(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotSign, a, b); } - -static ASMJIT_INLINE Condition sub_z(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition sub_z(const Gp& a, const Mem& b) noexcept { return 
Condition(UniOpCond::kAssignSub, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition sub_z(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition sub_nz(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition sub_nz(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition sub_nz(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition sub_c(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedLT, a, b); } -static ASMJIT_INLINE Condition sub_c(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedLT, a, b); } -static ASMJIT_INLINE Condition sub_c(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedLT, a, b); } -static ASMJIT_INLINE Condition sub_nc(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedGE, a, b); } -static ASMJIT_INLINE Condition sub_nc(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedGE, a, b); } -static ASMJIT_INLINE Condition sub_nc(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedGE, a, b); } -static ASMJIT_INLINE Condition sub_s(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kSign, a, b); } -static ASMJIT_INLINE Condition sub_s(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kSign, a, b); } -static ASMJIT_INLINE Condition sub_s(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kSign, a, b); } -static ASMJIT_INLINE Condition sub_ns(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kNotSign, a, b); } -static ASMJIT_INLINE Condition sub_ns(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kNotSign, a, b); } -static ASMJIT_INLINE Condition sub_ns(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kNotSign, a, b); } - -static ASMJIT_INLINE Condition sub_ugt(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedGT, a, b); } -static ASMJIT_INLINE Condition sub_ugt(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedGT, a, b); } -static ASMJIT_INLINE Condition sub_ugt(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedGT, a, b); } - -static ASMJIT_INLINE Condition shr_z(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignShr, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition shr_z(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignShr, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition shr_z(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignShr, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition shr_nz(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignShr, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition shr_nz(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignShr, CondCode::kNotZero, a, 
b); } -static ASMJIT_INLINE Condition shr_nz(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignShr, CondCode::kNotZero, a, b); } - -static ASMJIT_INLINE Condition cmp_eq(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kEqual, a, b); } -static ASMJIT_INLINE Condition cmp_eq(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kEqual, a, b); } -static ASMJIT_INLINE Condition cmp_eq(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kEqual, a, b); } -static ASMJIT_INLINE Condition cmp_ne(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kNotEqual, a, b); } -static ASMJIT_INLINE Condition cmp_ne(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kNotEqual, a, b); } -static ASMJIT_INLINE Condition cmp_ne(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kNotEqual, a, b); } -static ASMJIT_INLINE Condition scmp_lt(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedLT, a, b); } -static ASMJIT_INLINE Condition scmp_lt(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedLT, a, b); } -static ASMJIT_INLINE Condition scmp_lt(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedLT, a, b); } -static ASMJIT_INLINE Condition scmp_le(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedLE, a, b); } -static ASMJIT_INLINE Condition scmp_le(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedLE, a, b); } -static ASMJIT_INLINE Condition scmp_le(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedLE, a, b); } -static ASMJIT_INLINE Condition scmp_gt(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedGT, a, b); } -static ASMJIT_INLINE Condition scmp_gt(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedGT, a, b); } -static ASMJIT_INLINE Condition scmp_gt(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedGT, a, b); } -static ASMJIT_INLINE Condition scmp_ge(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedGE, a, b); } -static ASMJIT_INLINE Condition scmp_ge(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedGE, a, b); } -static ASMJIT_INLINE Condition scmp_ge(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedGE, a, b); } -static ASMJIT_INLINE Condition ucmp_lt(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedLT, a, b); } -static ASMJIT_INLINE Condition ucmp_lt(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedLT, a, b); } -static ASMJIT_INLINE Condition ucmp_lt(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedLT, a, b); } -static ASMJIT_INLINE Condition ucmp_le(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedLE, a, b); } -static ASMJIT_INLINE Condition ucmp_le(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedLE, a, b); } -static ASMJIT_INLINE Condition ucmp_le(const Gp& a, 
const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedLE, a, b); } -static ASMJIT_INLINE Condition ucmp_gt(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedGT, a, b); } -static ASMJIT_INLINE Condition ucmp_gt(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedGT, a, b); } -static ASMJIT_INLINE Condition ucmp_gt(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedGT, a, b); } -static ASMJIT_INLINE Condition ucmp_ge(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedGE, a, b); } -static ASMJIT_INLINE Condition ucmp_ge(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedGE, a, b); } -static ASMJIT_INLINE Condition ucmp_ge(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedGE, a, b); } - -static ASMJIT_INLINE Condition test_z(const Gp& a) noexcept { return Condition(UniOpCond::kCompare, CondCode::kEqual, a, Imm(0)); } -static ASMJIT_INLINE Condition test_nz(const Gp& a) noexcept { return Condition(UniOpCond::kCompare, CondCode::kNotEqual, a, Imm(0)); } - -static ASMJIT_INLINE Condition test_z(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kTest, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition test_z(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kTest, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition test_z(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kTest, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition test_nz(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kTest, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition test_nz(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kTest, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition test_nz(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kTest, CondCode::kNotZero, a, b); } - -static ASMJIT_INLINE Condition bt_z(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kBitTest, CondCode::kBTZero, a, b); } -static ASMJIT_INLINE Condition bt_z(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kBitTest, CondCode::kBTZero, a, b); } -static ASMJIT_INLINE Condition bt_z(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kBitTest, CondCode::kBTZero, a, b); } -static ASMJIT_INLINE Condition bt_nz(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kBitTest, CondCode::kBTNotZero, a, b); } -static ASMJIT_INLINE Condition bt_nz(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kBitTest, CondCode::kBTNotZero, a, b); } -static ASMJIT_INLINE Condition bt_nz(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kBitTest, CondCode::kBTNotZero, a, b); } - -//! Pipeline compiler. +//! Universal compiler. class UniCompiler { public: ASMJIT_NONCOPYABLE(UniCompiler) @@ -285,7 +135,10 @@ public: //! AsmJit compiler. BackendCompiler* cc = nullptr; - const VecConstTable& ct; + //! Reference to a table that provides global constants. + //! + //! \note This table can be extended by users so it fits a particular use-case, see \ref UniCompiler constructor. + VecConstTableRef _ct_ref; #if defined(ASMJIT_UJIT_X86) //! General purpose extension mask (X86 and X86_64 only). @@ -306,14 +159,16 @@ public: //! 
The behavior of scalar operations (mostly floating point). ScalarOpBehavior _scalar_op_behavior {}; //! The behavior of floating point min/max operation. - FMinFMaxOpBehavior _fmin_fmax_op_hehavior {}; + FMinFMaxOpBehavior _fmin_fmax_op_behavior {}; //! The behavior of floating point `madd` operation. FMAddOpBehavior _fmadd_op_behavior {}; + //! The behavior of a float-to-int conversion when the float is out of integer range, infinite, or NaN. + FloatToIntOutsideRangeBehavior _float_to_int_outside_range_behavior {}; //! Target CPU features. CpuFeatures _features {}; //! Optimization flags. - UniOptFlags _opt_flags = UniOptFlags::kNone; + CpuHints _cpu_hints {}; //! Number of available vector registers. uint32_t _vec_reg_count = 0; @@ -323,20 +178,20 @@ public: //! SIMD multiplier, derived from `_vec_width` (1, 2, 4). uint8_t _vec_multiplier = 0; //! SIMD register type (AsmJit). - asmjit::RegType _vec_reg_type = asmjit::RegType::kNone; + RegType _vec_reg_type = RegType::kNone; //! SIMD type id (AsmJit). - asmjit::TypeId _vec_type_id = asmjit::TypeId::kVoid; + TypeId _vec_type_id = TypeId::kVoid; //! Function node. - asmjit::FuncNode* _func_node = nullptr; + FuncNode* _func_node = nullptr; //! Function initialization hook. - asmjit::BaseNode* _func_init = nullptr; + BaseNode* _func_init = nullptr; //! Function end hook (to add 'unlikely' branches). - asmjit::BaseNode* _func_end = nullptr; + BaseNode* _func_end = nullptr; //! Invalid GP register. Gp _gp_none; - //! Temporary stack used to transfer SIMD regs to GP/MM. + //! Temporary stack used to transfer SIMD regs to GP. Mem _tmp_stack[size_t(StackId::kMaxValue) + 1]; //! Offset to the first constant to the `commonTable` global. @@ -359,15 +214,22 @@ public: uint32_t virt_reg_id; }; - asmjit::ArenaVector _vec_consts; - asmjit::ArenaVector _vec_consts_ex; + ArenaVector _vec_consts; + ArenaVector _vec_consts_ex; //! \} //! \name Construction & Destruction //! \{ - ASMJIT_API UniCompiler(BackendCompiler* cc, const CpuFeatures& features, UniOptFlags opt_flags) noexcept; + //! Creates `UniCompiler` that would use the existing BackendCompiler (it would keep the pointer to it). + ASMJIT_API UniCompiler(BackendCompiler* cc, const CpuFeatures& features, CpuHints cpu_hints, VecConstTableRef ct_ref) noexcept; + + //! Creates `UniCompiler` that would use the existing BackendCompiler (it would keep the pointer to it). + ASMJIT_INLINE UniCompiler(BackendCompiler* cc, const CpuFeatures& features, CpuHints cpu_hints) noexcept + : UniCompiler(cc, features, cpu_hints, VecConstTableRef{vec_const_table, sizeof(VecConstTable)}) {} + + //! Destroys `UniCompiler` - the existing BackendCompiler would be untouched. ASMJIT_API ~UniCompiler() noexcept; //! \} @@ -375,22 +237,39 @@ public: //! \name Allocators //! \{ - ASMJIT_INLINE_NODEBUG asmjit::Arena& arena() noexcept { return cc->_builder_arena; } + //! Returns the arena used by `UniCompiler`. + ASMJIT_INLINE_NODEBUG Arena& arena() noexcept { return cc->_builder_arena; } + + //! \} + + //! \name Constant Table + //! \{ + + template + ASMJIT_INLINE_NODEBUG const T& ct() const noexcept { return static_cast(_ct_ref.table); } + + template + ASMJIT_INLINE_NODEBUG const T* ct_ptr() const noexcept { return static_cast(&_ct_ref.table); } + + ASMJIT_INLINE_NODEBUG size_t ct_size() const noexcept { return _ct_ref.size; } //! \} //! \name CPU Architecture, Features and Optimization Options //! 
\{ - ASMJIT_API void _init_extensions(const asmjit::CpuFeatures& features) noexcept; + ASMJIT_API void _init_extensions(const CpuFeatures& features) noexcept; ASMJIT_INLINE_NODEBUG bool is_32bit() const noexcept { return cc->is_32bit(); } ASMJIT_INLINE_NODEBUG bool is_64bit() const noexcept { return cc->is_64bit(); } ASMJIT_INLINE_NODEBUG uint32_t register_size() const noexcept { return cc->register_size(); } #if defined(ASMJIT_UJIT_X86) + //! Tests whether a general purpose extension `ext` is available. ASMJIT_INLINE_NODEBUG bool has_gp_ext(GPExt ext) const noexcept { return (_gp_ext_mask & (1u << uint32_t(ext))) != 0; } + //! Tests whether an SSE extension `ext` is available. ASMJIT_INLINE_NODEBUG bool has_sse_ext(SSEExt ext) const noexcept { return (_sse_ext_mask & (1u << uint32_t(ext))) != 0; } + //! Tests whether an AVX or AVX-512 extension `ext` is available. ASMJIT_INLINE_NODEBUG bool has_avx_ext(AVXExt ext) const noexcept { return (_avx_ext_mask & (uint64_t(1) << uint32_t(ext))) != 0; } //! Tests whether ADX extension is available. @@ -468,7 +347,9 @@ public: #endif // ASMJIT_UJIT_X86 #if defined(ASMJIT_UJIT_AARCH64) + //! Tests whether a general purpose extension `ext` is available. ASMJIT_INLINE_NODEBUG bool has_gp_ext(GPExt ext) const noexcept { return (_gp_ext_mask & (uint64_t(1) << uint32_t(ext))) != 0; } + //! Tests whether an ASIMD extension `ext` is available. ASMJIT_INLINE_NODEBUG bool has_asimd_ext(ASIMDExt ext) const noexcept { return (_asimd_ext_mask & (uint64_t(1) << uint32_t(ext))) != 0; } //! Tests whether CSSC extension is available. @@ -535,9 +416,12 @@ public: //! Returns the behavior of scalar operations (mostly floating point). ASMJIT_INLINE_NODEBUG ScalarOpBehavior scalar_op_behavior() const noexcept { return _scalar_op_behavior; } //! Returns the behavior of floating point min/max operations. - ASMJIT_INLINE_NODEBUG FMinFMaxOpBehavior fmin_fmax_op_hehavior() const noexcept { return _fmin_fmax_op_hehavior; } + ASMJIT_INLINE_NODEBUG FMinFMaxOpBehavior fmin_fmax_op_behavior() const noexcept { return _fmin_fmax_op_behavior; } //! Returns the behavior of floating point mul+add (`madd`) operations. ASMJIT_INLINE_NODEBUG FMAddOpBehavior fmadd_op_behavior() const noexcept { return _fmadd_op_behavior; } + //! Returns the behavior of float-to-integer conversion when the floating point is outside of the integer representable + //! range, infinite, or NaN. + ASMJIT_INLINE_NODEBUG FloatToIntOutsideRangeBehavior float_to_int_outside_range_behavior() const noexcept { return _float_to_int_outside_range_behavior; } //! Tests whether a scalar operation is zeroing the rest of the destination register (AArch64). ASMJIT_INLINE_NODEBUG bool is_scalar_op_zeroing() const noexcept { return _scalar_op_behavior == ScalarOpBehavior::kZeroing; } @@ -545,9 +429,9 @@ public: ASMJIT_INLINE_NODEBUG bool is_scalar_op_preserving_vec128() const noexcept { return _scalar_op_behavior == ScalarOpBehavior::kPreservingVec128; } //! Tests whether a floating point min/max operation selects a finite value if one of the values is NaN (AArch64). - ASMJIT_INLINE_NODEBUG bool is_fmin_fmax_finite() const noexcept { return _fmin_fmax_op_hehavior == FMinFMaxOpBehavior::kFiniteValue; } + ASMJIT_INLINE_NODEBUG bool is_fmin_fmax_finite() const noexcept { return _fmin_fmax_op_behavior == FMinFMaxOpBehavior::kFiniteValue; } //! Tests whether a floating point min/max operation works as a ternary if - `if a <|> b ? a : b` (X86|X86_64). 
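To show how the reworked constructor is typically driven, here is a hypothetical end-to-end wiring through `JitRuntime`; the function name, the omitted body, and the error handling are illustrative, while the constructor signature, `cpu_features()`, and `cpu_hints()` come from this patch. It assumes UJIT is enabled for the target (`ASMJIT_NO_UJIT` not defined).

```cpp
// Sketch: passing both CpuFeatures and CpuHints from the runtime into UniCompiler.
#include <asmjit/core.h>
#include <asmjit/ujit.h>

using namespace asmjit;

static Error build_kernel(JitRuntime& rt) {
  CodeHolder code;
  ASMJIT_PROPAGATE(code.init(rt.environment(), rt.cpu_features()));

  ujit::BackendCompiler cc(&code);

  // The runtime now carries the hints detected for the host CPU next to its features.
  ujit::UniCompiler uc(&cc, rt.cpu_features(), rt.cpu_hints());

  // ... build the function through `uc` (init_function(), emit_*(), etc.) ...

  return cc.finalize();
}
```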
- ASMJIT_INLINE_NODEBUG bool is_fmin_fmax_ternary() const noexcept { return _fmin_fmax_op_hehavior == FMinFMaxOpBehavior::kTernaryLogic; } + ASMJIT_INLINE_NODEBUG bool is_fmin_fmax_ternary() const noexcept { return _fmin_fmax_op_behavior == FMinFMaxOpBehavior::kTernaryLogic; } //! Tests whether a floating point mul+add operation is fused (uses FMA). ASMJIT_INLINE_NODEBUG bool is_fmadd_fused() const noexcept { return _fmadd_op_behavior != FMAddOpBehavior::kNoFMA; } @@ -556,8 +440,10 @@ public: //! Tests whether a FMA operation is available and that it only stores the result to accumulator register. ASMJIT_INLINE_NODEBUG bool is_fma_storing_to_any_accumulator() const noexcept { return _fmadd_op_behavior == FMAddOpBehavior::kFMAStoreToAccumulator; } - ASMJIT_INLINE_NODEBUG UniOptFlags opt_flags() const noexcept { return _opt_flags; } - ASMJIT_INLINE_NODEBUG bool has_opt_flag(UniOptFlags flag) const noexcept { return Support::test(_opt_flags, flag); } + //! Returns CPU hints. + ASMJIT_INLINE_NODEBUG CpuHints cpu_hints() const noexcept { return _cpu_hints; } + //! Tests whether a CPU hint `hint` is enabled. + ASMJIT_INLINE_NODEBUG bool has_cpu_hint(CpuHints hint) const noexcept { return Support::test(_cpu_hints, hint); } //! Returns a native register signature, either 32-bit or 64-bit depending on the target architecture). ASMJIT_INLINE_NODEBUG OperandSignature gp_signature() const noexcept { return cc->gp_signature(); } @@ -599,7 +485,7 @@ public: //! \name Function //! \{ - ASMJIT_API void init_function(asmjit::FuncNode* func_node) noexcept; + ASMJIT_API void init_function(FuncNode* func_node) noexcept; //! \} @@ -607,13 +493,15 @@ public: //! \{ ASMJIT_INLINE void rename(const OpArray& op_array, const char* name) noexcept { - for (uint32_t i = 0; i < op_array.size(); i++) - cc->rename(op_array[i].as(), "%s%u", name, unsigned(i)); + for (uint32_t i = 0; i < op_array.size(); i++) { + cc->rename(op_array[i].as(), "%s%u", name, unsigned(i)); + } } ASMJIT_INLINE void rename(const OpArray& op_array, const char* prefix, const char* name) noexcept { - for (uint32_t i = 0; i < op_array.size(); i++) - cc->rename(op_array[i].as(), "%s%s%u", prefix, name, unsigned(i)); + for (uint32_t i = 0; i < op_array.size(); i++) { + cc->rename(op_array[i].as(), "%s%s%u", prefix, name, unsigned(i)); + } } //! \} @@ -631,84 +519,163 @@ public: //! \name Virtual Registers & Memory (Target Independent) //! \{ - ASMJIT_INLINE Gp new_gp32() noexcept { return cc->new_gp32(); } - ASMJIT_INLINE Gp new_gp64() noexcept { return cc->new_gp64(); } - ASMJIT_INLINE Gp new_gpz() noexcept { return cc->new_gpz(); } - - template - ASMJIT_INLINE Gp new_gp32(const char* name, Args&&... args) noexcept { return cc->new_gp32(name, std::forward(args)...); } - template - ASMJIT_INLINE Gp new_gp64(const char* name, Args&&... args) noexcept { return cc->new_gp64(name, std::forward(args)...); } - template - ASMJIT_INLINE Gp new_gpz(const char* name, Args&&... args) noexcept { return cc->new_gpz(name, std::forward(args)...); } - - template - ASMJIT_INLINE RegT new_similar_reg(const RegT& ref) noexcept { return cc->new_similar_reg(ref); } + //! Wraps `BackendCompiler::new_reg(type_id, args...)`. template - ASMJIT_INLINE RegT new_similar_reg(const RegT& ref, Args&&... args) noexcept { return cc->new_similar_reg(ref, std::forward(args)...); } + ASMJIT_INLINE RegT new_reg(TypeId type_id, Args&&... args) noexcept { + return cc->new_reg(type_id, std::forward(args)...); + } + //! Wraps `BackendCompiler::new_similar_reg(ref, args...)`. 
+ template + ASMJIT_INLINE RegT new_similar_reg(const RegT& ref, Args&&... args) noexcept { + return cc->new_similar_reg(ref, std::forward(args)...); + } + + //! Wraps `BackendCompiler::new_gp32(args...)`. template - ASMJIT_INLINE Vec new_vec(const char* name, Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), _vec_type_id, name, std::forward(args)...); - return reg; + ASMJIT_INLINE Gp new_gp32(Args&&... args) noexcept { + return cc->new_gp32(std::forward(args)...); + } + + //! Wraps `BackendCompiler::new_gp64(args...)`. + template + ASMJIT_INLINE Gp new_gp64(Args&&... args) noexcept { + return cc->new_gp64(std::forward(args)...); + } + + //! Wraps `BackendCompiler::new_gpz(args...)`. + template + ASMJIT_INLINE Gp new_gpz(Args&&... args) noexcept { + return cc->new_gpz(std::forward(args)...); + } + + //! Wraps `BackendCompiler::new_gpz(args...)`. + template + ASMJIT_INLINE Gp new_gp_ptr(Args&&... args) noexcept { + return cc->new_gp_ptr(std::forward(args)...); } template - ASMJIT_INLINE Vec new_vec(VecWidth vw, const char* name, Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), VecWidthUtils::type_id_of(vw), name, std::forward(args)...); - return reg; + ASMJIT_INLINE Vec new_vec(Args&&... args) noexcept { + return cc->new_vec(_vec_type_id, std::forward(args)...); } - ASMJIT_NOINLINE void new_reg_array(OpArray& dst, uint32_t n, asmjit::TypeId type_id, const char* name) noexcept { + template + ASMJIT_INLINE Vec new_vec_with_width(VecWidth vw, Args&&... args) noexcept { + return cc->new_reg(VecWidthUtils::type_id_of(vw), std::forward(args)...); + } + + template + ASMJIT_INLINE Vec new_vec128(Args&&... args) noexcept { + return cc->new_vec128(std::forward(args)...); + } + + template + ASMJIT_INLINE Vec new_vec128_f32x1(Args&&... args) noexcept { + return cc->new_vec128_f32x1(std::forward(args)...); + } + + template + ASMJIT_INLINE Vec new_vec128_f64x1(Args&&... args) noexcept { + return cc->new_vec128_f64x1(std::forward(args)...); + } + + template + ASMJIT_INLINE Vec new_vec128_f32x4(Args&&... args) noexcept { + return cc->new_vec128_f32x4(std::forward(args)...); + } + + template + ASMJIT_INLINE Vec new_vec128_f64x2(Args&&... args) noexcept { + return cc->new_vec128_f64x2(std::forward(args)...); + } + +#if defined(ASMJIT_UJIT_X86) + template + ASMJIT_INLINE Vec new_vec256(Args&&... args) noexcept { + return cc->new_vec256(std::forward(args)...); + } + + template + ASMJIT_INLINE Vec new_vec512(Args&&... 
args) noexcept { + return cc->new_vec512(std::forward(args)...); + } +#endif // ASMJIT_UJIT_X86 + + ASMJIT_NOINLINE void new_reg_array(OpArray& dst, size_t n, TypeId type_id, const char* name) noexcept { ASMJIT_ASSERT(n <= OpArray::kMaxSize); dst._size = n; - for (uint32_t i = 0; i < n; i++) { - cc->_new_reg_fmt(Out(dst[i].as()), type_id, "%s%u", name, i); + for (size_t i = 0; i < n; i++) { + cc->_new_reg(Out(dst[i].as()), type_id, "%s%u", name, i); } } - ASMJIT_NOINLINE void new_reg_array(OpArray& dst, uint32_t n, asmjit::TypeId type_id, const char* prefix, const char* name) noexcept { + ASMJIT_NOINLINE void new_reg_array(OpArray& dst, size_t n, TypeId type_id, const char* prefix, const char* name) noexcept { ASMJIT_ASSERT(n <= OpArray::kMaxSize); dst._size = n; - for (uint32_t i = 0; i < n; i++) { - cc->_new_reg_fmt(Out(dst[i].as()), type_id, "%s%s%u", prefix, name, i); + for (size_t i = 0; i < n; i++) { + cc->_new_reg(Out(dst[i].as()), type_id, "%s%s%u", prefix, name, i); } } - ASMJIT_NOINLINE void new_reg_array(OpArray& dst, uint32_t n, const asmjit::Reg& ref, const char* name) noexcept { + ASMJIT_NOINLINE void new_reg_array(OpArray& dst, size_t n, const Reg& ref, const char* name) noexcept { ASMJIT_ASSERT(n <= OpArray::kMaxSize); dst._size = n; - for (uint32_t i = 0; i < n; i++) { - cc->_new_reg_fmt(Out(dst[i].as()), ref, "%s%u", name, i); + for (size_t i = 0; i < n; i++) { + cc->_new_reg(Out(dst[i].as()), ref, "%s%u", name, i); } } - ASMJIT_NOINLINE void new_reg_array(OpArray& dst, uint32_t n, const asmjit::Reg& ref, const char* prefix, const char* name) noexcept { + ASMJIT_NOINLINE void new_reg_array(OpArray& dst, size_t n, const Reg& ref, const char* prefix, const char* name) noexcept { ASMJIT_ASSERT(n <= OpArray::kMaxSize); dst._size = n; - for (uint32_t i = 0; i < n; i++) { - cc->_new_reg_fmt(Out(dst[i].as()), ref, "%s%s%u", prefix, name, i); + for (size_t i = 0; i < n; i++) { + cc->_new_reg(Out(dst[i].as()), ref, "%s%s%u", prefix, name, i); } } - ASMJIT_INLINE void new_vec_array(OpArray& dst, uint32_t n, VecWidth vw, const char* name) noexcept { + ASMJIT_INLINE void new_vec_array(OpArray& dst, size_t n, VecWidth vw, const char* name) noexcept { new_reg_array(dst, n, VecWidthUtils::type_id_of(vw), name); } - ASMJIT_INLINE void new_vec_array(OpArray& dst, uint32_t n, VecWidth vw, const char* prefix, const char* name) noexcept { + ASMJIT_INLINE void new_vec_array(OpArray& dst, size_t n, VecWidth vw, const char* prefix, const char* name) noexcept { new_reg_array(dst, n, VecWidthUtils::type_id_of(vw), prefix, name); } - ASMJIT_INLINE void new_vec_array(OpArray& dst, uint32_t n, const Vec& ref, const char* name) noexcept { + ASMJIT_INLINE void new_vec_array(OpArray& dst, size_t n, const Vec& ref, const char* name) noexcept { new_reg_array(dst, n, ref, name); } - ASMJIT_INLINE void new_vec_array(OpArray& dst, uint32_t n, const Vec& ref, const char* prefix, const char* name) noexcept { + ASMJIT_INLINE void new_vec_array(OpArray& dst, size_t n, const Vec& ref, const char* prefix, const char* name) noexcept { new_reg_array(dst, n, ref, prefix, name); } + ASMJIT_INLINE void new_vec128_array(OpArray& dst, size_t n, const char* name) noexcept { + new_reg_array(dst, n, TypeId::kInt32x4, name); + } + + ASMJIT_INLINE void new_vec128_array(OpArray& dst, size_t n, const char* prefix, const char* name) noexcept { + new_reg_array(dst, n, TypeId::kInt32x4, prefix, name); + } + +#if defined(ASMJIT_UJIT_X86) + ASMJIT_INLINE void new_vec256_array(OpArray& dst, size_t n, const char* name) noexcept { + 
new_reg_array(dst, n, TypeId::kInt32x8, name); + } + + ASMJIT_INLINE void new_vec256_array(OpArray& dst, size_t n, const char* prefix, const char* name) noexcept { + new_reg_array(dst, n, TypeId::kInt32x8, prefix, name); + } + + ASMJIT_INLINE void new_vec512_array(OpArray& dst, size_t n, const char* name) noexcept { + new_reg_array(dst, n, TypeId::kInt32x16, name); + } + + ASMJIT_INLINE void new_vec512_array(OpArray& dst, size_t n, const char* prefix, const char* name) noexcept { + new_reg_array(dst, n, TypeId::kInt32x16, prefix, name); + } +#endif // ASMJIT_UJIT_X86 + ASMJIT_API Mem tmp_stack(StackId id, uint32_t size) noexcept; //! \} @@ -722,165 +689,6 @@ public: ASMJIT_API void _init_vec_const_table_ptr() noexcept; - //! \name Virtual Registers - //! \{ - -#if defined(ASMJIT_UJIT_X86) - - ASMJIT_INLINE Vec new_vec128() noexcept { - Vec reg; - cc->_new_reg(Out(reg), asmjit::TypeId::kInt32x4); - return reg; - } - - ASMJIT_INLINE Vec new_vec128_1xf32() noexcept { - Vec reg; - cc->_new_reg(Out(reg), asmjit::TypeId::kFloat32x1); - return reg; - } - - ASMJIT_INLINE Vec new_vec128_1xf64() noexcept { - Vec reg; - cc->_new_reg(Out(reg), asmjit::TypeId::kFloat64x1); - return reg; - } - - ASMJIT_INLINE Vec new_vec128_4xf32() noexcept { - Vec reg; - cc->_new_reg(Out(reg), asmjit::TypeId::kFloat32x4); - return reg; - } - - ASMJIT_INLINE Vec new_vec128_2xf64() noexcept { - Vec reg; - cc->_new_reg(Out(reg), asmjit::TypeId::kFloat64x2); - return reg; - } - - template - ASMJIT_INLINE Vec new_vec128(Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kInt32x4, std::forward(args)...); - return reg; - } - - template - ASMJIT_INLINE Vec new_vec128_1xf32(Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kFloat32x1, std::forward(args)...); - return reg; - } - - template - ASMJIT_INLINE Vec new_vec128_1xf64(Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kFloat64x1, std::forward(args)...); - return reg; - } - - template - ASMJIT_INLINE Vec new_vec128_4xf32(Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kFloat32x4, std::forward(args)...); - return reg; - } - - template - ASMJIT_INLINE Vec new_vec128_2xf64(Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kFloat64x2, std::forward(args)...); - return reg; - } - - ASMJIT_INLINE void new_vec128_array(OpArray& dst, uint32_t n, const char* name) noexcept { - new_reg_array(dst, n, asmjit::TypeId::kInt32x4, name); - } - - ASMJIT_INLINE void new_vec128_array(OpArray& dst, uint32_t n, const char* prefix, const char* name) noexcept { - new_reg_array(dst, n, asmjit::TypeId::kInt32x4, prefix, name); - } - - template - ASMJIT_INLINE Vec new_vec256(const char* name, Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kInt32x8, name, std::forward(args)...); - return reg; - } - - ASMJIT_INLINE void new_vec256_array(OpArray& dst, uint32_t n, const char* name) noexcept { - new_reg_array(dst, n, asmjit::TypeId::kInt32x8, name); - } - - ASMJIT_INLINE void new_vec256_array(OpArray& dst, uint32_t n, const char* prefix, const char* name) noexcept { - new_reg_array(dst, n, asmjit::TypeId::kInt32x8, prefix, name); - } - - template - ASMJIT_INLINE Vec new_vec512(const char* name, Args&&... 
args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kInt32x16, name, std::forward(args)...); - return reg; - } - - ASMJIT_INLINE void new_vec512_array(OpArray& dst, uint32_t n, const char* name) noexcept { - new_reg_array(dst, n, asmjit::TypeId::kInt32x16, name); - } - - ASMJIT_INLINE void new_vec512_array(OpArray& dst, uint32_t n, const char* prefix, const char* name) noexcept { - new_reg_array(dst, n, asmjit::TypeId::kInt32x16, prefix, name); - } - -#endif // ASMJIT_UJIT_X86 - -#if defined(ASMJIT_UJIT_AARCH64) - - template - ASMJIT_INLINE Vec new_vec128(const char* name, Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kInt32x4, name, std::forward(args)...); - return reg; - } - - template - ASMJIT_INLINE Vec new_vec128_1xf32(const char* name, Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kFloat32x1, name, std::forward(args)...); - return reg.v128(); - } - - template - ASMJIT_INLINE Vec new_vec128_1xf64(const char* name, Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kFloat64x1, name, std::forward(args)...); - return reg.v128(); - } - - template - ASMJIT_INLINE Vec new_vec128_4xf32(const char* name, Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kFloat32x4, name, std::forward(args)...); - return reg; - } - - template - ASMJIT_INLINE Vec new_vec128_2xf64(const char* name, Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kFloat64x2, name, std::forward(args)...); - return reg; - } - - ASMJIT_INLINE void new_vec128_array(OpArray& dst, uint32_t n, const char* name) noexcept { - new_reg_array(dst, n, asmjit::TypeId::kInt32x4, name); - } - - ASMJIT_INLINE void new_vec128_array(OpArray& dst, uint32_t n, const char* prefix, const char* name) noexcept { - new_reg_array(dst, n, asmjit::TypeId::kInt32x4, prefix, name); - } - -#endif - - //! \} - //! \name Constants (X86|X86_64) //! \{ @@ -901,14 +709,14 @@ public: ASMJIT_API Mem simd_mem_const(const void* c, Bcst bcst_width, const VecArray& similar_to) noexcept; ASMJIT_API Mem _get_mem_const(const void* c) noexcept; - ASMJIT_API Vec _new_vecConst(const void* c, bool is_unique_const) noexcept; + ASMJIT_API Vec _new_vec_const(const void* c, bool is_unique_const) noexcept; #if defined(ASMJIT_UJIT_AARCH64) ASMJIT_API Vec simd_const_16b(const void* data16) noexcept; #endif // ASMJIT_UJIT_AARCH64 #if defined(ASMJIT_UJIT_AARCH64) - inline Vec simd_vec_zero(const Vec& similar_to) noexcept { return simd_vec_const(&ct.p_0000000000000000, Bcst::k32, similar_to); } + inline Vec simd_vec_zero(const Vec& similar_to) noexcept { return simd_vec_const(&ct().p_0000000000000000, Bcst::k32, similar_to); } #endif // ASMJIT_UJIT_AARCH64 //! 
\} @@ -920,12 +728,12 @@ public: ASMJIT_API void emit_m(UniOpM op, const Mem& m) noexcept; ASMJIT_API void emit_rm(UniOpRM op, const Gp& dst, const Mem& src) noexcept; ASMJIT_API void emit_mr(UniOpMR op, const Mem& dst, const Gp& src) noexcept; - ASMJIT_API void emit_cmov(const Gp& dst, const Operand_& sel, const Condition& condition) noexcept; - ASMJIT_API void emit_select(const Gp& dst, const Operand_& sel1_, const Operand_& sel2_, const Condition& condition) noexcept; + ASMJIT_API void emit_cmov(const Gp& dst, const Operand_& sel, const UniCondition& condition) noexcept; + ASMJIT_API void emit_select(const Gp& dst, const Operand_& sel1_, const Operand_& sel2_, const UniCondition& condition) noexcept; ASMJIT_API void emit_2i(UniOpRR op, const Gp& dst, const Operand_& src_) noexcept; ASMJIT_API void emit_3i(UniOpRRR op, const Gp& dst, const Operand_& src1_, const Operand_& src2_) noexcept; ASMJIT_API void emit_j(const Operand_& target) noexcept; - ASMJIT_API void emit_j_if(const Label& target, const Condition& condition) noexcept; + ASMJIT_API void emit_j_if(const Label& target, const UniCondition& condition) noexcept; ASMJIT_INLINE void mov(const Gp& dst, const Gp& src) noexcept { return emit_mov(dst, src); } ASMJIT_INLINE void mov(const Gp& dst, const Imm& src) noexcept { return emit_mov(dst, src); } @@ -963,11 +771,11 @@ public: ASMJIT_INLINE void mem_add_u32(const Mem& dst, const Gp& src) noexcept { return emit_mr(UniOpMR::kAddU32, dst, src); } ASMJIT_INLINE void mem_add_u64(const Mem& dst, const Gp& src) noexcept { return emit_mr(UniOpMR::kAddU64, dst, src); } - ASMJIT_INLINE void cmov(const Gp& dst, const Gp& sel, const Condition& condition) noexcept { emit_cmov(dst, sel, condition); } - ASMJIT_INLINE void cmov(const Gp& dst, const Mem& sel, const Condition& condition) noexcept { emit_cmov(dst, sel, condition); } + ASMJIT_INLINE void cmov(const Gp& dst, const Gp& sel, const UniCondition& condition) noexcept { emit_cmov(dst, sel, condition); } + ASMJIT_INLINE void cmov(const Gp& dst, const Mem& sel, const UniCondition& condition) noexcept { emit_cmov(dst, sel, condition); } template - ASMJIT_INLINE void select(const Gp& dst, const Sel1& sel1, const Sel2& sel2, const Condition& condition) noexcept { emit_select(dst, sel1, sel2, condition); } + ASMJIT_INLINE void select(const Gp& dst, const Sel1& sel1, const Sel2& sel2, const UniCondition& condition) noexcept { emit_select(dst, sel1, sel2, condition); } ASMJIT_INLINE void abs(const Gp& dst, const Gp& src) noexcept { emit_2i(UniOpRR::kAbs, dst, src); } ASMJIT_INLINE void abs(const Gp& dst, const Mem& src) noexcept { emit_2i(UniOpRR::kAbs, dst, src); } @@ -1101,7 +909,7 @@ public: ASMJIT_INLINE void j(const Gp& target) noexcept { emit_j(target); } ASMJIT_INLINE void j(const Label& target) noexcept { emit_j(target); } - ASMJIT_INLINE void j(const Label& target, const Condition& condition) noexcept { emit_j_if(target, condition); } + ASMJIT_INLINE void j(const Label& target, const UniCondition& condition) noexcept { emit_j_if(target, condition); } ASMJIT_API void adds_u8(const Gp& dst, const Gp& src1, const Gp& src2) noexcept; @@ -1283,8 +1091,12 @@ public: DEFINE_OP_2V(v_cvt_i32_hi_to_i64, UniOpVV::kCvtI32HiToI64) DEFINE_OP_2V(v_cvt_u32_lo_to_u64, UniOpVV::kCvtU32LoToU64) DEFINE_OP_2V(v_cvt_u32_hi_to_u64, UniOpVV::kCvtU32HiToU64) + DEFINE_OP_2V(s_abs_f32, UniOpVV::kAbsF32S) + DEFINE_OP_2V(s_abs_f64, UniOpVV::kAbsF64S) DEFINE_OP_2V(v_abs_f32, UniOpVV::kAbsF32) DEFINE_OP_2V(v_abs_f64, UniOpVV::kAbsF64) + DEFINE_OP_2V(s_neg_f32, 
UniOpVV::kNegF32S) + DEFINE_OP_2V(s_neg_f64, UniOpVV::kNegF64S) DEFINE_OP_2V(v_neg_f32, UniOpVV::kNegF32) DEFINE_OP_2V(v_neg_f64, UniOpVV::kNegF64) DEFINE_OP_2V(v_not_f32, UniOpVV::kNotF32) @@ -1301,10 +1113,18 @@ public: DEFINE_OP_2V(s_ceil_f64, UniOpVV::kCeilF64S) DEFINE_OP_2V(v_ceil_f32, UniOpVV::kCeilF32) DEFINE_OP_2V(v_ceil_f64, UniOpVV::kCeilF64) - DEFINE_OP_2V(s_round_f32, UniOpVV::kRoundF32S) - DEFINE_OP_2V(s_round_f64, UniOpVV::kRoundF64S) - DEFINE_OP_2V(v_round_f32, UniOpVV::kRoundF32) - DEFINE_OP_2V(v_round_f64, UniOpVV::kRoundF64) + DEFINE_OP_2V(s_round_even_f32, UniOpVV::kRoundEvenF32S) + DEFINE_OP_2V(s_round_even_f64, UniOpVV::kRoundEvenF64S) + DEFINE_OP_2V(v_round_even_f32, UniOpVV::kRoundEvenF32) + DEFINE_OP_2V(v_round_even_f64, UniOpVV::kRoundEvenF64) + DEFINE_OP_2V(s_round_half_away_f32, UniOpVV::kRoundHalfAwayF32S) + DEFINE_OP_2V(s_round_half_away_f64, UniOpVV::kRoundHalfAwayF64S) + DEFINE_OP_2V(v_round_half_away_f32, UniOpVV::kRoundHalfAwayF32) + DEFINE_OP_2V(v_round_half_away_f64, UniOpVV::kRoundHalfAwayF64) + DEFINE_OP_2V(s_round_half_up_f32, UniOpVV::kRoundHalfUpF32S) + DEFINE_OP_2V(s_round_half_up_f64, UniOpVV::kRoundHalfUpF64S) + DEFINE_OP_2V(v_round_half_up_f32, UniOpVV::kRoundHalfUpF32) + DEFINE_OP_2V(v_round_half_up_f64, UniOpVV::kRoundHalfUpF64) DEFINE_OP_2V(v_rcp_f32, UniOpVV::kRcpF32) DEFINE_OP_2V(v_rcp_f64, UniOpVV::kRcpF64) DEFINE_OP_2V(s_sqrt_f32, UniOpVV::kSqrtF32S) @@ -1743,6 +1563,10 @@ public: DEFINE_OP_3V(s_div_f64, UniOpVVV::kDivF64S) DEFINE_OP_3V(v_div_f32, UniOpVVV::kDivF32) DEFINE_OP_3V(v_div_f64, UniOpVVV::kDivF64) + DEFINE_OP_3V(s_mod_f32, UniOpVVV::kModF32S) + DEFINE_OP_3V(s_mod_f64, UniOpVVV::kModF64S) + DEFINE_OP_3V(v_mod_f32, UniOpVVV::kModF32) + DEFINE_OP_3V(v_mod_f64, UniOpVVV::kModF64) DEFINE_OP_3V(s_min_f32, UniOpVVV::kMinF32S) DEFINE_OP_3V(s_min_f64, UniOpVVV::kMinF64S) DEFINE_OP_3V(v_min_f32, UniOpVVV::kMinF32) @@ -1856,7 +1680,6 @@ public: DEFINE_OP_3VI(v_insert_v256_u64, UniOpVVVI::kInsertV256_U64) DEFINE_OP_3VI(v_insert_v256_f64, UniOpVVVI::kInsertV256_F64) - DEFINE_OP_4V(v_blendv_u8, UniOpVVVV::kBlendV_U8) DEFINE_OP_4V(v_madd_i16, UniOpVVVV::kMAddU16) DEFINE_OP_4V(v_madd_u16, UniOpVVVV::kMAddU16) @@ -1944,7 +1767,7 @@ public: //! \name Memory Loads & Stores with Parameterized Size //! \{ - ASMJIT_NOINLINE void v_load_iany(const Vec& dst, const Mem& src, uint32_t n_bytes, Alignment alignment) noexcept { + ASMJIT_NOINLINE void v_load_iany(const Vec& dst, const Mem& src, size_t n_bytes, Alignment alignment) noexcept { switch (n_bytes) { case 1: v_load8(dst, src); break; case 2: v_loada16(dst, src, alignment); break; @@ -1959,7 +1782,7 @@ public: } } - ASMJIT_NOINLINE void v_store_iany(const Mem& dst, const Vec& src, uint32_t n_bytes, Alignment alignment) noexcept { + ASMJIT_NOINLINE void v_store_iany(const Mem& dst, const Vec& src, size_t n_bytes, Alignment alignment) noexcept { switch (n_bytes) { case 1: v_store8(dst, src); break; case 2: v_storea16(dst, src, alignment); break; @@ -2005,27 +1828,6 @@ public: #endif } - // d = int(floor(a / b) * b). - template - ASMJIT_NOINLINE void v_mod_pd(const Vec& d, const Vec& a, const VecOrMem& b) noexcept { -#if defined(ASMJIT_UJIT_X86) - if (!has_sse4_1()) { - Vec t = new_vec128("vModTmp"); - - v_div_f64(d, a, b); - v_cvt_trunc_f64_to_i32_lo(t, d); - v_cvt_i32_lo_to_f64(d, t); - v_mul_f64(d, d, b); - } - else -#endif // ASMJIT_UJIT_X86 - { - v_div_f64(d, a, b); - v_trunc_f64(d, d); - v_mul_f64(d, d, b); - } - } - //! 
\} }; diff --git a/src/asmjit/ujit/unicompiler_a64.cpp b/src/asmjit/ujit/unicompiler_a64.cpp index b68b3de..292ef0d 100644 --- a/src/asmjit/ujit/unicompiler_a64.cpp +++ b/src/asmjit/ujit/unicompiler_a64.cpp @@ -9,6 +9,8 @@ #if defined(ASMJIT_UJIT_AARCH64) #include "unicompiler.h" +#include "unicompiler_utils_p.h" +#include "unicondition.h" ASMJIT_BEGIN_SUB_NAMESPACE(ujit) @@ -19,17 +21,19 @@ namespace Inst { using namespace a64::Inst; } // ujit::UniCompiler - Construction & Destruction // ============================================== -UniCompiler::UniCompiler(BackendCompiler* cc, const CpuFeatures& features, UniOptFlags opt_flags) noexcept +UniCompiler::UniCompiler(BackendCompiler* cc, const CpuFeatures& features, CpuHints cpu_hints, VecConstTableRef ct_ref) noexcept : cc(cc), - ct(vec_const_table), + _ct_ref(ct_ref), _features(features), - _opt_flags(opt_flags), + _cpu_hints(cpu_hints), _vec_reg_count(32), _common_table_offset(0) { _scalar_op_behavior = ScalarOpBehavior::kZeroing; - _fmin_fmax_op_hehavior = FMinFMaxOpBehavior::kFiniteValue; + _fmin_fmax_op_behavior = FMinFMaxOpBehavior::kFiniteValue; _fmadd_op_behavior = FMAddOpBehavior::kFMAStoreToAccumulator; + _float_to_int_outside_range_behavior = FloatToIntOutsideRangeBehavior::kSaturatedValue; + _init_extensions(features); } @@ -38,7 +42,7 @@ UniCompiler::~UniCompiler() noexcept {} // ujit::UniCompiler - CPU Architecture, Features and Optimization Options // ======================================================================= -void UniCompiler::_init_extensions(const asmjit::CpuFeatures& features) noexcept { +void UniCompiler::_init_extensions(const CpuFeatures& features) noexcept { uint64_t gp_ext_mask = 0; uint64_t asimd_ext_mask = 0; @@ -85,16 +89,16 @@ void UniCompiler::init_vec_width(VecWidth vw) noexcept { _vec_width = VecWidth::k128; _vec_reg_type = RegType::kVec128; - _vec_type_id = asmjit::TypeId::kInt32x4; + _vec_type_id = TypeId::kInt32x4; _vec_multiplier = 1u; } bool UniCompiler::has_masked_access_of(uint32_t data_size) const noexcept { switch (data_size) { - case 1: return has_opt_flag(UniOptFlags::kMaskOps8Bit); - case 2: return has_opt_flag(UniOptFlags::kMaskOps16Bit); - case 4: return has_opt_flag(UniOptFlags::kMaskOps32Bit); - case 8: return has_opt_flag(UniOptFlags::kMaskOps64Bit); + case 1: return has_cpu_hint(CpuHints::kVecMaskedOps8); + case 2: return has_cpu_hint(CpuHints::kVecMaskedOps16); + case 4: return has_cpu_hint(CpuHints::kVecMaskedOps32); + case 8: return has_cpu_hint(CpuHints::kVecMaskedOps64); default: return false; @@ -104,7 +108,7 @@ bool UniCompiler::has_masked_access_of(uint32_t data_size) const noexcept { // ujit::UniCompiler - Function // ============================ -void UniCompiler::init_function(asmjit::FuncNode* func_node) noexcept { +void UniCompiler::init_function(FuncNode* func_node) noexcept { cc->add_func(func_node); _func_node = func_node; @@ -116,7 +120,7 @@ void UniCompiler::init_function(asmjit::FuncNode* func_node) noexcept { // ============================= void UniCompiler::_init_vec_const_table_ptr() noexcept { - const void* global = &vec_const_table; + const void* global = ct_ptr(); if (!_common_table_ptr.is_valid()) { ScopedInjector injector(cc, &_func_init); @@ -152,7 +156,7 @@ Vec UniCompiler::simd_vec_const(const void* c, Bcst bcst_width, VecWidth const_w } } - return Vec(OperandSignature{RegTraits::kSignature}, _new_vecConst(c, true).id()); + return Vec(OperandSignature{RegTraits::kSignature}, _new_vec_const(c, true).id()); } Vec UniCompiler::simd_vec_const(const 
void* c, Bcst bcst_width, const Vec& similar_to) noexcept { @@ -181,39 +185,39 @@ Mem UniCompiler::simd_mem_const(const void* c, Bcst bcst_width, const VecArray& } Mem UniCompiler::_get_mem_const(const void* c) noexcept { - // Make sure we are addressing a constant from the `vec_const_table` constant pool. - const void* global = &vec_const_table; - ASMJIT_ASSERT((uintptr_t)c >= (uintptr_t)global && - (uintptr_t)c < (uintptr_t)global + sizeof(VecConstTable)); + // Make sure we are addressing a constant from the `ct` constant pool. + const void* ct_addr = ct_ptr(); + ASMJIT_ASSERT((uintptr_t)c >= (uintptr_t)ct_addr && + (uintptr_t)c < (uintptr_t)ct_addr + _ct_ref.size); - // One GP register is sacrificed to hold the pointer to the `vec_const_table`. + // One GP register is sacrificed to hold the pointer to the `ct`. _init_vec_const_table_ptr(); - int32_t disp = int32_t((intptr_t)c - (intptr_t)global); + int32_t disp = int32_t((intptr_t)c - (intptr_t)ct_addr); return mem_ptr(_common_table_ptr, disp - _common_table_offset); } -Vec UniCompiler::_new_vecConst(const void* c, bool is_unique_const) noexcept { +Vec UniCompiler::_new_vec_const(const void* c, bool is_unique_const) noexcept { Support::maybe_unused(is_unique_const); Vec vec; const char* special_const_name = nullptr; if (special_const_name) { - vec = new_vec(vec_width(), special_const_name); + vec = new_vec_with_width(vec_width(), special_const_name); } else { uint64_t u0 = static_cast(c)[0]; uint64_t u1 = static_cast(c)[1]; if (u0 != u1) - vec = new_vec(vec_width(), "c_0x%016llX%016llX", (unsigned long long)u1, (unsigned long long)u0); + vec = new_vec_with_width(vec_width(), "c_0x%016llX%016llX", (unsigned long long)u1, (unsigned long long)u0); else if ((u0 >> 32) != (u0 & 0xFFFFFFFFu)) - vec = new_vec(vec_width(), "c_0x%016llX", (unsigned long long)u0); + vec = new_vec_with_width(vec_width(), "c_0x%016llX", (unsigned long long)u0); else if (((u0 >> 16) & 0xFFFFu) != (u0 & 0xFFFFu)) - vec = new_vec(vec_width(), "c_0x%08X", (unsigned)(u0 & 0xFFFFFFFFu)); + vec = new_vec_with_width(vec_width(), "c_0x%08X", (unsigned)(u0 & 0xFFFFFFFFu)); else - vec = new_vec(vec_width(), "c_0x%04X", (unsigned)(u0 & 0xFFFFu)); + vec = new_vec_with_width(vec_width(), "c_0x%04X", (unsigned)(u0 & 0xFFFFu)); } VecConstData const_data; @@ -221,7 +225,7 @@ Vec UniCompiler::_new_vecConst(const void* c, bool is_unique_const) noexcept { const_data.virt_reg_id = vec.id(); _vec_consts.append(arena(), const_data); - if (c == &ct.p_0000000000000000) { + if (c == &ct().p_0000000000000000) { ScopedInjector inject(cc, &_func_init); v_zero_i(vec.v128()); } @@ -245,14 +249,14 @@ Vec UniCompiler::simd_const_16b(const void* data16) noexcept { } } - Vec vec = new_vec(VecWidth::k128, "const"); + Vec vec = new_vec128("const"); VecConstDataEx entry; memcpy(entry.data, data16, 16); entry.virt_reg_id = vec.id(); _vec_consts_ex.append(arena(), entry); - Mem mem = cc->new_const(asmjit::ConstPoolScope::kLocal, data16, 16); + Mem mem = cc->new_const(ConstPoolScope::kLocal, data16, 16); { ScopedInjector inject(cc, &_func_init); v_loadavec(vec, mem); @@ -302,8 +306,8 @@ struct MemInst { uint16_t mem_size; }; -static ASMJIT_NOINLINE void gp_emit_mem_op(UniCompiler* pc, Gp r, Mem m, MemInst ii) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void gp_emit_mem_op(UniCompiler& uc, Gp r, Mem m, MemInst ii) noexcept { + BackendCompiler* cc = uc.cc; InstId inst_id = ii.inst_id; if (m.has_index() && m.has_shift()) { @@ -313,7 +317,7 @@ static ASMJIT_NOINLINE void 
gp_emit_mem_op(UniCompiler* pc, Gp r, Mem m, MemInst uint32_t shift = m.shift(); if (mem_size != (1u << shift)) { - Gp tmp = pc->new_gpz("@mem_addr"); + Gp tmp = uc.new_gpz("@mem_addr"); cc->add(tmp, m.base_reg().as(), m.index_reg().as(), a64::Shift(m.shift_op(), shift)); m = a64::ptr(tmp); } @@ -328,7 +332,7 @@ static ASMJIT_INLINE const Gp& gp_zero_as(const Gp& ref) noexcept { return gp_zero_regs[size_t(ref.is_gp64())]; } -static ASMJIT_NOINLINE Gp gp_force_reg(UniCompiler* pc, const Operand_& op, const Gp& ref) noexcept { +static ASMJIT_NOINLINE Gp gp_force_reg(UniCompiler& uc, const Operand_& op, const Gp& ref) noexcept { ASMJIT_ASSERT(op.is_gp() || op.is_mem() || op.is_imm()); Gp reg; @@ -343,11 +347,11 @@ static ASMJIT_NOINLINE Gp gp_force_reg(UniCompiler* pc, const Operand_& op, cons return gp_zero_as(ref); } - BackendCompiler* cc = pc->cc; - reg = pc->new_similar_reg(ref, "@tmp"); + BackendCompiler* cc = uc.cc; + reg = uc.new_similar_reg(ref, "@tmp"); if (op.is_mem()) { - gp_emit_mem_op(pc, reg, op.as(), MemInst{uint16_t(Inst::kIdLdr), uint16_t(reg.size())}); + gp_emit_mem_op(uc, reg, op.as(), MemInst{uint16_t(Inst::kIdLdr), uint16_t(reg.size())}); } else { cc->mov(reg, op.as()); @@ -375,15 +379,15 @@ static constexpr ConditionOpInfo condition_op_info[size_t(UniOpCond::kMaxValue) { Inst::kIdCmp , 0 } // UniOpCond::kCompare }; -class ConditionApplier : public Condition { +class ConditionApplier : public UniCondition { public: - ASMJIT_INLINE ConditionApplier(const Condition& condition) noexcept : Condition(condition) { + ASMJIT_INLINE ConditionApplier(const UniCondition& condition) noexcept : UniCondition(condition) { // The first operand must always be a register. ASMJIT_ASSERT(a.is_gp()); } - ASMJIT_NOINLINE void optimize(UniCompiler* pc) noexcept { - Support::maybe_unused(pc); + ASMJIT_NOINLINE void optimize(UniCompiler& uc) noexcept { + Support::maybe_unused(uc); switch (op) { case UniOpCond::kCompare: @@ -412,8 +416,8 @@ public: cond = a64::reverse_cond(cond); } - ASMJIT_NOINLINE void emit(UniCompiler* pc) noexcept { - BackendCompiler* cc = pc->cc; + ASMJIT_NOINLINE void emit(UniCompiler& uc) noexcept { + BackendCompiler* cc = uc.cc; ConditionOpInfo info = condition_op_info[size_t(op)]; Gp aGp = a.as(); @@ -424,7 +428,7 @@ public: cc->emit(info.inst_id, aGp, aGp, b.as()); } else { - cc->emit(info.inst_id, aGp, aGp, gp_force_reg(pc, b, aGp)); + cc->emit(info.inst_id, aGp, aGp, gp_force_reg(uc, b, aGp)); } return; } @@ -435,7 +439,7 @@ public: cc->emit(info.inst_id, aGp, aGp, b.as()); } else { - cc->emit(info.inst_id, aGp, aGp, gp_force_reg(pc, b, aGp)); + cc->emit(info.inst_id, aGp, aGp, gp_force_reg(uc, b, aGp)); } return; } @@ -457,7 +461,7 @@ public: cc->tst(aGp, aGp); } else { - cc->emit(info.inst_id, aGp, aGp, gp_force_reg(pc, b, aGp)); + cc->emit(info.inst_id, aGp, aGp, gp_force_reg(uc, b, aGp)); cc->tst(aGp, aGp); } return; @@ -468,7 +472,7 @@ public: cc->adds(aGp, gp_zero_as(aGp), aGp, a64::lsr(b.as().value_as())); } else { - cc->lsr(aGp, aGp, gp_force_reg(pc, b, aGp)); + cc->lsr(aGp, aGp, gp_force_reg(uc, b, aGp)); cc->tst(aGp, aGp); } return; @@ -479,7 +483,7 @@ public: cc->emit(info.inst_id, aGp, b.as()); } else { - cc->emit(info.inst_id, aGp, gp_force_reg(pc, b, aGp)); + cc->emit(info.inst_id, aGp, gp_force_reg(uc, b, aGp)); } return; } @@ -489,14 +493,14 @@ public: cc->emit(info.inst_id, aGp, b.as()); } else { - cc->emit(info.inst_id, aGp, gp_force_reg(pc, b, aGp)); + cc->emit(info.inst_id, aGp, gp_force_reg(uc, b, aGp)); } return; } case UniOpCond::kBitTest: { - Gp 
tmp = pc->new_similar_reg(aGp); - cc->lsr(tmp, aGp, gp_force_reg(pc, b, aGp)); + Gp tmp = uc.new_similar_reg(aGp); + cc->lsr(tmp, aGp, gp_force_reg(uc, b, aGp)); cc->tst(tmp, Imm(1)); return; } @@ -512,7 +516,7 @@ public: void UniCompiler::emit_mov(const Gp& dst, const Operand_& src) noexcept { if (src.is_mem()) { - gp_emit_mem_op(this, dst, src.as(), MemInst{uint16_t(Inst::kIdLdr), uint16_t(dst.size())}); + gp_emit_mem_op(*this, dst, src.as(), MemInst{uint16_t(Inst::kIdLdr), uint16_t(dst.size())}); } else { cc->emit(Inst::kIdMov, dst, src); @@ -531,7 +535,7 @@ void UniCompiler::emit_m(UniOpM op, const Mem& m_) noexcept { Gp zero = gp_zero_regs[size_t(op == UniOpM::kStoreZeroReg || op == UniOpM::kStoreZeroU64)]; MemInst ii = st_inst[size_t(op)]; - gp_emit_mem_op(this, zero, m_, ii); + gp_emit_mem_op(*this, zero, m_, ii); } void UniCompiler::emit_rm(UniOpRM op, const Gp& dst, const Mem& src) noexcept { @@ -578,14 +582,14 @@ void UniCompiler::emit_rm(UniOpRM op, const Gp& dst, const Mem& src) noexcept { r = r.w(); } - gp_emit_mem_op(this, r, m, ii); + gp_emit_mem_op(*this, r, m, ii); return; } case UniOpRM::kLoadShiftU8: case UniOpRM::kLoadShiftU16: { Gp tmp = new_similar_reg(r); - gp_emit_mem_op(this, tmp.r32(), m, ii); + gp_emit_mem_op(*this, tmp.r32(), m, ii); cc->orr(r, tmp, r, a64::lsl(ii.mem_size * 8)); return; } @@ -593,7 +597,7 @@ void UniCompiler::emit_rm(UniOpRM op, const Gp& dst, const Mem& src) noexcept { case UniOpRM::kLoadMergeU8: case UniOpRM::kLoadMergeU16: { Gp tmp = new_similar_reg(r); - gp_emit_mem_op(this, tmp.r32(), m, ii); + gp_emit_mem_op(*this, tmp.r32(), m, ii); cc->orr(r, r, tmp); return; } @@ -649,20 +653,20 @@ void UniCompiler::emit_mr(UniOpMR op, const Mem& dst, const Gp& src) noexcept { } } -void UniCompiler::emit_cmov(const Gp& dst, const Operand_& sel, const Condition& condition) noexcept { +void UniCompiler::emit_cmov(const Gp& dst, const Operand_& sel, const UniCondition& condition) noexcept { ConditionApplier ca(condition); - ca.optimize(this); - ca.emit(this); - cc->csel(dst, gp_force_reg(this, sel, dst), dst, condition.cond); + ca.optimize(*this); + ca.emit(*this); + cc->csel(dst, gp_force_reg(*this, sel, dst), dst, condition.cond); } -void UniCompiler::emit_select(const Gp& dst, const Operand_& sel1_, const Operand_& sel2_, const Condition& condition) noexcept { +void UniCompiler::emit_select(const Gp& dst, const Operand_& sel1_, const Operand_& sel2_, const UniCondition& condition) noexcept { ConditionApplier ca(condition); - ca.optimize(this); - ca.emit(this); + ca.optimize(*this); + ca.emit(*this); - Gp sel1 = gp_force_reg(this, sel1_, dst); - Gp sel2 = gp_force_reg(this, sel2_, dst); + Gp sel1 = gp_force_reg(*this, sel1_, dst); + Gp sel2 = gp_force_reg(*this, sel2_, dst); cc->csel(dst, sel1, sel2, condition.cond); } @@ -671,7 +675,7 @@ void UniCompiler::emit_2i(UniOpRR op, const Gp& dst, const Operand_& src_) noexc // ---------------- if (src_.is_reg_or_mem()) { - Gp src = gp_force_reg(this, src_, dst); + Gp src = gp_force_reg(*this, src_, dst); switch (op) { case UniOpRR::kAbs: { @@ -755,7 +759,7 @@ void UniCompiler::emit_3i(UniOpRRR op, const Gp& dst, const Operand_& src1_, con std::swap(src1, src2); } else { - src1 = gp_force_reg(this, src1, dst); + src1 = gp_force_reg(*this, src1, dst); } } @@ -805,13 +809,13 @@ void UniCompiler::emit_3i(UniOpRRR op, const Gp& dst, const Operand_& src1_, con // TODO: [JIT] Just testing the idea of patching the previous instruction to have a post-index addressing. 
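// In practice this means: if the previous node is an ldr/str (GP or vector) whose memory
// operand uses `a` as the base with no index and no offset, and the add's destination is `a`
// itself, the explicit `add dst, a, #value` can be folded into that load/store as post-index
// addressing. A pointer bump such as:
//
//   ldr x0, [x1]        ; load
//   add x1, x1, #16     ; advance pointer
//
// becomes a single `ldr x0, [x1], #16`. The `value < 256` guard keeps the immediate within
// the signed 9-bit post-index range these load/store forms support.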
if (!reverse && uint64_t(value) < 256 && dst.id() == a.id()) { - if (cc->cursor()->type() == asmjit::NodeType::kInst) { - asmjit::InstNode* prev_inst = cc->cursor()->as(); + if (cc->cursor()->type() == NodeType::kInst) { + InstNode* prev_inst = cc->cursor()->as(); if (prev_inst->inst_id() == Inst::kIdLdr || prev_inst->inst_id() == Inst::kIdStr || prev_inst->inst_id() == Inst::kIdLdr_v || prev_inst->inst_id() == Inst::kIdStr_v) { Mem& mem_op = prev_inst->op(prev_inst->op_count() - 1).as(); if (mem_op.base_reg() == a && !mem_op.has_index() && !mem_op.has_offset()) { - mem_op.set_offset_mode(asmjit::arm::OffsetMode::kPostIndex); + mem_op.set_offset_mode(arm::OffsetMode::kPostIndex); mem_op.add_offset(int64_t(value)); return; } @@ -819,7 +823,7 @@ void UniCompiler::emit_3i(UniOpRRR op, const Gp& dst, const Operand_& src1_, con } } - if (asmjit::Support::is_uint_n<12>(value)) { + if (Support::is_uint_n<12>(value)) { cc->emit(addsub_inst[reverse], dst, a, Imm(value)); return; } @@ -831,15 +835,15 @@ void UniCompiler::emit_3i(UniOpRRR op, const Gp& dst, const Operand_& src1_, con case UniOpRRR::kMul: { uint64_t value = b.value_as(); if (value > 0u) { - if (asmjit::Support::is_power_of_2(value)) { - uint32_t shift = asmjit::Support::ctz(value); + if (Support::is_power_of_2(value)) { + uint32_t shift = Support::ctz(value); cc->lsl(dst, a, Imm(shift)); return; } // We can still support multiplication with `power_of_2 + 1` - if (asmjit::Support::is_power_of_2(--value)) { - uint32_t shift = asmjit::Support::ctz(value); + if (Support::is_power_of_2(--value)) { + uint32_t shift = Support::ctz(value); cc->add(dst, a, a, a64::lsl(shift)); return; } @@ -921,7 +925,7 @@ void UniCompiler::emit_3i(UniOpRRR op, const Gp& dst, const Operand_& src1_, con // --------------------- { - src2 = gp_force_reg(this, src2, dst); + src2 = gp_force_reg(*this, src2, dst); Gp a = src1.as(); Gp b = src2.as(); @@ -1020,10 +1024,10 @@ void UniCompiler::emit_j(const Operand_& target) noexcept { cc->emit(Inst::kIdB, target); } -void UniCompiler::emit_j_if(const Label& target, const Condition& condition) noexcept { +void UniCompiler::emit_j_if(const Label& target, const UniCondition& condition) noexcept { ConditionApplier ca(condition); - ca.optimize(this); - ca.emit(this); + ca.optimize(*this); + ca.emit(*this); cc->b(ca.cond, target); } @@ -1066,8 +1070,8 @@ void UniCompiler::mul_257_hu16(const Gp& dst, const Gp& src) noexcept { void UniCompiler::add_scaled(const Gp& dst, const Gp& a_, int b) noexcept { Gp a = a_.clone_as(dst); - if (asmjit::Support::is_power_of_2(b)) { - uint32_t shift = asmjit::Support::ctz(b); + if (Support::is_power_of_2(b)) { + uint32_t shift = Support::ctz(b); cc->add(dst, dst, a, a64::lsl(shift)); } else if (b == 3 && dst.id() == a.id()) { @@ -1086,8 +1090,8 @@ void UniCompiler::add_ext(const Gp& dst, const Gp& src_, const Gp& idx_, uint32_ Gp src = src_.clone_as(dst); Gp idx = idx_.clone_as(dst); - if (asmjit::Support::is_power_of_2(scale)) { - cc->add(dst, src, idx, a64::lsl(asmjit::Support::ctz(scale))); + if (Support::is_power_of_2(scale)) { + cc->add(dst, src, idx, a64::lsl(Support::ctz(scale))); } else { Gp tmp = new_similar_reg(dst, "@tmp"); @@ -1206,20 +1210,20 @@ struct UniOpVInfo { }; #define DEFINE_OP(inst_id, ext, commutative, comparison, reverse, same_vec_op, float_mode, dst_element, dst_part, src_element, src_part, imm) \ - UniOpVInfo { \ + UniOpVInfo { \ inst_id, \ - ASIMDExt::ext, \ - commutative, \ - comparison, \ - reverse, \ - SameVecOp::same_vec_op, \ + ASIMDExt::ext, \ + commutative, 
\ + comparison, \ + reverse, \ + SameVecOp::same_vec_op, \ FloatMode::float_mode, \ ElementSize::dst_element, \ VecPart::dst_part, \ ElementSize::src_element, \ VecPart::src_part, \ - imm, \ - 0 \ + imm, \ + 0 \ } static constexpr UniOpVInfo opcode_info_2v[size_t(UniOpVV::kMaxValue) + 1] = { @@ -1261,8 +1265,12 @@ static constexpr UniOpVInfo opcode_info_2v[size_t(UniOpVV::kMaxValue) + 1] = { DEFINE_OP(Inst::kIdSshll2_v , kIntrin, 0, 0, 0, kNone, kNone, k64, kNA, k32, kHi, 0x00u), // kCvtI32HiToI64 DEFINE_OP(Inst::kIdUshll_v , kIntrin, 0, 0, 0, kNone, kNone, k64, kNA, k32, kLo, 0x00u), // kCvtU32LoToU64 DEFINE_OP(Inst::kIdUshll2_v , kIntrin, 0, 0, 0, kNone, kNone, k64, kNA, k32, kHi, 0x00u), // kCvtU32HiToU64 + DEFINE_OP(Inst::kIdFabs_v , kASIMD , 0, 0, 0, kNone, kF32S, k32, kNA, k32, kNA, 0x00u), // kAbsF32S. + DEFINE_OP(Inst::kIdFabs_v , kASIMD , 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kAbsF64S. DEFINE_OP(Inst::kIdFabs_v , kASIMD , 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kAbsF32. DEFINE_OP(Inst::kIdFabs_v , kASIMD , 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kAbsF64. + DEFINE_OP(Inst::kIdFneg_v , kASIMD , 0, 0, 0, kNone, kF32S, k32, kNA, k32, kNA, 0x00u), // kNegF32S. + DEFINE_OP(Inst::kIdFneg_v , kASIMD , 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kNegF64S. DEFINE_OP(Inst::kIdFneg_v , kASIMD , 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kNegF32. DEFINE_OP(Inst::kIdFneg_v , kASIMD , 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kNegF64. DEFINE_OP(Inst::kIdMvn_v , kASIMD , 0, 0, 0, kNone, kNone, k8 , kNA, k8 , kNA, 0x00u), // kNotF32. @@ -1279,10 +1287,18 @@ static constexpr UniOpVInfo opcode_info_2v[size_t(UniOpVV::kMaxValue) + 1] = { DEFINE_OP(Inst::kIdFrintp_v , kASIMD , 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kCeilF64S. DEFINE_OP(Inst::kIdFrintp_v , kASIMD , 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kCeilF32. DEFINE_OP(Inst::kIdFrintp_v , kASIMD , 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kCeilF64. - DEFINE_OP(Inst::kIdFrintn_v , kASIMD , 0, 0, 0, kNone, kF32S, k32, kNA, k32, kNA, 0x00u), // kRoundF32S. - DEFINE_OP(Inst::kIdFrintn_v , kASIMD , 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kRoundF64S. - DEFINE_OP(Inst::kIdFrintn_v , kASIMD , 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kRoundF32. - DEFINE_OP(Inst::kIdFrintn_v , kASIMD , 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kRoundF64. + DEFINE_OP(Inst::kIdFrintn_v , kASIMD , 0, 0, 0, kNone, kF32S, k32, kNA, k32, kNA, 0x00u), // kRoundEvenF32S. + DEFINE_OP(Inst::kIdFrintn_v , kASIMD , 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kRoundEvenF64S. + DEFINE_OP(Inst::kIdFrintn_v , kASIMD , 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kRoundEvenF32. + DEFINE_OP(Inst::kIdFrintn_v , kASIMD , 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kRoundEvenF64. + DEFINE_OP(Inst::kIdFrinta_v , kASIMD , 0, 0, 0, kNone, kF32S, k32, kNA, k32, kNA, 0x00u), // kRoundHalfAwayF32S. + DEFINE_OP(Inst::kIdFrinta_v , kASIMD , 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kRoundHalfAwayF64S. + DEFINE_OP(Inst::kIdFrinta_v , kASIMD , 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kRoundHalfAwayF32. + DEFINE_OP(Inst::kIdFrinta_v , kASIMD , 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kRoundHalfAwayF64. + DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kF32S, k32, kNA, k32, kNA, 0x00u), // kRoundHalfUpF32S. 
+ DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kRoundHalfUpF64S. + DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kRoundHalfUpF32. + DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kRoundHalfUpF64. DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kNone, k32, kNA, k32, kNA, 0x00u), // kRcpF32. DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kNone, k64, kNA, k64, kNA, 0x00u), // kRcpF64. DEFINE_OP(Inst::kIdFsqrt_v , kASIMD , 0, 0, 0, kNone, kF32S, k32, kNA, k32, kNA, 0x00u), // kSqrtF32S. @@ -1492,6 +1508,10 @@ static constexpr UniOpVInfo opcode_info_3v[size_t(UniOpVVV::kMaxValue) + 1] = { DEFINE_OP(Inst::kIdFdiv_v , kASIMD , 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kDivF64S. DEFINE_OP(Inst::kIdFdiv_v , kASIMD , 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kDivF32. DEFINE_OP(Inst::kIdFdiv_v , kASIMD , 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kDivF64. + DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kF32S, k32, kNA, k32, kNA, 0x00u), // kModF32S. + DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kModF64S. + DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kModF32. + DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kModF64. DEFINE_OP(Inst::kIdFminnm_v , kASIMD , 1, 0, 0, kSrc , kF32S, k32, kNA, k32, kNA, 0x00u), // kMinF32S. DEFINE_OP(Inst::kIdFminnm_v , kASIMD , 1, 0, 0, kSrc , kF64S, k64, kNA, k64, kNA, 0x00u), // kMinF64S. DEFINE_OP(Inst::kIdFminnm_v , kASIMD , 1, 0, 0, kSrc , kF32V, k32, kNA, k32, kNA, 0x00u), // kMinF32. @@ -1809,8 +1829,8 @@ static ASMJIT_INLINE void vec_set_type_and_index(Vec& vec, ElementSize sz, uint3 vec.set_element_index(idx); } -static ASMJIT_NOINLINE void vec_load_mem(UniCompiler* pc, const Vec& dst, Mem src, uint32_t mem_size) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void vec_load_mem(UniCompiler& uc, const Vec& dst, Mem src, uint32_t mem_size) noexcept { + BackendCompiler* cc = uc.cc; if (src.has_index() && src.has_shift()) { // AArch64 limitation: index shift can be the same size as the size of the read operation, so H << 1, S << 2, @@ -1825,7 +1845,7 @@ static ASMJIT_NOINLINE void vec_load_mem(UniCompiler* pc, const Vec& dst, Mem sr src = a64::ptr(base, src.offset_lo32()); } else { - Gp tmp = pc->new_gpz("@mem_addr"); + Gp tmp = uc.new_gpz("@mem_addr"); cc->add(tmp, base, index, a64::Shift(src.shift_op(), shift)); src = a64::ptr(tmp, src.offset_lo32()); } @@ -1843,30 +1863,30 @@ static ASMJIT_NOINLINE void vec_load_mem(UniCompiler* pc, const Vec& dst, Mem sr } } -static ASMJIT_NOINLINE Vec vec_from_mem(UniCompiler* pc, const Mem& op, const Vec& ref, uint32_t mem_size = 0) noexcept { - Vec vec = pc->new_vec128("@tmp"); +static ASMJIT_NOINLINE Vec vec_from_mem(UniCompiler& uc, const Mem& op, const Vec& ref, uint32_t mem_size = 0) noexcept { + Vec vec = uc.new_vec128("@tmp"); if (mem_size == 0) mem_size = ref.size(); - vec_load_mem(pc, vec, op, mem_size); + vec_load_mem(uc, vec, op, mem_size); return vec.clone_as(ref); } -static ASMJIT_INLINE Vec as_vec(UniCompiler* pc, const Operand_& op, const Vec& ref, uint32_t mem_size = 0) noexcept { +static ASMJIT_INLINE Vec as_vec(UniCompiler& uc, const Operand_& op, const Vec& ref, uint32_t mem_size = 0) noexcept { if (op.is_vec()) return op.as().clone_as(ref); else - return vec_from_mem(pc, op.as(), ref, 
mem_size); + return vec_from_mem(uc, op.as(), ref, mem_size); } -static ASMJIT_INLINE Vec as_vec(UniCompiler* pc, const Operand_& op, const Vec& ref, FloatMode fm) noexcept { +static ASMJIT_INLINE Vec as_vec(UniCompiler& uc, const Operand_& op, const Vec& ref, FloatMode fm) noexcept { if (op.is_vec()) return op.as().clone_as(ref); else - return vec_from_mem(pc, op.as(), ref, float_mode_mem_size_table[size_t(fm)]); + return vec_from_mem(uc, op.as(), ref, float_mode_mem_size_table[size_t(fm)]); } -static ASMJIT_NOINLINE Vec vec_mov(UniCompiler* pc, const Vec& dst_, const Operand_& src_) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE Vec vec_mov(UniCompiler& uc, const Vec& dst_, const Operand_& src_) noexcept { + BackendCompiler* cc = uc.cc; Vec dst(dst_); vec_set_type(dst, ElementSize::k8); @@ -1881,15 +1901,15 @@ static ASMJIT_NOINLINE Vec vec_mov(UniCompiler* pc, const Vec& dst_, const Opera } if (src_.is_mem()) { - vec_load_mem(pc, dst, src_.as(), dst.size()); + vec_load_mem(uc, dst, src_.as(), dst.size()); return dst; } ASMJIT_NOT_REACHED(); } -static ASMJIT_NOINLINE void vec_neg(UniCompiler* pc, const Vec& dst, const Vec& src, FloatMode fm) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void vec_neg(UniCompiler& uc, const Vec& dst, const Vec& src, FloatMode fm) noexcept { + BackendCompiler* cc = uc.cc; if (fm == FloatMode::kF32S) cc->mvn_(dst.s(), src.s()); @@ -2109,10 +2129,10 @@ static constexpr Swizzle32Data swizzle_32_data[256] = { #undef OP -static void emit_swizzle32_impl(UniCompiler* pc, const Vec& dst, const Vec& src, uint32_t imm) noexcept { +static void emit_swizzle32_impl(UniCompiler& uc, const Vec& dst, const Vec& src, uint32_t imm) noexcept { ASMJIT_ASSERT((imm & 0xFCFCFCFC) == 0); - BackendCompiler* cc = pc->cc; + BackendCompiler* cc = uc.cc; uint32_t table_index = ((imm & 0x03000000) >> (24 - 6)) | ((imm & 0x00030000) >> (16 - 4)) | @@ -2134,12 +2154,12 @@ static void emit_swizzle32_impl(UniCompiler* pc, const Vec& dst, const Vec& src, op_dst = dst; } else { - op_dst = pc->new_similar_reg(dst, "@tmp"); + op_dst = uc.new_similar_reg(dst, "@tmp"); } switch (op.type()) { case Swizzle32Data::Op::kMov: { - vec_mov(pc, op_dst, op_src[0]); + vec_mov(uc, op_dst, op_src[0]); break; } @@ -2226,7 +2246,7 @@ static void emit_swizzle32_impl(UniCompiler* pc, const Vec& dst, const Vec& src, pred_data[14] = uint8_t(d + 2u); pred_data[15] = uint8_t(d + 3u); - Vec pred = pc->simd_const_16b(pred_data); + Vec pred = uc.simd_const_16b(pred_data); cc->tbl(dst.b16(), src.b16(), pred.b16()); } } @@ -2508,13 +2528,13 @@ static constexpr InterleavedShuffle32Ops interleaved_shuffle32_ops_dst_same_as_b #undef OP -static void emit_interleaved_shuffle32_impl(UniCompiler* pc, const Vec& dst, const Vec& src1, const Vec& src2, uint32_t imm) noexcept { +static void emit_interleaved_shuffle32_impl(UniCompiler& uc, const Vec& dst, const Vec& src1, const Vec& src2, uint32_t imm) noexcept { ASMJIT_ASSERT((imm & 0xFCFCFCFC) == 0); if (src1.id() == src2.id()) - return emit_swizzle32_impl(pc, dst, src1, imm); + return emit_swizzle32_impl(uc, dst, src1, imm); - BackendCompiler* cc = pc->cc; + BackendCompiler* cc = uc.cc; uint32_t table_index = ((imm & 0x03000000) >> (24 - 6)) | ((imm & 0x00030000) >> (16 - 4)) | @@ -2575,7 +2595,7 @@ static void emit_interleaved_shuffle32_impl(UniCompiler* pc, const Vec& dst, con op_dst = regs[op_index]; } else { - op_dst = pc->new_similar_reg(dst, "@shuf_tmp_%u", op_index - 2); + op_dst = uc.new_similar_reg(dst, "@shuf_tmp_%u", op_index - 
2); } } else { @@ -2592,7 +2612,7 @@ static void emit_interleaved_shuffle32_impl(UniCompiler* pc, const Vec& dst, con // In this case the destination is in conflict with one of the source registers. We have to // create a new virtual register and then move it to the real `dst` to not mess up the shuffle. ASMJIT_ASSERT(!regs[op_index].is_valid()); - final_dst = pc->new_similar_reg(dst, "@shuf_dst"); + final_dst = uc.new_similar_reg(dst, "@shuf_dst"); } else { // Perfect - the destination is not in conflict with any source register. @@ -2661,7 +2681,7 @@ static void emit_interleaved_shuffle32_impl(UniCompiler* pc, const Vec& dst, con regs[op_index] = op_dst; } - vec_mov(pc, dst, final_dst); + vec_mov(uc, dst, final_dst); } // ujit::UniCompiler - Vector Instructions - OpArray Iterator @@ -2690,62 +2710,62 @@ public: }; template -static ASMJIT_INLINE void emit_2v_t(UniCompiler* pc, UniOpVV op, const OpArray& dst_, const Src& src_) noexcept { +static ASMJIT_INLINE void emit_2v_t(UniCompiler& uc, UniOpVV op, const OpArray& dst_, const Src& src_) noexcept { size_t n = dst_.size(); OpArrayIter src(src_); for (size_t i = 0; i < n; i++) { - pc->emit_2v(op, dst_[i], src.op()); + uc.emit_2v(op, dst_[i], src.op()); src.next(); } } template -static ASMJIT_INLINE void emit_2vi_t(UniCompiler* pc, UniOpVVI op, const OpArray& dst_, const Src& src_, uint32_t imm) noexcept { +static ASMJIT_INLINE void emit_2vi_t(UniCompiler& uc, UniOpVVI op, const OpArray& dst_, const Src& src_, uint32_t imm) noexcept { size_t n = dst_.size(); OpArrayIter src(src_); for (size_t i = 0; i < n; i++) { - pc->emit_2vi(op, dst_[i], src.op(), imm); + uc.emit_2vi(op, dst_[i], src.op(), imm); src.next(); } } template -static ASMJIT_INLINE void emit_3v_t(UniCompiler* pc, UniOpVVV op, const OpArray& dst_, const Src1& src1_, const Src2& src2_) noexcept { +static ASMJIT_INLINE void emit_3v_t(UniCompiler& uc, UniOpVVV op, const OpArray& dst_, const Src1& src1_, const Src2& src2_) noexcept { size_t n = dst_.size(); OpArrayIter src1(src1_); OpArrayIter src2(src2_); for (size_t i = 0; i < n; i++) { - pc->emit_3v(op, dst_[i], src1.op(), src2.op()); + uc.emit_3v(op, dst_[i], src1.op(), src2.op()); src1.next(); src2.next(); } } template -static ASMJIT_INLINE void emit_3vi_t(UniCompiler* pc, UniOpVVVI op, const OpArray& dst_, const Src1& src1_, const Src2& src2_, uint32_t imm) noexcept { +static ASMJIT_INLINE void emit_3vi_t(UniCompiler& uc, UniOpVVVI op, const OpArray& dst_, const Src1& src1_, const Src2& src2_, uint32_t imm) noexcept { size_t n = dst_.size(); OpArrayIter src1(src1_); OpArrayIter src2(src2_); for (size_t i = 0; i < n; i++) { - pc->emit_3vi(op, dst_[i], src1.op(), src2.op(), imm); + uc.emit_3vi(op, dst_[i], src1.op(), src2.op(), imm); src1.next(); src2.next(); } } template -static ASMJIT_INLINE void emit_4v_t(UniCompiler* pc, UniOpVVVV op, const OpArray& dst_, const Src1& src1_, const Src2& src2_, const Src3& src3_) noexcept { +static ASMJIT_INLINE void emit_4v_t(UniCompiler& uc, UniOpVVVV op, const OpArray& dst_, const Src1& src1_, const Src2& src2_, const Src3& src3_) noexcept { size_t n = dst_.size(); OpArrayIter src1(src1_); OpArrayIter src2(src2_); OpArrayIter src3(src3_); for (size_t i = 0; i < n; i++) { - pc->emit_4v(op, dst_[i], src1.op(), src2.op(), src3.op()); + uc.emit_4v(op, dst_[i], src1.op(), src2.op(), src3.op()); src1.next(); src2.next(); src3.next(); @@ -2774,13 +2794,13 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ return; } - vec_mov(this, dst, src_); + vec_mov(*this, dst, 
src_); return; } case UniOpVV::kMovU64: { dst = dst.d(); - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); cc->mov(dst.b8(), src.b8()); return; @@ -2809,7 +2829,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ return; } - v_load_iany(dst, src.as(), 1u << uint32_t(op_info.src_element), Alignment{1}); + v_load_iany(dst, src.as(), size_t(1) << uint32_t(op_info.src_element), Alignment(1)); src = dst; } @@ -2855,7 +2875,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kNotU64: case UniOpVV::kNotF32: case UniOpVV::kNotF64: { - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); vec_set_type(dst, op_info.dst_element); vec_set_type(src, op_info.src_element); @@ -2866,7 +2886,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCvtI8ToI32: case UniOpVV::kCvtU8ToU32: { - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); cc->emit(inst_id, dst.h8(), src.b8(), 0); cc->emit(inst_id, dst.s4(), dst.h4(), 0); return; @@ -2889,7 +2909,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ Vec src; if (op_info.src_part == VecPart::kLo) { - src = as_vec(this, src_, dst, 8); + src = as_vec(*this, src_, dst, 8); src = src.v64(); } else if (src_.is_vec()) { @@ -2898,7 +2918,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ else { Mem m(src_.as()); m.add_offset(8); - src = vec_from_mem(this, m, dst, 8); + src = vec_from_mem(*this, m, dst, 8); src = src.v64(); // Since we have loaded from memory, we want to use the low-part variant of the instruction. @@ -2913,10 +2933,10 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kRcpF32: case UniOpVV::kRcpF64: { // Intrinsic. - const void* one_ptr = op == UniOpVV::kRcpF32 ? static_cast(&ct.f32_1) : static_cast(&ct.f64_1); + const void* one_ptr = op_info.src_element == ElementSize::k32 ? static_cast(&ct().f32_1) : static_cast(&ct().f64_1); Vec one = simd_vec_const(one_ptr, Bcst::kNA, dst); - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); vec_set_type(dst, op_info.dst_element); vec_set_type(one, op_info.dst_element); @@ -2926,30 +2946,71 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ return; } + case UniOpVV::kAbsF32S: + case UniOpVV::kNegF32S: case UniOpVV::kTruncF32S: case UniOpVV::kFloorF32S: case UniOpVV::kCeilF32S: - case UniOpVV::kRoundF32S: + case UniOpVV::kRoundEvenF32S: + case UniOpVV::kRoundHalfAwayF32S: case UniOpVV::kSqrtF32S: { dst.set_signature(RegTraits::kSignature); - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); cc->emit(inst_id, dst, src); return; } + case UniOpVV::kAbsF64S: + case UniOpVV::kNegF64S: case UniOpVV::kTruncF64S: case UniOpVV::kFloorF64S: case UniOpVV::kCeilF64S: - case UniOpVV::kRoundF64S: + case UniOpVV::kRoundEvenF64S: + case UniOpVV::kRoundHalfAwayF64S: case UniOpVV::kSqrtF64S: { dst.set_signature(RegTraits::kSignature); - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); cc->emit(inst_id, dst, src); return; } + case UniOpVV::kRoundHalfUpF32S: + case UniOpVV::kRoundHalfUpF64S: + case UniOpVV::kRoundHalfUpF32: + case UniOpVV::kRoundHalfUpF64: { + // Intrinsic. + const void* one_ptr = + op_info.src_element == ElementSize::k32 + ? 
static_cast(&ct().f32_0_5_minus_1ulp) + : static_cast(&ct().f64_0_5_minus_1ulp); + + Vec one = simd_vec_const(one_ptr, Bcst::kNA, dst); + Vec src = as_vec(*this, src_, dst); + + if (op == UniOpVV::kRoundHalfUpF32S) { + dst.set_signature(RegTraits::kSignature); + src.set_signature(RegTraits::kSignature); + one.set_signature(RegTraits::kSignature); + } + else if (op == UniOpVV::kRoundHalfUpF64S) { + dst.set_signature(RegTraits::kSignature); + src.set_signature(RegTraits::kSignature); + one.set_signature(RegTraits::kSignature); + } + else { + vec_set_type(dst, op_info.dst_element); + vec_set_type(one, op_info.src_element); + vec_set_type(src, op_info.src_element); + } + + cc->fadd(dst, src, one); + cc->frintm(dst, dst); + + return; + } + case UniOpVV::kAbsF32: case UniOpVV::kAbsF64: case UniOpVV::kNegF32: @@ -2960,14 +3021,16 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kFloorF64: case UniOpVV::kCeilF32: case UniOpVV::kCeilF64: - case UniOpVV::kRoundF32: - case UniOpVV::kRoundF64: + case UniOpVV::kRoundEvenF32: + case UniOpVV::kRoundEvenF64: + case UniOpVV::kRoundHalfAwayF32: + case UniOpVV::kRoundHalfAwayF64: case UniOpVV::kSqrtF32: case UniOpVV::kSqrtF64: case UniOpVV::kCvtI32ToF32: case UniOpVV::kCvtRoundF32ToI32: case UniOpVV::kCvtTruncF32ToI32: { - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); vec_set_type(dst, op_info.dst_element); vec_set_type(src, op_info.src_element); @@ -2978,7 +3041,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCvtF32ToF64S: case UniOpVV::kCvtF64ToF32S: { - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); vec_set_vec_type(dst, op_info.dst_element); vec_set_vec_type(src, op_info.src_element); @@ -2989,7 +3052,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCvtF32HiToF64: if (src_.is_mem()) { - Vec src = as_vec(this, src_.as().clone_adjusted(8), dst, 8).v64(); + Vec src = as_vec(*this, src_.as().clone_adjusted(8), dst, 8).v64(); vec_set_type(dst, op_info.dst_element); vec_set_type(src, op_info.src_element); @@ -3000,7 +3063,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ [[fallthrough]]; case UniOpVV::kCvtF32LoToF64: { - Vec src = as_vec(this, src_, dst, 8); + Vec src = as_vec(*this, src_, dst, 8); if (op_info.src_part == VecPart::kLo) { src = src.v64(); @@ -3015,7 +3078,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCvtI32HiToF64: if (src_.is_mem()) { - Vec src = as_vec(this, src_.as().clone_adjusted(8), dst, 8).v64(); + Vec src = as_vec(*this, src_.as().clone_adjusted(8), dst, 8).v64(); vec_set_type(dst, op_info.dst_element); vec_set_type(src, op_info.src_element); @@ -3027,7 +3090,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ [[fallthrough]]; case UniOpVV::kCvtI32LoToF64: { - Vec src = as_vec(this, src_, dst, 8); + Vec src = as_vec(*this, src_, dst, 8); if (op_info.src_part == VecPart::kLo) { src = src.v64(); @@ -3044,7 +3107,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCvtF64ToF32Lo: case UniOpVV::kCvtF64ToF32Hi: { dst = dst.q(); - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); if (op_info.dst_part == VecPart::kLo) { dst = dst.d(); @@ -3063,7 +3126,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case 
UniOpVV::kCvtRoundF64ToI32Hi: { dst = dst.q(); - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); Vec tmp = new_similar_reg(dst, "@tmp"); cc->emit(inst_id, tmp.d2(), src.d2()); @@ -3083,8 +3146,8 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ } } -void UniCompiler::emit_2v(UniOpVV op, const OpArray& dst_, const Operand_& src_) noexcept { emit_2v_t(this, op, dst_, src_); } -void UniCompiler::emit_2v(UniOpVV op, const OpArray& dst_, const OpArray& src_) noexcept { emit_2v_t(this, op, dst_, src_); } +void UniCompiler::emit_2v(UniOpVV op, const OpArray& dst_, const Operand_& src_) noexcept { emit_2v_t(*this, op, dst_, src_); } +void UniCompiler::emit_2v(UniOpVV op, const OpArray& dst_, const OpArray& src_) noexcept { emit_2v_t(*this, op, dst_, src_); } // ujit::UniCompiler - Vector Instructions - Emit 2VI // ================================================== @@ -3102,7 +3165,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr case UniOpVVI::kSrlbU128: { ASMJIT_ASSERT(imm < 16); - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); // If the shift is used to extract a high 64-bit element and zero the rest of the register. if (op == UniOpVVI::kSrlbU128 && imm == 8) { @@ -3131,7 +3194,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr case UniOpVVI::kSwizzleU16x4: case UniOpVVI::kSwizzleLoU16x4: case UniOpVVI::kSwizzleHiU16x4: { - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); uint8_t pred_data[16] = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF }; @@ -3169,14 +3232,14 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr case UniOpVVI::kSwizzleU32x4: case UniOpVVI::kSwizzleF32x4: { - Vec src = as_vec(this, src_, dst); - emit_swizzle32_impl(this, dst, src, imm); + Vec src = as_vec(*this, src_, dst); + emit_swizzle32_impl(*this, dst, src, imm); return; } case UniOpVVI::kSwizzleU64x2: case UniOpVVI::kSwizzleF64x2: { - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); // Use `dup` to broadcast one 64-bit elements. 
if (Swizzle2{imm} == swizzle(0, 0) || @@ -3215,7 +3278,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr ASMJIT_NOT_REACHED(); default: { - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); if (op_info.dst_part == VecPart::kLo) dst = dst.d(); if (op_info.src_part == VecPart::kLo) src = src.d(); @@ -3229,8 +3292,8 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr } } -void UniCompiler::emit_2vi(UniOpVVI op, const OpArray& dst_, const Operand_& src_, uint32_t imm) noexcept { emit_2vi_t(this, op, dst_, src_, imm); } -void UniCompiler::emit_2vi(UniOpVVI op, const OpArray& dst_, const OpArray& src_, uint32_t imm) noexcept { emit_2vi_t(this, op, dst_, src_, imm); } +void UniCompiler::emit_2vi(UniOpVVI op, const OpArray& dst_, const Operand_& src_, uint32_t imm) noexcept { emit_2vi_t(*this, op, dst_, src_, imm); } +void UniCompiler::emit_2vi(UniOpVVI op, const OpArray& dst_, const OpArray& src_, uint32_t imm) noexcept { emit_2vi_t(*this, op, dst_, src_, imm); } // ujit::UniCompiler - Vector Instructions - Emit 2VS // ================================================== @@ -3375,7 +3438,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen case UniOpVM::kLoad128_U64: case UniOpVM::kLoad128_F32: case UniOpVM::kLoad128_F64: { - vec_load_mem(this, dst, src, op_info.mem_size); + vec_load_mem(*this, dst, src, op_info.mem_size); return; } @@ -3383,7 +3446,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen case UniOpVM::kLoadN_U64: case UniOpVM::kLoadN_F32: case UniOpVM::kLoadN_F64: { - vec_load_mem(this, dst.q(), src, 16); + vec_load_mem(*this, dst.q(), src, 16); return; } @@ -3411,7 +3474,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen case UniOpVM::kLoadCvt64_U16ToU32: case UniOpVM::kLoadCvt64_I32ToI64: case UniOpVM::kLoadCvt64_U32ToU64: { - vec_load_mem(this, dst, src, op_info.mem_size); + vec_load_mem(*this, dst, src, op_info.mem_size); emit_2v(UniOpVV(op_info.cvt_op), dst, dst); return; } @@ -3424,7 +3487,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen case UniOpVM::kLoadCvtN_U16ToU32: case UniOpVM::kLoadCvtN_I32ToI64: case UniOpVM::kLoadCvtN_U32ToU64: { - vec_load_mem(this, dst, src, dst.size() / 2u); + vec_load_mem(*this, dst, src, dst.size() / 2u); emit_2v(UniOpVV(op_info.cvt_op), dst, dst); return; } @@ -3442,7 +3505,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen } else { Vec tmp = new_similar_reg(dst, "@tmp"); - v_load_iany(tmp, src, op_info.mem_size, Alignment{1}); + v_load_iany(tmp, src, op_info.mem_size, Alignment(1)); vec_set_type_and_index(dst, op_info.element, idx); vec_set_type_and_index(tmp, op_info.element, 0); @@ -3725,7 +3788,7 @@ void UniCompiler::emit_mv(UniOpMV op, const Mem& dst_, const OpArray& src_, Alig // ================================================= static void emit_3v_op( - UniCompiler* pc, + UniCompiler& uc, InstId inst_id, Vec dst, Vec src1, Operand_ src2_, FloatMode float_mode, @@ -3739,19 +3802,19 @@ static void emit_3v_op( case FloatMode::kF32S: { dst = dst.s(); src1 = src1.s(); - src2 = as_vec(pc, src2_, dst, 4); + src2 = as_vec(uc, src2_, dst, 4); break; } case FloatMode::kF64S: { dst = dst.d(); src1 = src1.d(); - src2 = as_vec(pc, src2_, dst, 8); + src2 = as_vec(uc, src2_, dst, 8); break; } default: { - src2 = as_vec(pc, src2_, dst); + src2 = as_vec(uc, src2_, dst); if (dst_part == 
VecPart::kLo) { dst = dst.d(); @@ -3769,7 +3832,7 @@ static void emit_3v_op( } } - BackendCompiler* cc = pc->cc; + BackendCompiler* cc = uc.cc; if (reversed) cc->emit(inst_id, dst, src2, src1); else @@ -3799,7 +3862,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } case SameVecOp::kSrc: { - vec_mov(this, dst, src1); + vec_mov(*this, dst, src1); return; } @@ -3809,8 +3872,29 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } switch (op) { + // dst = a - (floor(a / b) * b). + case UniOpVVV::kModF32S: + case UniOpVVV::kModF64S: + case UniOpVVV::kModF32: + case UniOpVVV::kModF64: { + Vec src2 = as_vec(*this, src2_, dst, op_info.float_mode); + Vec tmp = new_similar_reg(dst, "@tmp1"); + + UniOpVVV sub_op = translate_op(op, UniOpVVV::kModF32S, UniOpVVV::kSubF32S); + UniOpVVV mul_op = translate_op(op, UniOpVVV::kModF32S, UniOpVVV::kMulF32S); + UniOpVVV div_op = translate_op(op, UniOpVVV::kModF32S, UniOpVVV::kDivF32S); + UniOpVV trunc_op = translate_op(op, UniOpVVV::kModF32S, UniOpVV::kTruncF32S); + + emit_3v(div_op, tmp, src1, src2); + emit_2v(trunc_op, tmp, tmp); + emit_3v(mul_op, tmp, tmp, src2); + emit_3v(sub_op, dst, src1, tmp); + + return; + } + case UniOpVVV::kMulU64: { - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); Vec tmp1 = new_similar_reg(dst, "@tmp1"); Vec tmp2 = new_similar_reg(dst, "@tmp2"); Vec tmp3 = new_similar_reg(dst, "@tmp3"); @@ -3827,7 +3911,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } case UniOpVVV::kMulhI16: { - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); Vec tmp = new_similar_reg(dst, "@tmp"); cc->smull(tmp.s4(), src1.h4(), src2.h4()); @@ -3837,7 +3921,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } case UniOpVVV::kMulhU16: { - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); Vec tmp = new_similar_reg(dst, "@tmp"); cc->umull(tmp.s4(), src1.h4(), src2.h4()); @@ -3847,7 +3931,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } case UniOpVVV::kMulU64_LoU32: { - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); Vec tmp1 = new_similar_reg(dst, "@tmp1"); Vec tmp2 = new_similar_reg(dst, "@tmp2"); Vec tmp3 = dst; @@ -3865,7 +3949,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } case UniOpVVV::kMHAddI16_I32: { - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); Vec al = new_similar_reg(dst, "@al"); Vec ah = new_similar_reg(dst, "@ah"); @@ -3888,7 +3972,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kMinU64: case UniOpVVV::kMaxI64: case UniOpVVV::kMaxU64: { - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); // Min/Max is commutative, so let's make dst only overlap src1. 
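// (ASIMD smin/smax/umin/umax only provide 8/16/32-bit element forms, so the 64-bit
// min/max variants cannot map to a single instruction and are expanded here as an
// intrinsic sequence instead.)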
if (dst.id() == src2.id()) { @@ -3923,8 +4007,8 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpNeF64S: case UniOpVVV::kCmpNeF32: case UniOpVVV::kCmpNeF64: { - emit_3v_op(this, inst_id, dst, src1, src2_, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, 0); - vec_neg(this, dst, dst, op_info.float_mode); + emit_3v_op(*this, inst_id, dst, src1, src2_, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, 0); + vec_neg(*this, dst, dst, op_info.float_mode); return; } @@ -3937,7 +4021,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpUnordF32: case UniOpVVV::kCmpUnordF64: { if (is_same_vec(src1, src2_)) { - emit_3v_op(this, Inst::kIdFcmeq_v, dst, src1, src1, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, 0); + emit_3v_op(*this, Inst::kIdFcmeq_v, dst, src1, src1, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, 0); } else { // This takes advantage of the following: @@ -3952,20 +4036,20 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src // // - If both elements are zeros, regardless of the sign of either zero, the result is the second element. // - If either element is a NaN, regardless of the value of FPCR.DN, the result is the second element. - Vec src2 = as_vec(this, src2_, dst, op_info.float_mode); - emit_3v_op(this, Inst::kIdFmin_v, dst, src1, src2, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, 0); - emit_3v_op(this, Inst::kIdFcmeq_v, dst, dst, dst, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, 0); + Vec src2 = as_vec(*this, src2_, dst, op_info.float_mode); + emit_3v_op(*this, Inst::kIdFmin_v, dst, src1, src2, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, 0); + emit_3v_op(*this, Inst::kIdFcmeq_v, dst, dst, dst, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, 0); } if (op_info.imm) - vec_neg(this, dst, dst, op_info.float_mode); + vec_neg(*this, dst, dst, op_info.float_mode); return; } case UniOpVVV::kHAddF64: { Vec tmp = new_similar_reg(dst, "@tmp"); - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); if (src1.id() == src2.id()) { cc->ext(tmp.b16(), src1.b16(), src1.b16(), 8); @@ -3982,7 +4066,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCombineLoHiU64: case UniOpVVV::kCombineLoHiF64: { // Intrinsic - dst = {src1.u64[0], src2.64[1]} - combining low part of src1 and high part of src1. - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); vec_set_type(dst, ElementSize::k8); vec_set_type(src1, ElementSize::k8); @@ -3996,7 +4080,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCombineHiLoU64: case UniOpVVV::kCombineHiLoF64: { // Intrinsic - dst = {src1.u64[1], src2.64[0]} - combining high part of src1 and low part of src2. 
- Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); if (is_same_vec(dst, src1)) { if (is_same_vec(dst, src2)) @@ -4023,7 +4107,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src size_t id = size_t(op) - size_t(UniOpVVV::kPacksI16_I8); - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); vec_set_type(src1, op_info.src_element); vec_set_type(src2, op_info.src_element); @@ -4058,15 +4142,15 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } default: { - emit_3v_op(this, inst_id, dst, src1, src2_, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, op_info.reverse); + emit_3v_op(*this, inst_id, dst, src1, src2_, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, op_info.reverse); return; } } } -void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_) noexcept { emit_3v_t(this, op, dst_, src1_, src2_); } -void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_) noexcept { emit_3v_t(this, op, dst_, src1_, src2_); } -void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_) noexcept { emit_3v_t(this, op, dst_, src1_, src2_); } +void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_) noexcept { emit_3v_t(*this, op, dst_, src1_, src2_); } +void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_) noexcept { emit_3v_t(*this, op, dst_, src1_, src2_); } +void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_) noexcept { emit_3v_t(*this, op, dst_, src1_, src2_); } // ujit::UniCompiler - Vector Instructions - Emit 3VI // ================================================== @@ -4087,11 +4171,11 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s ASMJIT_ASSERT(imm < 16); if (imm == 0) { - vec_mov(this, dst, src2_); + vec_mov(*this, dst, src2_); return; } - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); vec_set_type(dst, ElementSize::k8); vec_set_type(src1, ElementSize::k8); vec_set_type(src2, ElementSize::k8); @@ -4103,8 +4187,8 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s case UniOpVVVI::kInterleaveShuffleF32x4: { ASMJIT_ASSERT((imm & 0xFCFCFCFC) == 0); - Vec src2 = as_vec(this, src2_, dst); - emit_interleaved_shuffle32_impl(this, dst, src1, src2, imm); + Vec src2 = as_vec(*this, src2_, dst); + emit_interleaved_shuffle32_impl(*this, dst, src1, src2, imm); return; } @@ -4112,7 +4196,7 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s case UniOpVVVI::kInterleaveShuffleF64x2: { ASMJIT_ASSERT((imm & 0xFFFCFEFE) == 0); - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); if (src1.id() == src2.id()) { v_swizzle_u64x2(dst, src1, Swizzle2{imm}); @@ -4147,9 +4231,9 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s } } -void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, uint32_t imm) noexcept { emit_3vi_t(this, op, dst_, src1_, src2_, imm); } -void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, uint32_t imm) noexcept { 
emit_3vi_t(this, op, dst_, src1_, src2_, imm); } -void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, uint32_t imm) noexcept { emit_3vi_t(this, op, dst_, src1_, src2_, imm); } +void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, uint32_t imm) noexcept { emit_3vi_t(*this, op, dst_, src1_, src2_, imm); } +void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, uint32_t imm) noexcept { emit_3vi_t(*this, op, dst_, src1_, src2_, imm); } +void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, uint32_t imm) noexcept { emit_3vi_t(*this, op, dst_, src1_, src2_, imm); } // ujit::UniCompiler - Vector Instructions - Emit 4V // ================================================= @@ -4166,8 +4250,8 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr switch (op) { case UniOpVVVV::kBlendV_U8: { - Vec src2 = as_vec(this, src2_, dst); - Vec src3 = as_vec(this, src3_, dst); + Vec src2 = as_vec(*this, src2_, dst); + Vec src3 = as_vec(*this, src3_, dst); vec_set_type(dst, op_info.dst_element); vec_set_type(src1, op_info.src_element); @@ -4189,7 +4273,7 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr return; } - vec_mov(this, dst, src3); + vec_mov(*this, dst, src3); cc->bsl(dst, src2, src1); return; } @@ -4208,14 +4292,14 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr if (op_info.float_mode == FloatMode::kF32S) { dst = dst.s(); src1 = src1.s(); - src2 = as_vec(this, src2_, dst, 4); - src3 = as_vec(this, src3_, dst, 4); + src2 = as_vec(*this, src2_, dst, 4); + src3 = as_vec(*this, src3_, dst, 4); } else { dst = dst.d(); src1 = src1.d(); - src2 = as_vec(this, src2_, dst, 8); - src3 = as_vec(this, src3_, dst, 8); + src2 = as_vec(*this, src2_, dst, 8); + src3 = as_vec(*this, src3_, dst, 8); } cc->emit(inst_id, dst, src1, src2, src3); @@ -4232,7 +4316,7 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr case UniOpVVVV::kNMAddF64: case UniOpVVVV::kNMSubF32: case UniOpVVVV::kNMSubF64: { - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); Vec src3; bool negate_acc = op_info.imm != 0; @@ -4240,11 +4324,11 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr bool destructible = is_same_vec(dst, src3_) || !src3_.is_reg(); if (!dst_overlaps && src3_.is_mem()) { - vec_load_mem(this, dst, src3_.as(), dst.size()); + vec_load_mem(*this, dst, src3_.as(), dst.size()); src3 = dst; } else { - src3 = as_vec(this, src3_, dst); + src3 = as_vec(*this, src3_, dst); } vec_set_type(dst, op_info.dst_element); @@ -4288,13 +4372,13 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr } } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const Operand_& src2_, const OpArray& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, const Operand& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, const OpArray& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, 
const OpArray& src1_, const Operand_& src2_, const Operand& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); }
-void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, const OpArray& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); }
-void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const Operand& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); }
-void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const OpArray& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); }
+void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const Operand_& src2_, const OpArray& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); }
+void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, const Operand& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); }
+void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, const OpArray& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); }
+void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, const Operand& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); }
+void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, const OpArray& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); }
+void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const Operand& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); }
+void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const OpArray& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); }
ASMJIT_END_SUB_NAMESPACE
diff --git a/src/asmjit/ujit/unicompiler_utils_p.h b/src/asmjit/ujit/unicompiler_utils_p.h
new file mode 100644
index 0000000..b01cc1c
--- /dev/null
+++ b/src/asmjit/ujit/unicompiler_utils_p.h
@@ -0,0 +1,32 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See <asmjit.com> or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#ifndef ASMJIT_UJIT_UNICOMPILER_UTILS_P_H_INCLUDED
+#define ASMJIT_UJIT_UNICOMPILER_UTILS_P_H_INCLUDED
+
+#include "ujitbase.h"
+
+#if !defined(ASMJIT_NO_UJIT)
+
+#include "uniop.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(ujit)
+
+//! \addtogroup asmjit_ujit
+//! \{
+
+template<typename UniOpSrc, typename UniOpDst>
+static ASMJIT_INLINE UniOpDst translate_op(UniOpSrc op, UniOpSrc begin, UniOpDst target) noexcept {
+  ASMJIT_ASSERT(begin <= op);
+  uint32_t offset = uint32_t(op) - uint32_t(begin);
+  return UniOpDst(uint32_t(target) + offset);
+}
+
+//!
\} + +ASMJIT_END_SUB_NAMESPACE + +#endif // !ASMJIT_NO_UJIT +#endif // ASMJIT_UJIT_UNICOMPILER_UTILS_P_H_INCLUDED diff --git a/src/asmjit/ujit/unicompiler_x86.cpp b/src/asmjit/ujit/unicompiler_x86.cpp index 57b247b..a0c8939 100644 --- a/src/asmjit/ujit/unicompiler_x86.cpp +++ b/src/asmjit/ujit/unicompiler_x86.cpp @@ -9,6 +9,8 @@ #if defined(ASMJIT_UJIT_X86) #include "unicompiler.h" +#include "unicompiler_utils_p.h" +#include "unicondition.h" ASMJIT_BEGIN_SUB_NAMESPACE(ujit) @@ -17,8 +19,8 @@ using SSEExt = UniCompiler::SSEExt; using AVXExt = UniCompiler::AVXExt; namespace Inst { using namespace x86::Inst; } -// ujit::UniCompiler - Constants -// ============================== +// ujit::UniCompiler - Constants +// ============================= static constexpr OperandSignature signature_of_xmm_ymm_zmm[] = { OperandSignature{RegTraits::kSignature}, @@ -30,30 +32,31 @@ static ASMJIT_INLINE RegType vec_reg_type_from_width(VecWidth vw) noexcept { return RegType(uint32_t(RegType::kVec128) + uint32_t(vw)); } -// ujit::UniCompiler - Construction & Destruction -// =============================================== +// ujit::UniCompiler - Construction & Destruction +// ============================================== -UniCompiler::UniCompiler(BackendCompiler* cc, const CpuFeatures& features, UniOptFlags opt_flags) noexcept +UniCompiler::UniCompiler(BackendCompiler* cc, const CpuFeatures& features, CpuHints cpu_hints, VecConstTableRef ct_ref) noexcept : cc(cc), - ct(vec_const_table), + _ct_ref(ct_ref), _features(features), - _opt_flags(opt_flags), + _cpu_hints(cpu_hints), _vec_reg_count(16), _common_table_offset(128) { _scalar_op_behavior = ScalarOpBehavior::kPreservingVec128; - _fmin_fmax_op_hehavior = FMinFMaxOpBehavior::kTernaryLogic; + _fmin_fmax_op_behavior = FMinFMaxOpBehavior::kTernaryLogic; _fmadd_op_behavior = FMAddOpBehavior::kNoFMA; // Will be changed by _init_extensions() if supported. 
+ _float_to_int_outside_range_behavior = FloatToIntOutsideRangeBehavior::kSmallestValue; _init_extensions(features); } UniCompiler::~UniCompiler() noexcept {} -// ujit::UniCompiler - CPU Architecture, Features and Optimization Options -// ======================================================================== +// ujit::UniCompiler - CPU Architecture, Features and Optimization Options +// ======================================================================= -void UniCompiler::_init_extensions(const asmjit::CpuFeatures& features) noexcept { +void UniCompiler::_init_extensions(const CpuFeatures& features) noexcept { uint32_t gp_ext_mask = 0; uint32_t sse_ext_mask = 0; uint64_t avx_ext_mask = 0; @@ -134,20 +137,20 @@ void UniCompiler::init_vec_width(VecWidth vw) noexcept { bool UniCompiler::has_masked_access_of(uint32_t data_size) const noexcept { switch (data_size) { - case 1: return has_opt_flag(UniOptFlags::kMaskOps8Bit); - case 2: return has_opt_flag(UniOptFlags::kMaskOps16Bit); - case 4: return has_opt_flag(UniOptFlags::kMaskOps32Bit); - case 8: return has_opt_flag(UniOptFlags::kMaskOps64Bit); + case 1: return has_cpu_hint(CpuHints::kVecMaskedOps8); + case 2: return has_cpu_hint(CpuHints::kVecMaskedOps16); + case 4: return has_cpu_hint(CpuHints::kVecMaskedOps32); + case 8: return has_cpu_hint(CpuHints::kVecMaskedOps64); default: return false; } } -// ujit::UniCompiler - Function -// ============================= +// ujit::UniCompiler - Function +// ============================ -void UniCompiler::init_function(asmjit::FuncNode* func_node) noexcept { +void UniCompiler::init_function(FuncNode* func_node) noexcept { cc->add_func(func_node); _func_node = func_node; @@ -164,16 +167,16 @@ void UniCompiler::init_function(asmjit::FuncNode* func_node) noexcept { } } -// ujit::UniCompiler - Constants -// ============================== +// ujit::UniCompiler - Constants +// ============================= void UniCompiler::_init_vec_const_table_ptr() noexcept { - const void* global = &vec_const_table; + const void* ct_addr = ct_ptr(); if (!_common_table_ptr.is_valid()) { ScopedInjector injector(cc, &_func_init); _common_table_ptr = new_gpz("common_table_ptr"); - cc->mov(_common_table_ptr, (int64_t)global + _common_table_offset); + cc->mov(_common_table_ptr, (int64_t)ct_addr + _common_table_offset); } } @@ -183,7 +186,7 @@ x86::KReg UniCompiler::k_const(uint64_t value) noexcept { if (_k_reg[slot].is_valid() && _k_imm[slot] == value) return _k_reg[slot]; - asmjit::BaseNode* prevNode = nullptr; + BaseNode* prevNode = nullptr; Gp tmp; x86::KReg kReg; @@ -213,55 +216,58 @@ x86::KReg UniCompiler::k_const(uint64_t value) noexcept { } Operand UniCompiler::simd_const(const void* c, Bcst bcst_width, VecWidth const_width) noexcept { - size_t constCount = _vec_consts.size(); + size_t const_count = _vec_consts.size(); - for (size_t i = 0; i < constCount; i++) - if (_vec_consts[i].ptr == c) + for (size_t i = 0; i < const_count; i++) { + if (_vec_consts[i].ptr == c) { return Vec(signature_of_xmm_ymm_zmm[size_t(const_width)], _vec_consts[i].virt_reg_id); + } + } // We don't use memory constants when compiling for AVX-512, because we don't store 64-byte constants and AVX-512 // has enough registers to hold all the constants that we need. However, in SSE/AVX2 case, we don't want so many // constants in registers as that could limit registers that we need during fetching and composition. if (!has_avx512()) { - bool useVReg = (c == &ct.p_0000000000000000); // Required if the CPU doesn't have SSE4.1. 
- if (!useVReg) + bool use_vreg = (c == &ct().p_0000000000000000); // Required if the CPU doesn't have SSE4.1. + if (!use_vreg) { return simd_mem_const(c, bcst_width, const_width); + } } - return Vec(signature_of_xmm_ymm_zmm[size_t(const_width)], _new_vecConst(c, bcst_width == Bcst::kNA_Unique).id()); + return Vec(signature_of_xmm_ymm_zmm[size_t(const_width)], _new_vec_const(c, bcst_width == Bcst::kNA_Unique).id()); } Operand UniCompiler::simd_const(const void* c, Bcst bcst_width, const Vec& similar_to) noexcept { - VecWidth const_width = VecWidth(uint32_t(similar_to.reg_type()) - uint32_t(asmjit::RegType::kVec128)); + VecWidth const_width = VecWidth(uint32_t(similar_to.reg_type()) - uint32_t(RegType::kVec128)); return simd_const(c, bcst_width, const_width); } Operand UniCompiler::simd_const(const void* c, Bcst bcst_width, const VecArray& similar_to) noexcept { ASMJIT_ASSERT(!similar_to.is_empty()); - VecWidth const_width = VecWidth(uint32_t(similar_to[0].reg_type()) - uint32_t(asmjit::RegType::kVec128)); + VecWidth const_width = VecWidth(uint32_t(similar_to[0].reg_type()) - uint32_t(RegType::kVec128)); return simd_const(c, bcst_width, const_width); } Vec UniCompiler::simd_vec_const(const void* c, Bcst bcst_width, VecWidth const_width) noexcept { - size_t constCount = _vec_consts.size(); + size_t const_count = _vec_consts.size(); - for (size_t i = 0; i < constCount; i++) + for (size_t i = 0; i < const_count; i++) if (_vec_consts[i].ptr == c) return Vec(signature_of_xmm_ymm_zmm[size_t(const_width)], _vec_consts[i].virt_reg_id); - return Vec(signature_of_xmm_ymm_zmm[size_t(const_width)], _new_vecConst(c, bcst_width == Bcst::kNA_Unique).id()); + return Vec(signature_of_xmm_ymm_zmm[size_t(const_width)], _new_vec_const(c, bcst_width == Bcst::kNA_Unique).id()); } Vec UniCompiler::simd_vec_const(const void* c, Bcst bcst_width, const Vec& similar_to) noexcept { - VecWidth const_width = VecWidth(uint32_t(similar_to.reg_type()) - uint32_t(asmjit::RegType::kVec128)); + VecWidth const_width = VecWidth(uint32_t(similar_to.reg_type()) - uint32_t(RegType::kVec128)); return simd_vec_const(c, bcst_width, const_width); } Vec UniCompiler::simd_vec_const(const void* c, Bcst bcst_width, const VecArray& similar_to) noexcept { ASMJIT_ASSERT(!similar_to.is_empty()); - VecWidth const_width = VecWidth(uint32_t(similar_to[0].reg_type()) - uint32_t(asmjit::RegType::kVec128)); + VecWidth const_width = VecWidth(uint32_t(similar_to[0].reg_type()) - uint32_t(RegType::kVec128)); return simd_vec_const(c, bcst_width, const_width); } @@ -284,22 +290,22 @@ x86::Mem UniCompiler::simd_mem_const(const void* c, Bcst bcst_width, VecWidth co } x86::Mem UniCompiler::simd_mem_const(const void* c, Bcst bcst_width, const Vec& similar_to) noexcept { - VecWidth const_width = VecWidth(uint32_t(similar_to.reg_type()) - uint32_t(asmjit::RegType::kVec128)); + VecWidth const_width = VecWidth(uint32_t(similar_to.reg_type()) - uint32_t(RegType::kVec128)); return simd_mem_const(c, bcst_width, const_width); } x86::Mem UniCompiler::simd_mem_const(const void* c, Bcst bcst_width, const VecArray& similar_to) noexcept { ASMJIT_ASSERT(!similar_to.is_empty()); - VecWidth const_width = VecWidth(uint32_t(similar_to[0].reg_type()) - uint32_t(asmjit::RegType::kVec128)); + VecWidth const_width = VecWidth(uint32_t(similar_to[0].reg_type()) - uint32_t(RegType::kVec128)); return simd_mem_const(c, bcst_width, const_width); } x86::Mem UniCompiler::_get_mem_const(const void* c) noexcept { // Make sure we are addressing a constant from the `commonTable` constant 
pool.
-  const void* global = &vec_const_table;
-  ASMJIT_ASSERT((uintptr_t)c >= (uintptr_t)global &&
-                (uintptr_t)c < (uintptr_t)global + sizeof(VecConstTable));
+  const void* ct_addr = ct_ptr();
+  ASMJIT_ASSERT((uintptr_t)c >= (uintptr_t)ct_addr &&
+                (uintptr_t)c < (uintptr_t)ct_addr + _ct_ref.size);

  if (is_32bit()) {
    // 32-bit mode - These constants will never move in memory so the absolute addressing is a win/win as we can save
@@ -307,34 +313,34 @@ x86::Mem UniCompiler::_get_mem_const(const void* c) noexcept {
    return x86::ptr((uint64_t)c);
  }
  else {
-    // 64-bit mode - One GP register is sacrificed to hold the pointer to the `vec_const_table`. This is probably the
-    // safest approach as relying on absolute addressing or anything else could lead to problems or performance issues.
+    // 64-bit mode - One GP register is sacrificed to hold the pointer to the `ct`. This is probably the safest
+    // approach as relying on absolute addressing or anything else could lead to problems or performance issues.
    _init_vec_const_table_ptr();
-    int32_t disp = int32_t((intptr_t)c - (intptr_t)global);
+    int32_t disp = int32_t((intptr_t)c - (intptr_t)ct_addr);
    return x86::ptr(_common_table_ptr, disp - _common_table_offset);
  }
}

-Vec UniCompiler::_new_vecConst(const void* c, bool is_unique_const) noexcept {
+Vec UniCompiler::_new_vec_const(const void* c, bool is_unique_const) noexcept {
  Vec vec;
  const char* special_const_name = nullptr;

  if (special_const_name) {
-    vec = new_vec(vec_width(), special_const_name);
+    vec = new_vec_with_width(vec_width(), special_const_name);
  }
  else {
    uint64_t u0 = static_cast<const uint64_t*>(c)[0];
    uint64_t u1 = static_cast<const uint64_t*>(c)[1];

    if (u0 != u1)
-      vec = new_vec(vec_width(), "c_0x%016llX%016llX", (unsigned long long)u1, (unsigned long long)u0);
+      vec = new_vec_with_width(vec_width(), "c_0x%016llX%016llX", (unsigned long long)u1, (unsigned long long)u0);
    else if ((u0 >> 32) != (u0 & 0xFFFFFFFFu))
-      vec = new_vec(vec_width(), "c_0x%016llX", (unsigned long long)u0);
+      vec = new_vec_with_width(vec_width(), "c_0x%016llX", (unsigned long long)u0);
    else if (((u0 >> 16) & 0xFFFFu) != (u0 & 0xFFFFu))
-      vec = new_vec(vec_width(), "c_0x%08X", (unsigned)(u0 & 0xFFFFFFFFu));
+      vec = new_vec_with_width(vec_width(), "c_0x%08X", (unsigned)(u0 & 0xFFFFFFFFu));
    else
-      vec = new_vec(vec_width(), "c_0x%04X", (unsigned)(u0 & 0xFFFFu));
  }

  VecConstData const_data;
@@ -342,7 +348,7 @@ Vec UniCompiler::_new_vecConst(const void* c, bool is_unique_const) noexcept {
  const_data.virt_reg_id = vec.id();
  _vec_consts.append(arena(), const_data);

-  if (c == &ct.p_0000000000000000) {
+  if (c == &ct().p_0000000000000000) {
    ScopedInjector inject(cc, &_func_init);
    v_zero_i(vec.xmm());
  }
@@ -363,8 +369,8 @@ Vec UniCompiler::_new_vecConst(const void* c, bool is_unique_const) noexcept {
  return vec;
}

-// ujit::UniCompiler - Stack
-// ==========================
+// ujit::UniCompiler - Stack
+// =========================

x86::Mem UniCompiler::tmp_stack(StackId id, uint32_t size) noexcept {
  ASMJIT_ASSERT(Support::is_power_of_2(size));
@@ -379,8 +385,8 @@ x86::Mem UniCompiler::tmp_stack(StackId id, uint32_t size) noexcept {
  return stack;
}

-// ujit::UniCompiler - Utilities
-// ==============================
+// ujit::UniCompiler - Utilities
+// =============================

void UniCompiler::embed_jump_table(const Label* jump_table, size_t jump_table_size, const Label& jump_table_base, uint32_t entry_size) noexcept {
  static const uint8_t zeros[8] {};
@@ -393,8
+399,8 @@ void UniCompiler::embed_jump_table(const Label* jump_table, size_t jump_table_si } } -// ujit::UniCompiler - General Purpose Instructions - Conditions -// ============================================================== +// ujit::UniCompiler - General Purpose Instructions - Conditions +// ============================================================= static constexpr InstId condition_to_inst_id[size_t(UniOpCond::kMaxValue) + 1] = { Inst::kIdAnd, // UniOpCond::kAssignAnd @@ -408,14 +414,14 @@ static constexpr InstId condition_to_inst_id[size_t(UniOpCond::kMaxValue) + 1] = Inst::kIdCmp // UniOpCond::kCompare }; -class ConditionApplier : public Condition { +class ConditionApplier : public UniCondition { public: - ASMJIT_INLINE ConditionApplier(const Condition& condition) noexcept : Condition(condition) { + ASMJIT_INLINE ConditionApplier(const UniCondition& condition) noexcept : UniCondition(condition) { // The first operand must always be a register. ASMJIT_ASSERT(a.is_gp()); } - ASMJIT_NOINLINE void optimize(UniCompiler* pc) noexcept { + ASMJIT_NOINLINE void optimize(UniCompiler& uc) noexcept { switch (op) { case UniOpCond::kAssignShr: if (b.is_imm() && b.as().value() == 0) { @@ -448,7 +454,7 @@ public: // test on 64-bit hardware as it's guaranteed that any register index is encodable. On 32-bit hardware only the // first 4 registers can be used, which could mean that the register would have to be moved just to be tested, // which is something we would like to avoid. - if (pc->is_64bit() && bit_index < 8) { + if (uc.is_64bit() && bit_index < 8) { op = UniOpCond::kTest; b = Imm(1u << bit_index); cond = cond == CondCode::kC ? CondCode::kNZ : CondCode::kZ; @@ -466,8 +472,8 @@ public: cond = x86::reverse_cond(cond); } - ASMJIT_NOINLINE void emit(UniCompiler* pc) noexcept { - BackendCompiler* cc = pc->cc; + ASMJIT_NOINLINE void emit(UniCompiler& uc) noexcept { + BackendCompiler* cc = uc.cc; InstId inst_id = condition_to_inst_id[size_t(op)]; if (inst_id == Inst::kIdTest && cc->is_64bit()) { @@ -494,8 +500,8 @@ public: } }; -// ujit::UniCompiler - General Purpose Instructions - Emit -// ======================================================== +// ujit::UniCompiler - General Purpose Instructions - Emit +// ======================================================= void UniCompiler::emit_mov(const Gp& dst, const Operand_& src) noexcept { if (src.is_imm() && src.as().value() == 0) { @@ -642,16 +648,16 @@ void UniCompiler::emit_mr(UniOpMR op, const Mem& dst, const Gp& src) noexcept { cc->emit(op_info.inst_id, m, r); } -void UniCompiler::emit_cmov(const Gp& dst, const Operand_& sel, const Condition& condition) noexcept { +void UniCompiler::emit_cmov(const Gp& dst, const Operand_& sel, const UniCondition& condition) noexcept { ConditionApplier ca(condition); - ca.optimize(this); - ca.emit(this); + ca.optimize(*this); + ca.emit(*this); cc->emit(Inst::cmovcc_from_cond(ca.cond), dst, sel); } -void UniCompiler::emit_select(const Gp& dst, const Operand_& sel1_, const Operand_& sel2_, const Condition& condition) noexcept { +void UniCompiler::emit_select(const Gp& dst, const Operand_& sel1_, const Operand_& sel2_, const UniCondition& condition) noexcept { ConditionApplier ca(condition); - ca.optimize(this); + ca.optimize(*this); bool dst_is_a = ca.a.is_reg() && dst.id() == ca.a.as().id(); bool dst_is_b = ca.b.is_reg() && dst.id() == ca.b.as().id(); @@ -674,17 +680,17 @@ void UniCompiler::emit_select(const Gp& dst, const Operand_& sel1_, const Operan if (sel1.is_imm() && sel1.as().value() == 0 && !dst_is_a 
&& !dst_is_b && !dst_is_sel) { cc->xor_(dst, dst); - ca.emit(this); + ca.emit(*this); } else { - ca.emit(this); + ca.emit(*this); if (!dst_is_sel) cc->emit(Inst::kIdMov, dst, sel1); } if (sel2.is_imm()) { int64_t value = sel2.as().value(); - Mem sel2_mem = cc->new_const(asmjit::ConstPoolScope::kLocal, &value, dst.size()); + Mem sel2_mem = cc->new_const(ConstPoolScope::kLocal, &value, dst.size()); sel2 = sel2_mem; } @@ -1045,7 +1051,7 @@ void UniCompiler::emit_3i(UniOpRRR op, const Gp& dst, const Operand_& src1_, con case UniOpRRR::kRol: { if (has_bmi2()) { uint32_t reg_size = dst.size() * 8u; - uint32_t imm = (reg_size - b.value_as()) & asmjit::Support::lsb_mask(reg_size); + uint32_t imm = (reg_size - b.value_as()) & Support::lsb_mask(reg_size); cc->rorx(dst, a, imm); } else { @@ -1479,10 +1485,10 @@ void UniCompiler::emit_j(const Operand_& target) noexcept { cc->emit(Inst::kIdJmp, target); } -void UniCompiler::emit_j_if(const Label& target, const Condition& condition) noexcept { +void UniCompiler::emit_j_if(const Label& target, const UniCondition& condition) noexcept { ConditionApplier ca(condition); - ca.optimize(this); - ca.emit(this); + ca.optimize(*this); + ca.emit(*this); cc->j(ca.cond, target); } @@ -1581,7 +1587,7 @@ void UniCompiler::add_ext(const Gp& dst, const Gp& src_, const Gp& idx_, uint32_ case 2: case 4: case 8: - lea(dst, x86::ptr(src, idx, asmjit::Support::ctz(scale), disp)); + lea(dst, x86::ptr(src, idx, Support::ctz(scale), disp)); return; default: @@ -1608,20 +1614,20 @@ void UniCompiler::lea(const Gp& dst, const Mem& src) noexcept { Mem m(src); if (is_64bit() && dst.size() == 4) { - if (m.base_type() == asmjit::RegType::kGp32) { - m.set_base_type(asmjit::RegType::kGp64); + if (m.base_type() == RegType::kGp32) { + m.set_base_type(RegType::kGp64); } - if (m.index_type() == asmjit::RegType::kGp32) { - m.set_index_type(asmjit::RegType::kGp64); + if (m.index_type() == RegType::kGp32) { + m.set_index_type(RegType::kGp64); } } cc->lea(dst, m); } -// ujit::UniCompiler - Vector Instructions - Constants -// ==================================================== +// ujit::UniCompiler - Vector Instructions - Constants +// =================================================== //! Floating point mode is used in places that are generic and implement various functionality that needs more //! than a single instruction. Typically implementing either higher level concepts or missing functionality. 
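// Illustrative helper for the UniOpRRR::kRol-with-immediate path above: BMI2 provides
// RORX (a non-destructive rotate right) but no rotate-left form, so a left rotate by n
// is emitted as a right rotate by (reg_size - n) modulo reg_size. A scalar sketch of
// the identity (not part of the emitter):
#include <cstdint>
static inline uint32_t rotl32(uint32_t x, uint32_t n) noexcept {
  n &= 31u;
  return (x << n) | (x >> ((32u - n) & 31u));
}
static inline uint32_t rotl32_via_rotr(uint32_t x, uint32_t n) noexcept {
  uint32_t r = (32u - n) & 31u;                   // the immediate that would be passed to RORX
  return (x >> r) | (x << ((32u - r) & 31u));     // rotate right by r == rotate left by n
}
// For any x and n, rotl32(x, n) == rotl32_via_rotr(x, n).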
@@ -1692,8 +1698,17 @@ enum class NarrowingMode : uint32_t { kSaturateUToU }; -// ujit::UniCompiler - Vector Instructions - Broadcast / Shuffle Data -// =================================================================== +[[maybe_unused]] +static ASMJIT_INLINE bool is_scalar_fp_op(FloatMode fm) noexcept { return fm <= kF64S; } + +[[maybe_unused]] +static ASMJIT_INLINE bool is_f32_op(FloatMode fm) noexcept { return fm == kF32S || fm == kF32V; } + +[[maybe_unused]] +static ASMJIT_INLINE bool is_f64_op(FloatMode fm) noexcept { return fm == kF64S || fm == kF64V; } + +// ujit::UniCompiler - Vector Instructions - Broadcast / Shuffle Data +// ================================================================== static constexpr uint16_t avx512_vinsert_128[] = { Inst::kIdVinserti32x4, @@ -1709,8 +1724,8 @@ static constexpr uint16_t avx512_vshuf_128[] = { Inst::kIdVshuff64x2 }; -// ujit::UniCompiler - Vector Instructions - Integer Cmp/Min/Max Data -// =================================================================== +// ujit::UniCompiler - Vector Instructions - Integer Cmp/Min/Max Data +// ================================================================== struct CmpMinMaxInst { uint16_t peq; @@ -1741,8 +1756,8 @@ static constexpr CmpMinMaxInst avx_cmp_min_max[] = { { Inst::kIdVpcmpeqq, Inst::kIdVpcmpgtq, Inst::kIdVpminuq, Inst::kIdVpmaxuq }, }; -// ujit::UniCompiler - Vector Instructions - Integer Conversion Data -// ================================================================== +// ujit::UniCompiler - Vector Instructions - Integer Conversion Data +// ================================================================= struct WideningOpInfo { uint32_t mov : 16; @@ -1773,12 +1788,13 @@ static constexpr WideningOpInfo sse_int_widening_op_info[] = { { Inst::kIdPmovzxdq , Inst::kIdPunpckldq , Inst::kIdPunpckhdq , 0, 0 } // kU32ToU64. 
}; -// ujit::UniCompiler - Vector Instructions - Float Instruction Data -// ================================================================= +// ujit::UniCompiler - Vector Instructions - Float Instruction Data +// ================================================================ struct FloatInst { uint16_t fmovs; - uint16_t fmov; + uint16_t fmova; + uint16_t fmovu; uint16_t fand; uint16_t for_; uint16_t fxor; @@ -1791,6 +1807,7 @@ struct FloatInst { uint16_t fmax; uint16_t fcmp; uint16_t fround; + uint16_t frndscale; uint16_t psrl; uint16_t psll; }; @@ -1799,6 +1816,7 @@ static constexpr FloatInst sse_float_inst[4] = { { Inst::kIdMovss, Inst::kIdMovaps, + Inst::kIdMovups, Inst::kIdAndps, Inst::kIdOrps, Inst::kIdXorps, @@ -1811,12 +1829,14 @@ static constexpr FloatInst sse_float_inst[4] = { Inst::kIdMaxss, Inst::kIdCmpss, Inst::kIdRoundss, + Inst::kIdNone, Inst::kIdPsrld, Inst::kIdPslld }, { Inst::kIdMovsd, Inst::kIdMovaps, + Inst::kIdMovups, Inst::kIdAndpd, Inst::kIdOrpd, Inst::kIdXorpd, @@ -1829,12 +1849,14 @@ static constexpr FloatInst sse_float_inst[4] = { Inst::kIdMaxsd, Inst::kIdCmpsd, Inst::kIdRoundsd, + Inst::kIdNone, Inst::kIdPsrlq, Inst::kIdPsllq }, { Inst::kIdMovaps, Inst::kIdMovaps, + Inst::kIdMovups, Inst::kIdAndps, Inst::kIdOrps, Inst::kIdXorps, @@ -1847,12 +1869,14 @@ static constexpr FloatInst sse_float_inst[4] = { Inst::kIdMaxps, Inst::kIdCmpps, Inst::kIdRoundps, + Inst::kIdNone, Inst::kIdPsrld, Inst::kIdPslld }, { Inst::kIdMovaps, Inst::kIdMovaps, + Inst::kIdMovups, Inst::kIdAndpd, Inst::kIdOrpd, Inst::kIdXorpd, @@ -1865,6 +1889,7 @@ static constexpr FloatInst sse_float_inst[4] = { Inst::kIdMaxpd, Inst::kIdCmppd, Inst::kIdRoundpd, + Inst::kIdNone, Inst::kIdPsrlq, Inst::kIdPsllq } @@ -1874,6 +1899,7 @@ static constexpr FloatInst avx_float_inst[4] = { { Inst::kIdVmovss, Inst::kIdVmovaps, + Inst::kIdVmovups, Inst::kIdVandps, Inst::kIdVorps, Inst::kIdVxorps, @@ -1886,12 +1912,14 @@ static constexpr FloatInst avx_float_inst[4] = { Inst::kIdVmaxss, Inst::kIdVcmpss, Inst::kIdVroundss, + Inst::kIdVrndscaless, Inst::kIdVpsrld, Inst::kIdVpslld }, { Inst::kIdVmovsd, Inst::kIdVmovaps, + Inst::kIdVmovups, Inst::kIdVandpd, Inst::kIdVorpd, Inst::kIdVxorpd, @@ -1904,12 +1932,14 @@ static constexpr FloatInst avx_float_inst[4] = { Inst::kIdVmaxsd, Inst::kIdVcmpsd, Inst::kIdVroundsd, + Inst::kIdVrndscalesd, Inst::kIdVpsrlq, Inst::kIdVpsllq }, { Inst::kIdVmovaps, Inst::kIdVmovaps, + Inst::kIdVmovups, Inst::kIdVandps, Inst::kIdVorps, Inst::kIdVxorps, @@ -1922,12 +1952,14 @@ static constexpr FloatInst avx_float_inst[4] = { Inst::kIdVmaxps, Inst::kIdVcmpps, Inst::kIdVroundps, + Inst::kIdVrndscaleps, Inst::kIdVpsrld, Inst::kIdVpslld }, { Inst::kIdVmovaps, Inst::kIdVmovaps, + Inst::kIdVmovups, Inst::kIdVandpd, Inst::kIdVorpd, Inst::kIdVxorpd, @@ -1940,13 +1972,14 @@ static constexpr FloatInst avx_float_inst[4] = { Inst::kIdVmaxpd, Inst::kIdVcmppd, Inst::kIdVroundpd, + Inst::kIdVrndscalepd, Inst::kIdVpsrlq, Inst::kIdVpsllq } }; -// ujit::UniCompiler - Vector Instructions - UniOp Information -// ============================================================= +// ujit::UniCompiler - Vector Instructions - UniOp Information +// =========================================================== struct UniOpVInfo { //! \name Members @@ -2029,8 +2062,12 @@ static constexpr UniOpVInfo opcode_info_2v[size_t(UniOpVV::kMaxValue) + 1] = { DEFINE_OP(kIdPmovsxdq , 0, kIntrin, kIdVpmovsxdq , kIntrin , 0, 0, kNone, 0, 0x00u, kNone, k64, 0, kNA), // kCvtI32HiToI64. 
DEFINE_OP(kIdPmovzxdq , 0, kIntrin, kIdVpmovzxdq , kIntrin , 0, 0, kNone, 0, 0x00u, kNone, k64, 0, kNA), // kCvtU32LoToU64. DEFINE_OP(kIdPmovzxdq , 0, kIntrin, kIdVpmovzxdq , kIntrin , 0, 0, kNone, 0, 0x00u, kNone, k64, 0, kNA), // kCvtU32HiToU64. + DEFINE_OP(kIdAndps , 0, kIntrin, kIdVandps , kIntrin , 0, 0, kNone, 0, 0x00u, kF32S, k32, 4, kNA), // kAbsF32S. + DEFINE_OP(kIdAndpd , 0, kIntrin, kIdVandpd , kIntrin , 0, 0, kNone, 0, 0x00u, kF64S, k64, 8, kNA), // kAbsF64S. DEFINE_OP(kIdAndps , 0, kIntrin, kIdVandps , kIntrin , 0, 0, kNone, 0, 0x00u, kF32V, k32, 4, kNA), // kAbsF32. DEFINE_OP(kIdAndpd , 0, kIntrin, kIdVandpd , kIntrin , 0, 0, kNone, 0, 0x00u, kF64V, k64, 8, kNA), // kAbsF64. + DEFINE_OP(kIdXorps , 0, kIntrin, kIdVxorps , kIntrin , 0, 0, kNone, 0, 0x00u, kF32S, k32, 4, kNA), // kNegF32S. + DEFINE_OP(kIdXorpd , 0, kIntrin, kIdVxorpd , kIntrin , 0, 0, kNone, 0, 0x00u, kF64S, k64, 8, kNA), // kNegF64S. DEFINE_OP(kIdXorps , 0, kIntrin, kIdVxorps , kIntrin , 0, 0, kNone, 0, 0x00u, kF32V, k32, 4, kNA), // kNegF32. DEFINE_OP(kIdXorpd , 0, kIntrin, kIdVxorpd , kIntrin , 0, 0, kNone, 0, 0x00u, kF64V, k64, 8, kNA), // kNegF64. DEFINE_OP(kIdNone , 0, kIntrin, kIdNone , kIntrin , 0, 0, kNone, 0, 0x00u, kNone, k32, 4, kNA), // kAbsU32. @@ -2047,10 +2084,18 @@ static constexpr UniOpVInfo opcode_info_2v[size_t(UniOpVV::kMaxValue) + 1] = { DEFINE_OP(kIdRoundsd , 2, kIntrin, kIdVroundsd , kIntrin , 0, 0, kNone, 1, 0x0Au, kF64S, k64, 8, kNA), // kCeilF64S. DEFINE_OP(kIdRoundps , 2, kIntrin, kIdVroundps , kIntrin , 0, 0, kNone, 1, 0x0Au, kF32V, k32, 4, kNA), // kCeilF32. DEFINE_OP(kIdRoundpd , 2, kIntrin, kIdVroundpd , kIntrin , 0, 0, kNone, 1, 0x0Au, kF64V, k64, 8, kNA), // kCeilF64. - DEFINE_OP(kIdRoundss , 2, kIntrin, kIdVroundss , kIntrin , 0, 0, kNone, 1, 0x08u, kF32S, k32, 4, kNA), // kRoundF32S. - DEFINE_OP(kIdRoundsd , 2, kIntrin, kIdVroundsd , kIntrin , 0, 0, kNone, 1, 0x08u, kF64S, k64, 8, kNA), // kRoundF64S. - DEFINE_OP(kIdRoundps , 2, kIntrin, kIdVroundps , kIntrin , 0, 0, kNone, 1, 0x08u, kF32V, k32, 4, kNA), // kRoundF32. - DEFINE_OP(kIdRoundpd , 2, kIntrin, kIdVroundpd , kIntrin , 0, 0, kNone, 1, 0x08u, kF64V, k64, 8, kNA), // kRoundF64. + DEFINE_OP(kIdRoundss , 2, kIntrin, kIdVroundss , kIntrin , 0, 0, kNone, 1, 0x08u, kF32S, k32, 4, kNA), // kRoundEvenF32S. + DEFINE_OP(kIdRoundsd , 2, kIntrin, kIdVroundsd , kIntrin , 0, 0, kNone, 1, 0x08u, kF64S, k64, 8, kNA), // kRoundEvenF64S. + DEFINE_OP(kIdRoundps , 2, kIntrin, kIdVroundps , kIntrin , 0, 0, kNone, 1, 0x08u, kF32V, k32, 4, kNA), // kRoundEvenF32. + DEFINE_OP(kIdRoundpd , 2, kIntrin, kIdVroundpd , kIntrin , 0, 0, kNone, 1, 0x08u, kF64V, k64, 8, kNA), // kRoundEvenF64. + DEFINE_OP(kIdRoundss , 2, kIntrin, kIdVroundss , kIntrin , 0, 0, kNone, 1, 0x0Bu, kF32S, k32, 4, kNA), // kRoundHalfAwayF32S. + DEFINE_OP(kIdRoundsd , 2, kIntrin, kIdVroundsd , kIntrin , 0, 0, kNone, 1, 0x0Bu, kF64S, k64, 8, kNA), // kRoundHalfAwayF64S. + DEFINE_OP(kIdRoundps , 2, kIntrin, kIdVroundps , kIntrin , 0, 0, kNone, 1, 0x0Bu, kF32V, k32, 4, kNA), // kRoundHalfAwayF32. + DEFINE_OP(kIdRoundpd , 2, kIntrin, kIdVroundpd , kIntrin , 0, 0, kNone, 1, 0x0Bu, kF64V, k64, 8, kNA), // kRoundHalfAwayF64. + DEFINE_OP(kIdRoundss , 2, kIntrin, kIdVroundss , kIntrin , 0, 0, kNone, 1, 0x09u, kF32S, k32, 4, kNA), // kRoundHalfUpF32S. + DEFINE_OP(kIdRoundsd , 2, kIntrin, kIdVroundsd , kIntrin , 0, 0, kNone, 1, 0x09u, kF64S, k64, 8, kNA), // kRoundHalfUpF64S. 
+ DEFINE_OP(kIdRoundps , 2, kIntrin, kIdVroundps , kIntrin , 0, 0, kNone, 1, 0x09u, kF32V, k32, 4, kNA), // kRoundHalfUpF32. + DEFINE_OP(kIdRoundpd , 2, kIntrin, kIdVroundpd , kIntrin , 0, 0, kNone, 1, 0x09u, kF64V, k64, 8, kNA), // kRoundHalfUpF64. DEFINE_OP(kIdNone , 0, kIntrin, kIdNone , kIntrin , 0, 0, kNone, 0, 0x00u, kNone, k32, 4, kNA), // kRcpF32. DEFINE_OP(kIdNone , 0, kIntrin, kIdNone , kIntrin , 0, 0, kNone, 0, 0x00u, kNone, k64, 8, kNA), // kRcpF64. DEFINE_OP(kIdSqrtss , 2, kIntrin, kIdVsqrtss , kIntrin , 0, 0, kNone, 0, 0x00u, kF32S, k32, 4, kNA), // kSqrtF32S. @@ -2239,6 +2284,10 @@ static constexpr UniOpVInfo opcode_info_3v[size_t(UniOpVVV::kMaxValue) + 1] = { DEFINE_OP(kIdDivsd , 2, kSSE2 , kIdVdivsd , kAVX , 0, 0, kNone, 0, 0x00u, kF64S, k64, 8, kNA), // kDivF64S. DEFINE_OP(kIdDivps , 2, kSSE2 , kIdVdivps , kAVX , 0, 0, kNone, 0, 0x00u, kF32V, k32, 4, kNA), // kDivF32. DEFINE_OP(kIdDivpd , 2, kSSE2 , kIdVdivpd , kAVX , 0, 0, kNone, 0, 0x00u, kF64V, k64, 8, kNA), // kDivF64. + DEFINE_OP(kIdNone , 0, kIntrin, kIdNone , kIntrin , 0, 0, kNone, 0, 0x00u, kF32S, k32, 4, kNA), // kModF32S. + DEFINE_OP(kIdNone , 0, kIntrin, kIdNone , kIntrin , 0, 0, kNone, 0, 0x00u, kF64S, k64, 8, kNA), // kModF64S. + DEFINE_OP(kIdNone , 0, kIntrin, kIdNone , kIntrin , 0, 0, kNone, 0, 0x00u, kF32V, k32, 4, kNA), // kModF32. + DEFINE_OP(kIdNone , 0, kIntrin, kIdNone , kIntrin , 0, 0, kNone, 0, 0x00u, kF64V, k64, 8, kNA), // kModF64. DEFINE_OP(kIdMinss , 2, kSSE2 , kIdVminss , kAVX , 0, 0, kSrc , 0, 0x00u, kF32S, k32, 4, kNA), // kMinF32S. DEFINE_OP(kIdMinsd , 2, kSSE2 , kIdVminsd , kAVX , 0, 0, kSrc , 0, 0x00u, kF64S, k64, 8, kNA), // kMinF64S. DEFINE_OP(kIdMinps , 2, kSSE2 , kIdVminps , kAVX , 0, 0, kSrc , 0, 0x00u, kF32V, k32, 4, kNA), // kMinF32. @@ -2537,11 +2586,11 @@ static constexpr UniOpVMInfo opcode_info_2mv[size_t(UniOpMV::kMaxValue) + 1] = { #undef DEFINE_OP -// ujit::UniCompiler - Vector Instructions - Utility Functions -// ============================================================ +// ujit::UniCompiler - Vector Instructions - Utility Functions +// =========================================================== -static ASMJIT_NOINLINE void UniCompiler_loadInto(UniCompiler* pc, const Vec& vec, const Mem& mem, uint32_t broadcast_size = 0) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void UniCompiler_load_into(UniCompiler& uc, const Vec& vec, const Mem& mem, uint32_t broadcast_size = 0) noexcept { + BackendCompiler* cc = uc.cc; Mem m(mem); if (mem.has_broadcast() && broadcast_size) { @@ -2559,7 +2608,7 @@ static ASMJIT_NOINLINE void UniCompiler_loadInto(UniCompiler* pc, const Vec& vec m.set_size(vec.size()); if (vec.is_vec512()) cc->vmovdqu32(vec, m); - else if (pc->has_avx()) + else if (uc.has_avx()) cc->vmovdqu(vec, m); else cc->movdqu(vec, m); @@ -2568,24 +2617,24 @@ static ASMJIT_NOINLINE void UniCompiler_loadInto(UniCompiler* pc, const Vec& vec // TODO: Unused for now... 
[[maybe_unused]] -static ASMJIT_NOINLINE void UniCompiler_moveToDst(UniCompiler* pc, const Vec& dst, const Operand_& src, uint32_t broadcast_size = 0) noexcept { +static ASMJIT_NOINLINE void UniCompiler_move_to_dst(UniCompiler& uc, const Vec& dst, const Operand_& src, uint32_t broadcast_size = 0) noexcept { if (src.is_reg()) { ASMJIT_ASSERT(src.is_vec()); if (dst.id() != src.as().id()) { - pc->v_mov(dst, src); + uc.v_mov(dst, src); } } else if (src.is_mem()) { - UniCompiler_loadInto(pc, dst, src.as(), broadcast_size); + UniCompiler_load_into(uc, dst, src.as(), broadcast_size); } else { ASMJIT_NOT_REACHED(); } } -static ASMJIT_NOINLINE Vec UniCompiler_loadNew(UniCompiler* pc, const Vec& ref, const Mem& mem, uint32_t broadcast_size = 0) noexcept { - Vec vec = pc->new_similar_reg(ref, "@vec_m"); - UniCompiler_loadInto(pc, vec, mem, broadcast_size); +static ASMJIT_NOINLINE Vec UniCompiler_load_new(UniCompiler& uc, const Vec& ref, const Mem& mem, uint32_t broadcast_size = 0) noexcept { + Vec vec = uc.new_similar_reg(ref, "@vec_m"); + UniCompiler_load_into(uc, vec, mem, broadcast_size); return vec; } @@ -2593,35 +2642,75 @@ static ASMJIT_INLINE bool is_same_vec(const Vec& a, const Operand_& b) noexcept return b.is_reg() && a.id() == b.as().id(); } -static ASMJIT_NOINLINE void sse_mov(UniCompiler* pc, const Vec& dst, const Operand_& src) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_INLINE Operand get_fop_one(UniCompiler& uc, const Vec& dst, FloatMode fm) noexcept { + Operand op; + if (is_f32_op(fm)) + op = uc.simd_const(&uc.ct().f32_1, Bcst::k32, dst); + else + op = uc.simd_const(&uc.ct().f64_1, Bcst::k64, dst); + return op; +} + +static ASMJIT_INLINE Operand get_fop_half_minus_1ulp(UniCompiler& uc, const Vec& dst, FloatMode fm) noexcept { + Operand op; + if (is_f32_op(fm)) + op = uc.simd_const(&uc.ct().f32_0_5_minus_1ulp, Bcst::k32, dst); + else + op = uc.simd_const(&uc.ct().f64_0_5_minus_1ulp, Bcst::k64, dst); + return op; +} + +static ASMJIT_INLINE Operand get_fop_round_magic(UniCompiler& uc, const Vec& dst, FloatMode fm) noexcept { + Operand op; + if (is_f32_op(fm)) + op = uc.simd_const(&uc.ct().f32_round_magic, Bcst::k32, dst); + else + op = uc.simd_const(&uc.ct().f64_round_magic, Bcst::k64, dst); + return op; +} + +static ASMJIT_INLINE Operand get_fop_msb_bit(UniCompiler& uc, const Vec& dst, FloatMode fm) noexcept { + Operand op; + if (is_f32_op(fm)) + op = uc.simd_const(&uc.ct().p_8000000080000000, Bcst::k32, dst); + else + op = uc.simd_const(&uc.ct().p_8000000000000000, Bcst::k64, dst); + return op; +} + +static ASMJIT_NOINLINE void sse_mov(UniCompiler& uc, const Vec& dst, const Operand_& src) noexcept { + BackendCompiler* cc = uc.cc; if (src.is_mem()) cc->emit(Inst::kIdMovups, dst, src); else if (dst.id() != src.id()) cc->emit(Inst::kIdMovaps, dst, src); } -static ASMJIT_NOINLINE void sse_fmov(UniCompiler* pc, const Vec& dst, const Operand_& src, FloatMode fm) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void sse_fmov(UniCompiler& uc, const Vec& dst, const Operand_& src, FloatMode fm) noexcept { + BackendCompiler* cc = uc.cc; if (src.is_reg()) { - if (dst.id() != src.id()) + if (dst.id() != src.id()) { cc->emit(Inst::kIdMovaps, dst, src); + } + } + else if (is_scalar_fp_op(fm)) { + cc->emit(sse_float_inst[size_t(fm)].fmovs, dst, src); } else { - cc->emit(sse_float_inst[size_t(fm)].fmovs, dst, src); + cc->emit(sse_float_inst[size_t(fm)].fmovu, dst, src); } } -static ASMJIT_NOINLINE Vec sse_copy(UniCompiler* pc, const Vec& vec, const char* name) noexcept 
{ - Vec copy = pc->new_similar_reg(vec, name); - pc->cc->emit(Inst::kIdMovaps, copy, vec); +static ASMJIT_NOINLINE Vec sse_copy(UniCompiler& uc, const Vec& vec, const char* name) noexcept { + Vec copy = uc.new_similar_reg(vec, name); + uc.cc->emit(Inst::kIdMovaps, copy, vec); return copy; } -static ASMJIT_NOINLINE void sse_make_vec(UniCompiler* pc, Operand_& op, const char* name) noexcept { +static ASMJIT_NOINLINE void sse_make_vec(UniCompiler& uc, Operand_& op, const char* name) noexcept { if (op.is_mem()) { - Vec tmp = pc->new_vec128(name); - sse_mov(pc, tmp, op); + Vec tmp = uc.new_vec128(name); + sse_mov(uc, tmp, op); op = tmp; } } @@ -2645,59 +2734,59 @@ static ASMJIT_INLINE uint32_t shuf_imm4_from_swizzle(Swizzle2 s) noexcept { return x86::shuffle_imm(imm1 * 2u + 1u, imm1 * 2u, imm0 * 2u + 1u, imm0 * 2u); } -static ASMJIT_NOINLINE void sse_bit_not(UniCompiler* pc, const Vec& dst, const Operand_& src) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void sse_bit_not(UniCompiler& uc, const Vec& dst, const Operand_& src) noexcept { + BackendCompiler* cc = uc.cc; - sse_mov(pc, dst, src); - Operand ones = pc->simd_const(&pc->ct.p_FFFFFFFFFFFFFFFF, Bcst::k32, dst); + sse_mov(uc, dst, src); + Operand ones = uc.simd_const(&uc.ct().p_FFFFFFFFFFFFFFFF, Bcst::k32, dst); cc->emit(Inst::kIdPxor, dst, ones); } -static ASMJIT_NOINLINE void sse_msb_flip(UniCompiler* pc, const Vec& dst, const Operand_& src, ElementSize sz) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void sse_msb_flip(UniCompiler& uc, const Vec& dst, const Operand_& src, ElementSize sz) noexcept { + BackendCompiler* cc = uc.cc; const void* msk_data {}; switch (sz) { - case ElementSize::k8 : msk_data = &pc->ct.p_8080808080808080; break; - case ElementSize::k16: msk_data = &pc->ct.p_8000800080008000; break; - case ElementSize::k32: msk_data = &pc->ct.p_8000000080000000; break; - case ElementSize::k64: msk_data = &pc->ct.p_8000000000000000; break; + case ElementSize::k8 : msk_data = &uc.ct().p_8080808080808080; break; + case ElementSize::k16: msk_data = &uc.ct().p_8000800080008000; break; + case ElementSize::k32: msk_data = &uc.ct().p_8000000080000000; break; + case ElementSize::k64: msk_data = &uc.ct().p_8000000000000000; break; default: ASMJIT_NOT_REACHED(); } - Operand msk = pc->simd_const(msk_data, Bcst::kNA, dst); - sse_mov(pc, dst, src); + Operand msk = uc.simd_const(msk_data, Bcst::kNA, dst); + sse_mov(uc, dst, src); cc->emit(Inst::kIdPxor, dst, msk); } -static ASMJIT_NOINLINE void sse_fsign_flip(UniCompiler* pc, const Vec& dst, const Operand_& src, FloatMode fm) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void sse_fsign_flip(UniCompiler& uc, const Vec& dst, const Operand_& src, FloatMode fm) noexcept { + BackendCompiler* cc = uc.cc; const FloatInst& fi = sse_float_inst[size_t(fm)]; Operand msk; switch (fm) { - case FloatMode::kF32S: msk = pc->simd_const(&pc->ct.sign32_scalar, Bcst::k32, dst); break; - case FloatMode::kF64S: msk = pc->simd_const(&pc->ct.sign64_scalar, Bcst::k64, dst); break; - case FloatMode::kF32V: msk = pc->simd_const(&pc->ct.p_8000000080000000, Bcst::k32, dst); break; - case FloatMode::kF64V: msk = pc->simd_const(&pc->ct.p_8000000000000000, Bcst::k64, dst); break; + case FloatMode::kF32S: msk = uc.simd_const(&uc.ct().sign32_scalar, Bcst::k32, dst); break; + case FloatMode::kF64S: msk = uc.simd_const(&uc.ct().sign64_scalar, Bcst::k64, dst); break; + case FloatMode::kF32V: msk = uc.simd_const(&uc.ct().p_8000000080000000, Bcst::k32, dst); break; + 
case FloatMode::kF64V: msk = uc.simd_const(&uc.ct().p_8000000000000000, Bcst::k64, dst); break; default: ASMJIT_NOT_REACHED(); } - sse_fmov(pc, dst, src, fm); + sse_fmov(uc, dst, src, fm); cc->emit(fi.fxor, dst, msk); } // Possibly the best solution: // https://stackoverflow.com/questions/65166174/how-to-simulate-pcmpgtq-on-sse2 -static ASMJIT_NOINLINE void sse_cmp_gt_i64(UniCompiler* pc, const Vec& dst, const Operand_& a, const Operand_& b) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void sse_cmp_gt_i64(UniCompiler& uc, const Vec& dst, const Operand_& a, const Operand_& b) noexcept { + BackendCompiler* cc = uc.cc; - if (pc->has_sse4_2()) { + if (uc.has_sse4_2()) { if (is_same_vec(dst, a)) { cc->emit(Inst::kIdPcmpgtq, dst, b); } @@ -2705,9 +2794,9 @@ static ASMJIT_NOINLINE void sse_cmp_gt_i64(UniCompiler* pc, const Vec& dst, cons Operand_ second = b; if (is_same_vec(dst, b)) { second = cc->new_similar_reg(dst, "@tmp"); - sse_mov(pc, second.as(), b); + sse_mov(uc, second.as(), b); } - sse_mov(pc, dst, a); + sse_mov(uc, dst, a); cc->emit(Inst::kIdPcmpgtq, dst, second); } } @@ -2722,13 +2811,13 @@ static ASMJIT_NOINLINE void sse_cmp_gt_i64(UniCompiler* pc, const Vec& dst, cons cc->emit(Inst::kIdPand, tmp1, tmp2); if (!is_same_vec(dst, b)) { - sse_mov(pc, dst, a); + sse_mov(uc, dst, a); cc->emit(Inst::kIdPcmpgtd, dst, b); cc->emit(Inst::kIdPor, dst, tmp1); cc->emit(Inst::kIdPshufd, dst, dst, x86::shuffle_imm(3, 3, 1, 1)); } else { - sse_mov(pc, tmp2, a); + sse_mov(uc, tmp2, a); cc->emit(Inst::kIdPcmpgtd, tmp2, b); cc->emit(Inst::kIdPor, tmp2, tmp1); cc->emit(Inst::kIdPshufd, dst, tmp2, x86::shuffle_imm(3, 3, 1, 1)); @@ -2738,22 +2827,22 @@ static ASMJIT_NOINLINE void sse_cmp_gt_i64(UniCompiler* pc, const Vec& dst, cons // Possibly the best solution: // https://stackoverflow.com/questions/65441496/what-is-the-most-efficient-way-to-do-unsigned-64-bit-comparison-on-sse2 -static ASMJIT_NOINLINE void sse_cmp_gt_u64(UniCompiler* pc, const Vec& dst, const Operand_& a, const Operand_& b) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void sse_cmp_gt_u64(UniCompiler& uc, const Vec& dst, const Operand_& a, const Operand_& b) noexcept { + BackendCompiler* cc = uc.cc; - if (pc->has_sse4_2()) { - Operand msk = pc->simd_const(&pc->ct.p_8000000000000000, Bcst::k64, dst); + if (uc.has_sse4_2()) { + Operand msk = uc.simd_const(&uc.ct().p_8000000000000000, Bcst::k64, dst); Vec tmp = cc->new_similar_reg(dst, "@tmp"); if (is_same_vec(dst, a)) { - sse_mov(pc, tmp, msk); + sse_mov(uc, tmp, msk); cc->emit(Inst::kIdPxor, dst, tmp); cc->emit(Inst::kIdPxor, tmp, b); cc->emit(Inst::kIdPcmpgtq, dst, tmp); } else { - sse_mov(pc, tmp, b); - sse_mov(pc, dst, a); + sse_mov(uc, tmp, b); + sse_mov(uc, dst, a); cc->emit(Inst::kIdPxor, dst, msk); cc->emit(Inst::kIdPxor, tmp, msk); cc->emit(Inst::kIdPcmpgtq, dst, tmp); @@ -2764,8 +2853,8 @@ static ASMJIT_NOINLINE void sse_cmp_gt_u64(UniCompiler* pc, const Vec& dst, cons Vec tmp2 = cc->new_similar_reg(dst, "@tmp2"); Vec tmp3 = cc->new_similar_reg(dst, "@tmp3"); - sse_mov(pc, tmp1, b); // tmp1 = b; - sse_mov(pc, tmp2, a); // tmp2 = a; + sse_mov(uc, tmp1, b); // tmp1 = b; + sse_mov(uc, tmp2, a); // tmp2 = a; cc->emit(Inst::kIdMovaps, tmp3, tmp1); // tmp3 = b; cc->emit(Inst::kIdPsubq, tmp3, tmp2); // tmp3 = b - a cc->emit(Inst::kIdPxor, tmp2, tmp1); // tmp2 = b ^ a @@ -2777,26 +2866,26 @@ static ASMJIT_NOINLINE void sse_cmp_gt_u64(UniCompiler* pc, const Vec& dst, cons } } -static ASMJIT_NOINLINE void sse_select(UniCompiler* pc, const Vec& 
dst, const Vec& a, const Operand_& b, const Vec& msk) noexcept { - BackendCompiler* cc = pc->cc; - sse_mov(pc, dst, a); +static ASMJIT_NOINLINE void sse_select(UniCompiler& uc, const Vec& dst, const Vec& a, const Operand_& b, const Vec& msk) noexcept { + BackendCompiler* cc = uc.cc; + sse_mov(uc, dst, a); cc->emit(Inst::kIdPand, dst, msk); cc->emit(Inst::kIdPandn, msk, b); cc->emit(Inst::kIdPor, dst, msk); } -static ASMJIT_NOINLINE void sse_int_widen(UniCompiler* pc, const Vec& dst, const Vec& src, WideningOp cvt) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void sse_int_widen(UniCompiler& uc, const Vec& dst, const Vec& src, WideningOp cvt) noexcept { + BackendCompiler* cc = uc.cc; WideningOpInfo cvt_info = sse_int_widening_op_info[size_t(cvt)]; - if (pc->has_sse4_1()) { + if (uc.has_sse4_1()) { cc->emit(cvt_info.mov, dst, src); return; } if (!cvt_info.sign_extends && cvt_info.unpack_lo != Inst::kIdNone) { - Operand zero = pc->simd_const(&pc->ct.p_0000000000000000, Bcst::kNA, dst); - sse_mov(pc, dst, src); + Operand zero = uc.simd_const(&uc.ct().p_0000000000000000, Bcst::kNA, dst); + sse_mov(uc, dst, src); cc->emit(cvt_info.unpack_lo, dst, zero); return; } @@ -2816,8 +2905,8 @@ static ASMJIT_NOINLINE void sse_int_widen(UniCompiler* pc, const Vec& dst, const } case WideningOp::kU8ToU32: { - Operand zero = pc->simd_const(&pc->ct.p_0000000000000000, Bcst::kNA, dst); - sse_mov(pc, dst, src); + Operand zero = uc.simd_const(&uc.ct().p_0000000000000000, Bcst::kNA, dst); + sse_mov(uc, dst, src); cc->emit(Inst::kIdPunpcklbw, dst, zero); cc->emit(Inst::kIdPunpcklwd, dst, zero); @@ -2825,8 +2914,8 @@ static ASMJIT_NOINLINE void sse_int_widen(UniCompiler* pc, const Vec& dst, const } case WideningOp::kU8ToU64: { - Operand zero = pc->simd_const(&pc->ct.p_0000000000000000, Bcst::kNA, dst); - sse_mov(pc, dst, src); + Operand zero = uc.simd_const(&uc.ct().p_0000000000000000, Bcst::kNA, dst); + sse_mov(uc, dst, src); cc->emit(Inst::kIdPunpcklbw, dst, zero); cc->emit(Inst::kIdPunpcklwd, dst, zero); @@ -2841,9 +2930,9 @@ static ASMJIT_NOINLINE void sse_int_widen(UniCompiler* pc, const Vec& dst, const } case WideningOp::kI32ToI64: { - Vec tmp = pc->new_similar_reg(dst, "@tmp"); - sse_mov(pc, tmp, src); - sse_mov(pc, dst, src); + Vec tmp = uc.new_similar_reg(dst, "@tmp"); + sse_mov(uc, tmp, src); + sse_mov(uc, dst, src); cc->psrad(tmp, 31); cc->punpckldq(dst, tmp); return; @@ -2854,55 +2943,49 @@ static ASMJIT_NOINLINE void sse_int_widen(UniCompiler* pc, const Vec& dst, const } } -static ASMJIT_NOINLINE void sse_round(UniCompiler* pc, const Vec& dst, const Operand& src, FloatMode fm, x86::RoundImm round_mode) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void sse_round(UniCompiler& uc, const Vec& dst, const Operand& src, FloatMode fm, x86::RoundImm round_mode) noexcept { + BackendCompiler* cc = uc.cc; uint32_t is_f32 = fm == FloatMode::kF32S || fm == FloatMode::kF32V; const FloatInst& fi = sse_float_inst[size_t(fm)]; // NOTE: This may be dead code as the compiler handles this case well, however, if this function is // called as a helper we don't want to emit a longer sequence if we can just use a single instruction. 
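// Illustrative scalar model of the sign-flip trick used by sse_cmp_gt_u64 above:
// PCMPGTQ is a signed comparison, so both operands are first XOR-ed with the
// 0x8000000000000000 constant, which maps unsigned order onto signed order.
// This sketch is for illustration only and is not part of the emitter:
#include <cstdint>
static inline bool cmp_gt_u64_model(uint64_t a, uint64_t b) noexcept {
  const uint64_t msb = 0x8000000000000000u;
  return int64_t(a ^ msb) > int64_t(b ^ msb);   // equivalent to (a > b) on the unsigned values
}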
- if (pc->has_sse4_1()) { + if (uc.has_sse4_1()) { cc->emit(fi.fround, dst, src, round_mode | x86::RoundImm::kSuppress); return; } - Operand maxn; - // round_max (f32) == 0x4B000000 // round_max (f64) == 0x4330000000000000 - if (fm == FloatMode::kF32S || fm == FloatMode::kF32V) - maxn = pc->simd_const(&pc->ct.f32_round_max, Bcst::k32, dst); - else - maxn = pc->simd_const(&pc->ct.f64_round_max, Bcst::k64, dst); + Operand maxn = get_fop_round_magic(uc, dst, fm); - Vec t1 = pc->new_similar_reg(dst, "@t1"); - Vec t2 = pc->new_similar_reg(dst, "@t2"); - Vec t3 = pc->new_similar_reg(dst, "@t3"); + Vec t1 = uc.new_similar_reg(dst, "@t1"); + Vec t2 = uc.new_similar_reg(dst, "@t2"); + Vec t3 = uc.new_similar_reg(dst, "@t3"); - // Special cases first - float32/float64 truncation can use float32->int32->float32 conversion. if (round_mode == x86::RoundImm::kTrunc) { if (fm == FloatMode::kF32S || (fm == FloatMode::kF64S && cc->is_64bit())) { Gp r; Operand msb; if (fm == FloatMode::kF32S) { - r = pc->new_gp32("@gp_tmp"); - msb = pc->simd_const(&pc->ct.p_8000000080000000, Bcst::k32, dst); + r = uc.new_gp32("@gp_tmp"); + msb = uc.simd_const(&uc.ct().p_8000000080000000, Bcst::k32, dst); } else { - r = pc->new_gp64("@gp_tmp"); - msb = pc->simd_const(&pc->ct.p_8000000000000000, Bcst::k64, dst); + r = uc.new_gp64("@gp_tmp"); + msb = uc.simd_const(&uc.ct().p_8000000000000000, Bcst::k64, dst); } - sse_fmov(pc, dst, src, fm); + sse_fmov(uc, dst, src, fm); if (fm == FloatMode::kF32S) cc->cvttss2si(r, dst); else cc->cvttsd2si(r, dst); - cc->emit(fi.fmov, t2, msb); + cc->emit(fi.fmova, t2, msb); cc->emit(fi.fandn, t2, dst); cc->emit(fi.fxor, t1, t1); @@ -2923,22 +3006,26 @@ static ASMJIT_NOINLINE void sse_round(UniCompiler* pc, const Vec& dst, const Ope if (round_mode == x86::RoundImm::kNearest) { // Pure SSE2 round-to-even implementation: // - // float roundeven(float x) { + // float round_even(float x) { // float magic = x >= 0 ? pow(2, 22) : pow(2, 22) + pow(2, 21); // return x >= magic ? x : x + magic - magic; // } // - // double roundeven(double x) { + // double round_even(double x) { // double magic = x >= 0 ? pow(2, 52) : pow(2, 52) + pow(2, 51); // return x >= magic ? x : x + magic - magic; // } - sse_fmov(pc, dst, src, fm); - cc->emit(fi.fmov, t3, dst); - cc->emit(fi.psrl, t3, Imm(is_f32 ? 31 : 63)); - cc->emit(fi.psll, t3, Imm(is_f32 ? 23 : 51)); - cc->emit(fi.for_, t3, maxn); + sse_fmov(uc, dst, src, fm); + cc->emit(fi.fmova, t3, dst); + // cc->emit(fi.psrl, t3, Imm(is_f32 ? 31 : 63)); + // cc->emit(fi.psll, t3, Imm(is_f32 ? 23 : 51)); + // cc->emit(fi.for_, t3, maxn); - cc->emit(fi.fmov, t1, dst); + cc->emit(fi.psrl, t3, Imm(is_f32 ? 31 : 63)); + cc->emit(fi.psll, t3, Imm(is_f32 ? 23 : 52)); + cc->emit(is_f32 ? Inst::kIdPaddd : Inst::kIdPaddq, t3, maxn); + + cc->emit(fi.fmova, t1, dst); cc->emit(fi.fcmp, t1, t3, x86::CmpImm::kLT); cc->emit(fi.fand, t1, t3); @@ -2947,11 +3034,7 @@ static ASMJIT_NOINLINE void sse_round(UniCompiler* pc, const Vec& dst, const Ope return; } - Operand one; - if (fm == FloatMode::kF32S || fm == FloatMode::kF32V) - one = pc->simd_const(&pc->ct.f32_1, Bcst::k32, dst); - else - one = pc->simd_const(&pc->ct.f64_1, Bcst::k64, dst); + Operand one = get_fop_one(uc, dst, fm); if (round_mode == x86::RoundImm::kTrunc) { // Should be handled earlier. 
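// Illustrative scalar model of the add/subtract "magic number" rounding used by the
// SSE2 fallback sketched in the comment above: adding and then subtracting a large
// power of two makes the FPU drop the fractional bits using its current rounding mode
// (round-to-nearest-even by default). Shown for double with 2^52; this is not part of
// the emitter and is only valid while the default rounding mode is active:
#include <cmath>
static inline double round_even_model(double x) noexcept {
  const double magic = 9007199254740992.0;          // 2^52: ulp == 1.0 at this magnitude
  if (std::fabs(x) >= magic) return x;              // already an exact integer
  return x < 0.0 ? (x - magic) + magic              // keep the intermediate in range for negatives
                 : (x + magic) - magic;
}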
@@ -2960,11 +3043,11 @@ static ASMJIT_NOINLINE void sse_round(UniCompiler* pc, const Vec& dst, const Ope Operand msb; if (fm == FloatMode::kF32V) { - msb = pc->simd_const(&pc->ct.p_8000000080000000, Bcst::k32, dst); - sse_fmov(pc, dst, src, fm); + msb = uc.simd_const(&uc.ct().p_8000000080000000, Bcst::k32, dst); + sse_fmov(uc, dst, src, fm); cc->cvttps2dq(t1, dst); - cc->emit(fi.fmov, t2, msb); + cc->emit(fi.fmova, t2, msb); cc->emit(fi.fandn, t2, dst); cc->cvtdq2ps(t1, t1); @@ -2972,18 +3055,18 @@ static ASMJIT_NOINLINE void sse_round(UniCompiler* pc, const Vec& dst, const Ope cc->emit(fi.fand, t1, t2); cc->emit(fi.fandn, t2, dst); cc->emit(fi.for_, t2, t1); - cc->emit(fi.fmov, dst, t2); + cc->emit(fi.fmova, dst, t2); } else { - msb = pc->simd_const(&pc->ct.p_8000000000000000, Bcst::k64, dst); + msb = uc.simd_const(&uc.ct().p_8000000000000000, Bcst::k64, dst); - sse_fmov(pc, dst, src, fm); - cc->emit(fi.fmov, t3, msb); + sse_fmov(uc, dst, src, fm); + cc->emit(fi.fmova, t3, msb); cc->emit(fi.fandn, t3, dst); - cc->emit(fi.fmov, t2, t3); + cc->emit(fi.fmova, t2, t3); cc->emit(fi.fcmp, t2, maxn, x86::CmpImm::kLT); cc->emit(fi.fand, t2, maxn); - cc->emit(fi.fmov, t1, t3); + cc->emit(fi.fmova, t1, t3); cc->emit(fi.fadd, t1, t2); cc->emit(fi.fsub, t1, t2); cc->emit(fi.fcmp, t3, t1, x86::CmpImm::kLT); @@ -3002,23 +3085,23 @@ static ASMJIT_NOINLINE void sse_round(UniCompiler* pc, const Vec& dst, const Ope InstId correction_inst_id = round_mode == x86::RoundImm::kDown ? fi.fsub : fi.fadd; x86::CmpImm correction_predicate = round_mode == x86::RoundImm::kDown ? x86::CmpImm::kLT : x86::CmpImm::kNLE; - sse_fmov(pc, dst, src, fm); + sse_fmov(uc, dst, src, fm); // maxn (f32) == 0x4B000000 (f64) == 0x4330000000000000 // t3 (f32) == 0x00800000 (f64) == 0x0008000000000000 - cc->emit(fi.fmov, t3, dst); + cc->emit(fi.fmova, t3, dst); cc->emit(fi.psrl, t3, Imm(is_f32 ? 31 : 63)); - cc->emit(fi.psll, t3, Imm(is_f32 ? 23 : 51)); - cc->emit(fi.for_, t3, maxn); + cc->emit(fi.psll, t3, Imm(is_f32 ? 23 : 52)); + cc->emit(is_f32 ? 
Inst::kIdPaddd : Inst::kIdPaddq, t3, maxn); - cc->emit(fi.fmov, t1, dst); - cc->emit(fi.fmov, t2, dst); + cc->emit(fi.fmova, t1, dst); + cc->emit(fi.fmova, t2, dst); cc->emit(fi.fadd, t2, t3); cc->emit(fi.fsub, t2, t3); cc->emit(fi.fcmp, t1, t3, x86::CmpImm::kNLT); - cc->emit(fi.fmov, t3, dst); + cc->emit(fi.fmova, t3, dst); cc->emit(fi.fcmp, t3, t2, correction_predicate); cc->emit(fi.fand, t3, one); @@ -3033,8 +3116,8 @@ static ASMJIT_NOINLINE void sse_round(UniCompiler* pc, const Vec& dst, const Ope ASMJIT_NOT_REACHED(); } -static ASMJIT_NOINLINE void avx_mov(UniCompiler* pc, const Vec& dst, const Operand_& src) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void avx_mov(UniCompiler& uc, const Vec& dst, const Operand_& src) noexcept { + BackendCompiler* cc = uc.cc; InstId inst_id = 0; if (dst.is_vec512()) { @@ -3047,8 +3130,8 @@ static ASMJIT_NOINLINE void avx_mov(UniCompiler* pc, const Vec& dst, const Opera cc->emit(inst_id, dst, src); } -static ASMJIT_NOINLINE void avx_fmov(UniCompiler* pc, const Vec& dst, const Operand_& src, FloatMode fm) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void avx_fmov(UniCompiler& uc, const Vec& dst, const Operand_& src, FloatMode fm) noexcept { + BackendCompiler* cc = uc.cc; if (src.is_reg()) { if (dst.id() != src.id()) { if (fm <= FloatMode::kF64S) @@ -3057,38 +3140,41 @@ static ASMJIT_NOINLINE void avx_fmov(UniCompiler* pc, const Vec& dst, const Oper cc->emit(Inst::kIdVmovaps, dst, src); } } - else { + else if (is_scalar_fp_op(fm)) { cc->emit(avx_float_inst[size_t(fm)].fmovs, dst, src); } + else { + cc->emit(avx_float_inst[size_t(fm)].fmovu, dst, src); + } } -static ASMJIT_NOINLINE void avx_make_vec(UniCompiler* pc, Operand_& op, const Vec& ref, const char* name) noexcept { +static ASMJIT_NOINLINE void avx_make_vec(UniCompiler& uc, Operand_& op, const Vec& ref, const char* name) noexcept { if (op.is_mem()) { - Vec tmp = pc->new_similar_reg(ref, name); - avx_mov(pc, tmp, op); + Vec tmp = uc.new_similar_reg(ref, name); + avx_mov(uc, tmp, op); op = tmp; } } -static ASMJIT_NOINLINE void avx_zero(UniCompiler* pc, const Vec& dst) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void avx_zero(UniCompiler& uc, const Vec& dst) noexcept { + BackendCompiler* cc = uc.cc; Vec x = dst.xmm(); cc->vpxor(x, x, x); return; } -static ASMJIT_NOINLINE void avx_ones(UniCompiler* pc, const Vec& dst) noexcept { - BackendCompiler* cc = pc->cc; - if (pc->has_avx512()) +static ASMJIT_NOINLINE void avx_ones(UniCompiler& uc, const Vec& dst) noexcept { + BackendCompiler* cc = uc.cc; + if (uc.has_avx512()) cc->emit(Inst::kIdVpternlogd, dst, dst, dst, 0xFF); else cc->emit(Inst::kIdVpcmpeqb, dst, dst, dst); } -static ASMJIT_NOINLINE void avx_bit_not(UniCompiler* pc, const Vec& dst, const Operand_& src) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void avx_bit_not(UniCompiler& uc, const Vec& dst, const Operand_& src) noexcept { + BackendCompiler* cc = uc.cc; - if (pc->has_avx512()) { + if (uc.has_avx512()) { if (src.is_reg()) cc->overwrite().emit(Inst::kIdVpternlogd, dst, src, src, 0x55); else @@ -3096,13 +3182,13 @@ static ASMJIT_NOINLINE void avx_bit_not(UniCompiler* pc, const Vec& dst, const O return; } - Operand ones = pc->simd_const(&pc->ct.p_FFFFFFFFFFFFFFFF, Bcst::k32, dst); + Operand ones = uc.simd_const(&uc.ct().p_FFFFFFFFFFFFFFFF, Bcst::k32, dst); if (!src.is_reg()) { if (ones.is_reg()) { cc->emit(Inst::kIdVpxor, dst, ones, src); } else { - avx_mov(pc, dst, src); + avx_mov(uc, dst, src); 
cc->emit(Inst::kIdVpxor, dst, dst, ones); } } @@ -3111,17 +3197,17 @@ static ASMJIT_NOINLINE void avx_bit_not(UniCompiler* pc, const Vec& dst, const O } } -static ASMJIT_NOINLINE void avx_isign_flip(UniCompiler* pc, const Vec& dst, const Operand_& src, ElementSize sz) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void avx_isign_flip(UniCompiler& uc, const Vec& dst, const Operand_& src, ElementSize sz) noexcept { + BackendCompiler* cc = uc.cc; Operand msk; - InstId xor_ = (pc->has_avx512() && dst.is_vec512()) ? Inst::kIdVpxord : Inst::kIdVpxor; + InstId xor_ = (uc.has_avx512() && dst.is_vec512()) ? Inst::kIdVpxord : Inst::kIdVpxor; switch (sz) { - case ElementSize::k8: msk = pc->simd_const(&pc->ct.p_8080808080808080, Bcst::kNA, dst); break; - case ElementSize::k16: msk = pc->simd_const(&pc->ct.p_8000800080008000, Bcst::kNA, dst); break; - case ElementSize::k32: msk = pc->simd_const(&pc->ct.p_8000000080000000, Bcst::k32, dst); break; - case ElementSize::k64: msk = pc->simd_const(&pc->ct.p_8000000000000000, Bcst::k64, dst); break; + case ElementSize::k8: msk = uc.simd_const(&uc.ct().p_8080808080808080, Bcst::kNA, dst); break; + case ElementSize::k16: msk = uc.simd_const(&uc.ct().p_8000800080008000, Bcst::kNA, dst); break; + case ElementSize::k32: msk = uc.simd_const(&uc.ct().p_8000000080000000, Bcst::k32, dst); break; + case ElementSize::k64: msk = uc.simd_const(&uc.ct().p_8000000000000000, Bcst::k64, dst); break; } if (src.is_reg()) { @@ -3131,22 +3217,22 @@ static ASMJIT_NOINLINE void avx_isign_flip(UniCompiler* pc, const Vec& dst, cons cc->emit(xor_, dst, msk, src); } else { - avx_mov(pc, dst, src); + avx_mov(uc, dst, src); cc->emit(xor_, dst, dst, msk); } } -static ASMJIT_NOINLINE void avx_fsign_flip(UniCompiler* pc, const Vec& dst, const Operand_& src, FloatMode fm) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void avx_fsign_flip(UniCompiler& uc, const Vec& dst, const Operand_& src, FloatMode fm) noexcept { + BackendCompiler* cc = uc.cc; const FloatInst& fi = avx_float_inst[size_t(fm)]; Operand msk; switch (fm) { - case FloatMode::kF32S: msk = pc->simd_const(&pc->ct.sign32_scalar, Bcst::kNA, dst); break; - case FloatMode::kF64S: msk = pc->simd_const(&pc->ct.sign64_scalar, Bcst::kNA, dst); break; - case FloatMode::kF32V: msk = pc->simd_const(&pc->ct.p_8000000080000000, Bcst::k32, dst); break; - case FloatMode::kF64V: msk = pc->simd_const(&pc->ct.p_8000000000000000, Bcst::k64, dst); break; + case FloatMode::kF32S: msk = uc.simd_const(&uc.ct().sign32_scalar, Bcst::kNA, dst); break; + case FloatMode::kF64S: msk = uc.simd_const(&uc.ct().sign64_scalar, Bcst::kNA, dst); break; + case FloatMode::kF32V: msk = uc.simd_const(&uc.ct().p_8000000080000000, Bcst::k32, dst); break; + case FloatMode::kF64V: msk = uc.simd_const(&uc.ct().p_8000000000000000, Bcst::k64, dst); break; default: ASMJIT_NOT_REACHED(); @@ -3159,13 +3245,13 @@ static ASMJIT_NOINLINE void avx_fsign_flip(UniCompiler* pc, const Vec& dst, cons cc->emit(fi.fxor, dst, msk, src); } else { - avx_fmov(pc, dst, src, fm); + avx_fmov(uc, dst, src, fm); cc->emit(fi.fxor, dst, dst, msk); } } -// ujit::UniCompiler - Vector Instructions - OpArray Iterator -// =========================================================== +// ujit::UniCompiler - Vector Instructions - OpArray Iterator +// ========================================================== template class OpArrayIter { @@ -3190,70 +3276,70 @@ public: }; template -static ASMJIT_INLINE void emit_2v_t(UniCompiler* pc, UniOpVV op, const OpArray& dst_, const 
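The sign-flip helpers above rely on a standard identity: XOR-ing the per-lane MSB turns an unsigned comparison into a signed one, which is all PCMPGT* provides. A small scalar sketch of that identity (not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

static bool unsigned_gt_via_signed(uint32_t a, uint32_t b) {
  int32_t sa = int32_t(a ^ 0x80000000u);  // flip the sign bit (the msk constant)
  int32_t sb = int32_t(b ^ 0x80000000u);
  return sa > sb;                         // signed compare, e.g. PCMPGTD
}

int main() {
  const uint32_t values[] = { 0u, 1u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFEu, 0xFFFFFFFFu };
  for (uint32_t a : values)
    for (uint32_t b : values)
      assert(unsigned_gt_via_signed(a, b) == (a > b));
  return 0;
}
```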
Src& src_) noexcept { +static ASMJIT_INLINE void emit_2v_t(UniCompiler& uc, UniOpVV op, const OpArray& dst_, const Src& src_) noexcept { size_t n = dst_.size(); OpArrayIter src(src_); for (size_t i = 0; i < n; i++) { - pc->emit_2v(op, dst_[i], src.op()); + uc.emit_2v(op, dst_[i], src.op()); src.next(); } } template -static ASMJIT_INLINE void emit_2vi_t(UniCompiler* pc, UniOpVVI op, const OpArray& dst_, const Src& src_, uint32_t imm) noexcept { +static ASMJIT_INLINE void emit_2vi_t(UniCompiler& uc, UniOpVVI op, const OpArray& dst_, const Src& src_, uint32_t imm) noexcept { size_t n = dst_.size(); OpArrayIter src(src_); for (size_t i = 0; i < n; i++) { - pc->emit_2vi(op, dst_[i], src.op(), imm); + uc.emit_2vi(op, dst_[i], src.op(), imm); src.next(); } } template -static ASMJIT_INLINE void emit_3v_t(UniCompiler* pc, UniOpVVV op, const OpArray& dst_, const Src1& src1_, const Src2& src2_) noexcept { +static ASMJIT_INLINE void emit_3v_t(UniCompiler& uc, UniOpVVV op, const OpArray& dst_, const Src1& src1_, const Src2& src2_) noexcept { size_t n = dst_.size(); OpArrayIter src1(src1_); OpArrayIter src2(src2_); for (size_t i = 0; i < n; i++) { - pc->emit_3v(op, dst_[i], src1.op(), src2.op()); + uc.emit_3v(op, dst_[i], src1.op(), src2.op()); src1.next(); src2.next(); } } template -static ASMJIT_INLINE void emit_3vi_t(UniCompiler* pc, UniOpVVVI op, const OpArray& dst_, const Src1& src1_, const Src2& src2_, uint32_t imm) noexcept { +static ASMJIT_INLINE void emit_3vi_t(UniCompiler& uc, UniOpVVVI op, const OpArray& dst_, const Src1& src1_, const Src2& src2_, uint32_t imm) noexcept { size_t n = dst_.size(); OpArrayIter src1(src1_); OpArrayIter src2(src2_); for (size_t i = 0; i < n; i++) { - pc->emit_3vi(op, dst_[i], src1.op(), src2.op(), imm); + uc.emit_3vi(op, dst_[i], src1.op(), src2.op(), imm); src1.next(); src2.next(); } } template -static ASMJIT_INLINE void emit_4v_t(UniCompiler* pc, UniOpVVVV op, const OpArray& dst_, const Src1& src1_, const Src2& src2_, const Src3& src3_) noexcept { +static ASMJIT_INLINE void emit_4v_t(UniCompiler& uc, UniOpVVVV op, const OpArray& dst_, const Src1& src1_, const Src2& src2_, const Src3& src3_) noexcept { size_t n = dst_.size(); OpArrayIter src1(src1_); OpArrayIter src2(src2_); OpArrayIter src3(src3_); for (size_t i = 0; i < n; i++) { - pc->emit_4v(op, dst_[i], src1.op(), src2.op(), src3.op()); + uc.emit_4v(op, dst_[i], src1.op(), src2.op(), src3.op()); src1.next(); src2.next(); src3.next(); } } -// ujit::UniCompiler - Vector Instructions - Emit 2V -// ================================================== +// ujit::UniCompiler - Vector Instructions - Emit 2V +// ================================================= void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_) noexcept { ASMJIT_ASSERT(dst_.is_vec()); @@ -3347,8 +3433,8 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ // AVX doesn't provide 8-bit and 16-bit broadcasts - the simplest way is to just use VPSHUFB to repeat the byte. InstId insert_inst_id = element_size == ElementSize::k8 ? Inst::kIdVpinsrb : Inst::kIdVpinsrw; - const void* pred_data = element_size == ElementSize::k8 ? static_cast(&ct.p_0000000000000000) - : static_cast(&ct.p_0100010001000100); + const void* pred_data = element_size == ElementSize::k8 ? 
static_cast(&ct().p_0000000000000000) + : static_cast(&ct().p_0100010001000100); Vec pred = simd_vec_const(pred_data, Bcst::k32, dst_xmm); if (src.is_mem()) { @@ -3409,7 +3495,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ // 128-bit broadcast is like 128-bit mov in this case as we don't have a wider destination. if (dst.is_vec128()) { - avx_mov(this, dst, src); + avx_mov(*this, dst, src); return; } @@ -3455,7 +3541,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ // Cannot broadcast 256-bit vector to a 128-bit or 256-bit vector... if (!dst.is_vec512()) { - avx_mov(this, dst.ymm(), src); + avx_mov(*this, dst.ymm(), src); return; } @@ -3483,7 +3569,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kNotU64: case UniOpVV::kNotF32: case UniOpVV::kNotF64: { - avx_bit_not(this, dst, src); + avx_bit_not(*this, dst, src); return; } @@ -3547,19 +3633,30 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ return; } + case UniOpVV::kAbsF32S: + case UniOpVV::kAbsF64S: case UniOpVV::kAbsF32: case UniOpVV::kAbsF64: + case UniOpVV::kNegF32S: + case UniOpVV::kNegF64S: case UniOpVV::kNegF32: case UniOpVV::kNegF64: { // Intrinsic. - const void* msk_data = op == UniOpVV::kAbsF32 ? static_cast(&ct.p_7FFFFFFF7FFFFFFF) : - op == UniOpVV::kAbsF64 ? static_cast(&ct.p_7FFFFFFFFFFFFFFF) : - op == UniOpVV::kNegF32 ? static_cast(&ct.p_8000000080000000) : - static_cast(&ct.p_8000000000000000); + FloatMode fm = FloatMode(op_info.float_mode); + + const void* msk_data = + op == UniOpVV::kAbsF32 || op == UniOpVV::kAbsF32S ? static_cast(&ct().p_7FFFFFFF7FFFFFFF) : + op == UniOpVV::kAbsF64 || op == UniOpVV::kAbsF64S ? static_cast(&ct().p_7FFFFFFFFFFFFFFF) : + op == UniOpVV::kNegF32 || op == UniOpVV::kNegF32S ? static_cast(&ct().p_8000000080000000) : + static_cast(&ct().p_8000000000000000); Operand msk = simd_const(msk_data, Bcst(op_info.broadcast_size), dst); - if (src.is_mem() && msk.is_mem()) { - avx_mov(this, dst, msk); + if (src.is_mem() && is_scalar_fp_op(fm)) { + avx_fmov(*this, dst, src, fm); + cc->emit(inst_id, dst, dst, msk); + } + else if (src.is_mem() && msk.is_mem()) { + avx_fmov(*this, dst, msk, fm); cc->emit(inst_id, dst, dst, src); } else if (src.is_mem()) { @@ -3573,14 +3670,14 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kRcpF32: { // Intrinsic. - Vec one = simd_vec_const(&ct.f32_1, Bcst::k32, dst); + Vec one = simd_vec_const(&ct().f32_1, Bcst::k32, dst); cc->emit(Inst::kIdVdivps, dst, one, src); return; } case UniOpVV::kRcpF64: { // Intrinsic. - Vec one = simd_vec_const(&ct.f64_1, Bcst::k32, dst); + Vec one = simd_vec_const(&ct().f64_1, Bcst::k32, dst); cc->emit(Inst::kIdVdivpd, dst, one, src); return; } @@ -3597,11 +3694,17 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCeilF64S: case UniOpVV::kCeilF32: case UniOpVV::kCeilF64: - case UniOpVV::kRoundF32S: - case UniOpVV::kRoundF64S: - case UniOpVV::kRoundF32: - case UniOpVV::kRoundF64: { - if (has_avx512()) { + case UniOpVV::kRoundEvenF32S: + case UniOpVV::kRoundEvenF64S: + case UniOpVV::kRoundEvenF32: + case UniOpVV::kRoundEvenF64: { + FloatMode fm = FloatMode(op_info.float_mode); + + if (is_scalar_fp_op(fm)) { + dst = dst.xmm(); + } + + if (has_avx512() && dst.is_vec512()) { // AVX512 uses a different name. 
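The kAbsF32/kNegF32 handling above is purely bitwise: AND with 0x7FFFFFFF clears the sign, XOR with 0x80000000 flips it, which is what ANDPS/XORPS do with the table constants. A scalar sketch (assumes C++20 for `std::bit_cast`, not part of the patch):

```cpp
#include <bit>
#include <cassert>
#include <cstdint>

static float abs_via_mask(float x) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(x) & 0x7FFFFFFFu);  // ANDPS
}

static float neg_via_mask(float x) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(x) ^ 0x80000000u);  // XORPS
}

int main() {
  assert(abs_via_mask(-1.5f) == 1.5f && abs_via_mask(2.0f) == 2.0f);
  assert(neg_via_mask(3.0f) == -3.0f && neg_via_mask(-0.25f) == 0.25f);
  return 0;
}
```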
constexpr uint16_t avx512_rndscale[4] = { Inst::kIdVrndscaless, @@ -3612,10 +3715,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ inst_id = avx512_rndscale[(size_t(op) - size_t(UniOpVV::kTruncF32S)) & 0x3]; } - FloatMode fm = FloatMode(op_info.float_mode); - - if (fm == FloatMode::kF32S || fm == FloatMode::kF64S) { - dst = dst.xmm(); + if (is_scalar_fp_op(fm)) { // These instructions use 3 operand form for historical reasons. if (src.is_mem()) { cc->emit(avx_float_inst[size_t(op_info.float_mode)].fmovs, dst, src); @@ -3632,13 +3732,86 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ return; } + case UniOpVV::kRoundHalfAwayF32S: + case UniOpVV::kRoundHalfAwayF64S: + case UniOpVV::kRoundHalfAwayF32: + case UniOpVV::kRoundHalfAwayF64: { + // Intrinsic. + FloatMode fm = FloatMode(op_info.float_mode); + const FloatInst& fi = avx_float_inst[fm]; + + if (is_scalar_fp_op(fm)) { + dst = dst.xmm(); + if (src.is_vec()) { + src = src.as().clone_as(dst); + } + } + + if (src.is_mem()) { + avx_fmov(*this, dst, src, fm); + src = dst; + } + + Operand half = get_fop_half_minus_1ulp(*this, dst, fm); + Operand msb = get_fop_msb_bit(*this, dst, fm); + Vec tmp = new_similar_reg(dst, "@tmp"); + + if (has_avx512()) { + cc->emit(fi.fmova, tmp, msb); + cc->emit(Inst::kIdVpternlogd, tmp, src, half, 0xEAu); // tmp = (msb & src) | half + } + else { + cc->emit(fi.fand, tmp, src, msb); + cc->emit(fi.for_, tmp, tmp, half); + } + + cc->emit(fi.fadd, dst, src, tmp); + + if (is_scalar_fp_op(fm)) { + cc->emit(fi.fround, dst, dst, dst, x86::RoundImm::kTrunc | x86::RoundImm::kSuppress); + } + else { + InstId round_inst = dst.is_vec512() ? fi.frndscale : fi.fround; + cc->emit(round_inst, dst, dst, x86::RoundImm::kTrunc | x86::RoundImm::kSuppress); + } + return; + } + + case UniOpVV::kRoundHalfUpF32S: + case UniOpVV::kRoundHalfUpF64S: + case UniOpVV::kRoundHalfUpF32: + case UniOpVV::kRoundHalfUpF64: { + // Intrinsic. + FloatMode fm = FloatMode(op_info.float_mode); + + if (is_scalar_fp_op(fm)) { + dst = dst.xmm(); + } + + Operand half = get_fop_half_minus_1ulp(*this, dst, fm); + UniOpVVV add_op = translate_op(op, UniOpVV::kRoundHalfUpF32S, UniOpVVV::kAddF32S); + UniOpVV floor_op = translate_op(op, UniOpVV::kRoundHalfUpF32S, UniOpVV::kFloorF32S); + + if (src.is_mem()) { + Vec tmp = new_similar_reg(dst, "@tmp"); + avx_fmov(*this, tmp, src, fm); + emit_3v(add_op, tmp, tmp, half); + emit_2v(floor_op, dst, tmp); + } + else { + emit_3v(add_op, dst, src.as().clone_as(dst), half); + emit_2v(floor_op, dst, dst); + } + return; + } + case UniOpVV::kSqrtF32S: case UniOpVV::kSqrtF64S: { dst = dst.xmm(); // Intrinsic - these instructions use 3 operand form for historical reasons. if (src.is_mem()) { - avx_fmov(this, dst, src, FloatMode(op_info.float_mode)); + avx_fmov(*this, dst, src, FloatMode(op_info.float_mode)); cc->emit(inst_id, dst, dst, dst); } else { @@ -3655,7 +3828,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ src = src.as().xmm(); // Intrinsic - these instructions use 3 operand form for historical reasons. 
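The two new rounding intrinsics above add a bias of "one half minus 1 ulp" and then truncate or floor. A scalar sketch of the same recipe, assuming `get_fop_half_minus_1ulp()` yields the largest float below 0.5 (0x3EFFFFFF for f32); using 0.5-1ulp rather than 0.5 presumably keeps inputs just below a .5 boundary from being pushed over it when the addition itself rounds up (not part of the patch):

```cpp
#include <cmath>
#include <cstdio>

static const float kHalfMinus1Ulp = 0x1.FFFFFEp-2f;  // 0.49999997f

// kRoundHalfAwayF32: add a sign-matched bias, then truncate toward zero.
static float round_half_away(float x) {
  return std::truncf(x + std::copysignf(kHalfMinus1Ulp, x));
}

// kRoundHalfUpF32: floor(x + bias).
static float round_half_up(float x) {
  return std::floorf(x + kHalfMinus1Ulp);
}

int main() {
  std::printf("%g %g\n", round_half_away(2.5f), round_half_away(-2.5f));  // 3 -3
  std::printf("%g %g\n", round_half_up(2.5f), round_half_up(-2.5f));      // 3 -2
  return 0;
}
```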
- Vec zeros = simd_vec_const(&ct.p_0000000000000000, Bcst::k32, dst); + Vec zeros = simd_vec_const(&ct().p_0000000000000000, Bcst::k32, dst); cc->emit(inst_id, dst, zeros, src); return; } @@ -3680,7 +3853,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCvtI32HiToF64: { if (src.is_reg()) { uint32_t w = dst.size() >> 6; - Vec tmp = new_vec(VecWidth(w), "@tmp"); + Vec tmp = new_vec_with_width(VecWidth(w), "@tmp"); src.set_signature(signature_of_xmm_ymm_zmm[w]); if (dst.is_vec512()) { @@ -3731,7 +3904,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCvtTruncF64ToI32Hi: case UniOpVV::kCvtRoundF64ToI32Hi: { uint32_t w = dst.size() >> 6; - Vec tmp = new_vec(VecWidth(w), "@tmp"); + Vec tmp = new_vec_with_width(VecWidth(w), "@tmp"); if (src.is_mem()) src.as().set_size(dst.size()); @@ -3816,16 +3989,16 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ if (has_ssse3()) { if (element_size == ElementSize::k8 || (element_size == ElementSize::k16 && is_same_vec(dst, src))) { - Operand predicate = element_size == ElementSize::k8 ? simd_const(&ct.p_0000000000000000, Bcst::kNA, dst.as()) - : simd_const(&ct.p_0100010001000100, Bcst::kNA, dst.as()); - sse_mov(this, dst, src); + Operand predicate = element_size == ElementSize::k8 ? simd_const(&ct().p_0000000000000000, Bcst::kNA, dst.as()) + : simd_const(&ct().p_0100010001000100, Bcst::kNA, dst.as()); + sse_mov(*this, dst, src); cc->emit(Inst::kIdPshufb, dst, predicate); return; } } if (element_size == ElementSize::k8) { - sse_mov(this, dst, src); + sse_mov(*this, dst, src); cc->emit(Inst::kIdPunpcklbw, dst, dst); src = dst; } @@ -3885,7 +4058,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kBroadcastV128_F32: case UniOpVV::kBroadcastV128_F64: { // 128-bit broadcast is like 128-bit mov in this case as we don't have wider vectors. - sse_mov(this, dst, src); + sse_mov(*this, dst, src); return; } @@ -3926,7 +4099,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ Vec tmp = new_similar_reg(dst, "@tmp"); cc->emit(Inst::kIdMovaps, tmp, src); cc->emit(Inst::kIdPsrad, tmp, 31); - sse_mov(this, dst, src); + sse_mov(*this, dst, src); cc->emit(Inst::kIdPxor, dst, tmp); cc->emit(Inst::kIdPsubd, dst, tmp); return; @@ -3937,7 +4110,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ Vec tmp = new_similar_reg(dst, "@tmp"); cc->emit(Inst::kIdPshufd, tmp, src, x86::shuffle_imm(3, 3, 1, 1)); cc->emit(Inst::kIdPsrad, tmp, 31); - sse_mov(this, dst, src); + sse_mov(*this, dst, src); cc->emit(Inst::kIdPxor, dst, tmp); cc->emit(Inst::kIdPsubq, dst, tmp); return; @@ -3947,7 +4120,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kNotU64: case UniOpVV::kNotF32: case UniOpVV::kNotF64: { - sse_bit_not(this, dst, src); + sse_bit_not(*this, dst, src); return; } @@ -3967,7 +4140,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ } WideningOp cvt = (op == UniOpVV::kCvtI8ToI32) ? 
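The SSE2 absolute-value fallbacks above use the branchless identity abs(x) == (x ^ s) - s with s = x >> 31 arithmetic (PSRAD by 31); the 64-bit variant builds s from the high dword via PSHUFD(3,3,1,1) because SSE2 has no 64-bit arithmetic shift. A scalar sketch, assuming the compiler's signed right shift is arithmetic (not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

static int32_t abs32_branchless(int32_t x) {
  int32_t s = x >> 31;   // PSRAD x, 31: all-ones if negative, zero otherwise
  return (x ^ s) - s;    // PXOR + PSUBD
}

int main() {
  assert(abs32_branchless(-7) == 7);
  assert(abs32_branchless(7) == 7);
  assert(abs32_branchless(0) == 0);
  return 0;
}
```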
WideningOp::kI8ToI32 : WideningOp::kU8ToU32; - sse_int_widen(this, dst, src.as(), cvt); + sse_int_widen(*this, dst, src.as(), cvt); return; } @@ -3985,7 +4158,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCvtI16HiToI32: case UniOpVV::kCvtI32HiToI64: if (src.is_vec()) { - sse_mov(this, dst, src); + sse_mov(*this, dst, src); switch (op) { case UniOpVV::kCvtI8HiToI16: { @@ -3995,7 +4168,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ } case UniOpVV::kCvtU8HiToU16: { - cc->emit(Inst::kIdPunpckhbw, dst, simd_const(&ct.p_0000000000000000, Bcst::kNA, dst)); + cc->emit(Inst::kIdPunpckhbw, dst, simd_const(&ct().p_0000000000000000, Bcst::kNA, dst)); break; } @@ -4006,20 +4179,20 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ } case UniOpVV::kCvtU16HiToU32: { - cc->emit(Inst::kIdPunpckhwd, dst, simd_const(&ct.p_0000000000000000, Bcst::kNA, dst)); + cc->emit(Inst::kIdPunpckhwd, dst, simd_const(&ct().p_0000000000000000, Bcst::kNA, dst)); break; } case UniOpVV::kCvtI32HiToI64: { Vec tmp = new_vec128("@tmp"); - sse_mov(this, tmp, dst); + sse_mov(*this, tmp, dst); cc->psrad(tmp, 31); cc->punpckhdq(dst, tmp); break; } case UniOpVV::kCvtU32HiToU64: { - cc->emit(Inst::kIdPunpckhdq, dst, simd_const(&ct.p_0000000000000000, Bcst::kNA, dst)); + cc->emit(Inst::kIdPunpckhdq, dst, simd_const(&ct().p_0000000000000000, Bcst::kNA, dst)); break; } @@ -4068,7 +4241,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ ASMJIT_NOT_REACHED(); } - sse_int_widen(this, dst, src.as(), cvt); + sse_int_widen(*this, dst, src.as(), cvt); return; } @@ -4078,8 +4251,8 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kFloorF64: case UniOpVV::kCeilF32: case UniOpVV::kCeilF64: - case UniOpVV::kRoundF32: - case UniOpVV::kRoundF64: + case UniOpVV::kRoundEvenF32: + case UniOpVV::kRoundEvenF64: // Native operation requires SSE4.1. if (has_sse4_1()) { cc->emit(inst_id, dst, src, Imm(op_info.imm)); @@ -4093,63 +4266,105 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kFloorF64S: case UniOpVV::kCeilF32S: case UniOpVV::kCeilF64S: - case UniOpVV::kRoundF32S: - case UniOpVV::kRoundF64S: { + case UniOpVV::kRoundEvenF32S: + case UniOpVV::kRoundEvenF64S: { // Native operation requires SSE4.1. if (has_sse4_1()) { - if (!is_same_vec(dst, src)) - sse_fmov(this, dst, src, FloatMode(op_info.float_mode)); + sse_fmov(*this, dst, src, FloatMode(op_info.float_mode)); cc->emit(inst_id, dst, dst, Imm(op_info.imm)); return; } - sse_round(this, dst, src, FloatMode(op_info.float_mode), x86::RoundImm(op_info.imm & 0x7)); + sse_round(*this, dst, src, FloatMode(op_info.float_mode), x86::RoundImm(op_info.imm & 0x7)); return; } + case UniOpVV::kRoundHalfAwayF32S: + case UniOpVV::kRoundHalfAwayF64S: + case UniOpVV::kRoundHalfAwayF32: + case UniOpVV::kRoundHalfAwayF64: { + // Intrinsic. 
+ FloatMode fm = FloatMode(op_info.float_mode); + const FloatInst& fi = sse_float_inst[fm]; + + Operand half = get_fop_half_minus_1ulp(*this, dst, fm); + Operand msb = get_fop_msb_bit(*this, dst, fm); + + Vec tmp = new_similar_reg(dst, "@tmp"); + + sse_fmov(*this, dst, src, fm); + sse_mov(*this, tmp, msb); + cc->emit(fi.fand, tmp, dst); + cc->emit(fi.for_, tmp, half); + cc->emit(fi.fadd, dst, tmp); + + sse_round(*this, dst, dst, fm, x86::RoundImm(op_info.imm & 0x7)); + return; + } + + case UniOpVV::kRoundHalfUpF32S: + case UniOpVV::kRoundHalfUpF64S: + case UniOpVV::kRoundHalfUpF32: + case UniOpVV::kRoundHalfUpF64: { + // Intrinsic. + FloatMode fm = FloatMode(op_info.float_mode); + const FloatInst& fi = sse_float_inst[fm]; + + Operand half = get_fop_half_minus_1ulp(*this, dst, fm); + sse_fmov(*this, dst, src, fm); + cc->emit(fi.fadd, dst, half); + sse_round(*this, dst, dst, fm, x86::RoundImm(op_info.imm & 0x7)); + return; + } + + case UniOpVV::kAbsF32S: + case UniOpVV::kAbsF64S: case UniOpVV::kAbsF32: case UniOpVV::kAbsF64: + case UniOpVV::kNegF32S: + case UniOpVV::kNegF64S: case UniOpVV::kNegF32: case UniOpVV::kNegF64: { // Intrinsic. - const void* msk_data = op == UniOpVV::kAbsF32 ? static_cast(&ct.p_7FFFFFFF7FFFFFFF) : - op == UniOpVV::kAbsF64 ? static_cast(&ct.p_7FFFFFFFFFFFFFFF) : - op == UniOpVV::kNegF32 ? static_cast(&ct.p_8000000080000000) : - static_cast(&ct.p_8000000000000000) ; - Operand msk = simd_const(msk_data, Bcst::k32, dst); + FloatMode fm = FloatMode(op_info.float_mode); - if (!is_same_vec(dst, src)) - sse_mov(this, dst, src); + const void* msk_data = + op == UniOpVV::kAbsF32 || op == UniOpVV::kAbsF32S ? static_cast(&ct().p_7FFFFFFF7FFFFFFF) : + op == UniOpVV::kAbsF64 || op == UniOpVV::kAbsF64S ? static_cast(&ct().p_7FFFFFFFFFFFFFFF) : + op == UniOpVV::kNegF32 || op == UniOpVV::kNegF32S ? 
static_cast(&ct().p_8000000080000000) : + static_cast(&ct().p_8000000000000000); + Operand msk = simd_const(msk_data, Bcst(op_info.broadcast_size), dst); + sse_fmov(*this, dst, src, fm); cc->emit(inst_id, dst, msk); return; } case UniOpVV::kRcpF32: { - Operand one = simd_const(&ct.f32_1, Bcst::k32, dst); + Operand one = simd_const(&ct().f32_1, Bcst::k32, dst); if (is_same_vec(dst, src)) { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, one); + sse_mov(*this, tmp, one); cc->emit(Inst::kIdDivps, tmp, src); - sse_mov(this, dst, tmp); + sse_mov(*this, dst, tmp); } else { - sse_mov(this, dst, one); + sse_mov(*this, dst, one); cc->emit(Inst::kIdDivps, dst, src); } return; } case UniOpVV::kRcpF64: { - Operand one = simd_const(&ct.f64_1, Bcst::k64, dst); + Operand one = simd_const(&ct().f64_1, Bcst::k64, dst); if (is_same_vec(dst, src)) { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, one); + sse_mov(*this, tmp, one); cc->emit(Inst::kIdDivpd, tmp, src); - sse_mov(this, dst, tmp); + sse_mov(*this, dst, tmp); } else { - sse_mov(this, dst, one); + sse_mov(*this, dst, one); cc->emit(Inst::kIdDivpd, dst, src); } return; @@ -4157,7 +4372,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kSqrtF32S: case UniOpVV::kSqrtF64S: { - sse_mov(this, dst, src); + sse_mov(*this, dst, src); cc->emit(inst_id, dst, dst); return; } @@ -4210,11 +4425,11 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ } } -void UniCompiler::emit_2v(UniOpVV op, const OpArray& dst_, const Operand_& src_) noexcept { emit_2v_t(this, op, dst_, src_); } -void UniCompiler::emit_2v(UniOpVV op, const OpArray& dst_, const OpArray& src_) noexcept { emit_2v_t(this, op, dst_, src_); } +void UniCompiler::emit_2v(UniOpVV op, const OpArray& dst_, const Operand_& src_) noexcept { emit_2v_t(*this, op, dst_, src_); } +void UniCompiler::emit_2v(UniOpVV op, const OpArray& dst_, const OpArray& src_) noexcept { emit_2v_t(*this, op, dst_, src_); } -// ujit::UniCompiler - Vector Instructions - Emit 2VI -// =================================================== +// ujit::UniCompiler - Vector Instructions - Emit 2VI +// ================================================== void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& src_, uint32_t imm) noexcept { ASMJIT_ASSERT(dst_.is_vec()); @@ -4249,7 +4464,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr case UniOpVVI::kSrlbU128: { // This instruction requires AVX-512 if the source is a memory operand. if (src.is_mem()) { - avx_mov(this, dst, src); + avx_mov(*this, dst, src); cc->emit(inst_id, dst, dst, imm); } else { @@ -4261,7 +4476,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr case UniOpVVI::kSraI64: { // Native operation requires AVX-512, which is not supported by the target. 
if (imm == 0) { - avx_mov(this, dst, src); + avx_mov(*this, dst, src); return; } @@ -4274,7 +4489,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr Vec tmp = new_similar_reg(dst, "@tmp"); if (src.is_mem()) { - avx_mov(this, dst, src); + avx_mov(*this, dst, src); src = dst; } @@ -4433,7 +4648,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr ASMJIT_ASSERT(inst_id != Inst::kIdNone); if (op_info.sse_op_count == 2) { - sse_mov(this, dst, src); + sse_mov(*this, dst, src); cc->emit(inst_id, dst, imm); return; } @@ -4449,7 +4664,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr case UniOpVVI::kSraI64: { // Intrinsic (SSE2). if (imm == 0) { - sse_mov(this, dst, src); + sse_mov(*this, dst, src); return; } @@ -4462,15 +4677,15 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr Vec tmp = new_similar_reg(dst, "@tmp"); if (imm <= 32 && has_sse4_1()) { - sse_mov(this, dst, src); - sse_mov(this, tmp, src.is_reg() ? src.as() : dst); + sse_mov(*this, dst, src); + sse_mov(*this, tmp, src.is_reg() ? src.as() : dst); cc->emit(Inst::kIdPsrad, tmp, Support::min(imm, 31u)); cc->emit(Inst::kIdPsrlq, dst, imm); cc->emit(Inst::kIdPblendw, dst, tmp, 0xCC); return; } - sse_mov(this, dst, src); + sse_mov(*this, dst, src); cc->emit(Inst::kIdPshufd, tmp, src.is_reg() ? src.as() : dst, x86::shuffle_imm(3, 3, 1, 1)); cc->emit(Inst::kIdPsrad, tmp, 31); cc->emit(Inst::kIdPsrlq, dst, imm); @@ -4503,7 +4718,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr case UniOpVVI::kSwizzleU64x2: { // Intrinsic (SSE2 | SSE3). if (Swizzle2{imm} == swizzle(1, 0)) { - sse_mov(this, dst, src); + sse_mov(*this, dst, src); } else if (Swizzle2{imm} == swizzle(0, 0) && has_sse3()) { cc->emit(Inst::kIdMovddup, dst, src); @@ -4534,7 +4749,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr case UniOpVVI::kSwizzleF64x2: { // Intrinsic (SSE2 | SSE3). 
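The kSraI64 fallback above composes the missing 64-bit arithmetic shift from a logical shift (PSRLQ) plus sign bits derived from PSRAD(…, 31). The same decomposition in scalar form (a sketch, not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

static int64_t sra64_emulated(int64_t x, unsigned imm) {
  if (imm == 0)
    return x;                                                      // matches the early-out
  uint64_t logical   = uint64_t(x) >> imm;                         // PSRLQ
  uint64_t sign_fill = x < 0 ? ~uint64_t(0) << (64u - imm) : 0u;   // from PSRAD(31)
  return int64_t(logical | sign_fill);
}

int main() {
  assert(sra64_emulated(-256, 4) == -16);
  assert(sra64_emulated(256, 4) == 16);
  assert(sra64_emulated(-1, 63) == -1);
  return 0;
}
```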
if (Swizzle2{imm} == swizzle(1, 0)) { - sse_mov(this, dst, src); + sse_mov(*this, dst, src); } else if (Swizzle2{imm} == swizzle(0, 0) && has_sse3()) { cc->emit(Inst::kIdMovddup, dst, src); @@ -4575,11 +4790,11 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr } } -void UniCompiler::emit_2vi(UniOpVVI op, const OpArray& dst_, const Operand_& src_, uint32_t imm) noexcept { emit_2vi_t(this, op, dst_, src_, imm); } -void UniCompiler::emit_2vi(UniOpVVI op, const OpArray& dst_, const OpArray& src_, uint32_t imm) noexcept { emit_2vi_t(this, op, dst_, src_, imm); } +void UniCompiler::emit_2vi(UniOpVVI op, const OpArray& dst_, const Operand_& src_, uint32_t imm) noexcept { emit_2vi_t(*this, op, dst_, src_, imm); } +void UniCompiler::emit_2vi(UniOpVVI op, const OpArray& dst_, const OpArray& src_, uint32_t imm) noexcept { emit_2vi_t(*this, op, dst_, src_, imm); } -// ujit::UniCompiler - Vector Instructions - Emit 2VS -// =================================================== +// ujit::UniCompiler - Vector Instructions - Emit 2VS +// ================================================== void UniCompiler::emit_2vs(UniOpVR op, const Operand_& dst_, const Operand_& src_, uint32_t idx) noexcept { UniOpVInfo op_info = opcode_info_2vs[size_t(op)]; @@ -4867,8 +5082,8 @@ void UniCompiler::emit_2vs(UniOpVR op, const Operand_& dst_, const Operand_& src } } -// ujit::UniCompiler - Vector Instructions - Emit 2VM -// =================================================== +// ujit::UniCompiler - Vector Instructions - Emit 2VM +// ================================================== void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignment alignment, uint32_t idx) noexcept { ASMJIT_ASSERT(dst_.is_vec()); @@ -4886,7 +5101,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen case UniOpVM::kLoad8: { dst = dst.xmm(); src.set_size(1); - avx_zero(this, dst); + avx_zero(*this, dst); cc->vpinsrb(dst, dst, src, 0); return; } @@ -4895,7 +5110,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen if (!has_avx512_fp16()) { dst = dst.xmm(); src.set_size(1); - avx_zero(this, dst); + avx_zero(*this, dst); cc->vpinsrw(dst, dst, src, 0); } [[fallthrough]]; @@ -5147,7 +5362,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen case UniOpVM::kLoadCvt32_U32ToU64: { src.set_size(4); cc->vmovd(dst, src); - sse_int_widen(this, dst, dst, WideningOp(op_info.cvt)); + sse_int_widen(*this, dst, dst, WideningOp(op_info.cvt)); return; } @@ -5170,7 +5385,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen } else { cc->movq(dst, src); - sse_int_widen(this, dst, dst, WideningOp(op_info.cvt)); + sse_int_widen(*this, dst, dst, WideningOp(op_info.cvt)); } return; } @@ -5651,8 +5866,8 @@ void UniCompiler::emit_mv(UniOpMV op, const Mem& dst_, const OpArray& src_, Alig } } -// ujit::UniCompiler - Vector Instructions - Emit 3V -// ================================================== +// ujit::UniCompiler - Vector Instructions - Emit 3V +// ================================================= void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src1_, const Operand_& src2_) noexcept { ASMJIT_ASSERT(dst_.is_vec()); @@ -5678,9 +5893,9 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src if (is_same_vec(src1v, src2)) { switch (SameVecOp(op_info.same_vec_op)) { - case SameVecOp::kZero: avx_zero(this, dst); return; - case 
SameVecOp::kOnes: avx_ones(this, dst); return; - case SameVecOp::kSrc: avx_mov(this, dst, src1v); return; + case SameVecOp::kZero: avx_zero(*this, dst); return; + case SameVecOp::kOnes: avx_ones(*this, dst); return; + case SameVecOp::kSrc: avx_mov(*this, dst, src1v); return; default: break; @@ -5691,8 +5906,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src ASMJIT_ASSERT(inst_id != Inst::kIdNone); FloatMode fm = FloatMode(op_info.float_mode); - - if (fm == FloatMode::kF32S || fm == FloatMode::kF64S) { + if (is_scalar_fp_op(fm)) { dst.set_signature(signature_of_xmm_ymm_zmm[0]); src1v.set_signature(signature_of_xmm_ymm_zmm[0]); @@ -5752,17 +5966,35 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src inst_id = Inst::kIdVpandn; if (src2.is_mem()) { - src2 = UniCompiler_loadNew(this, dst, src2.as(), op_info.broadcast_size); + src2 = UniCompiler_load_new(*this, dst, src2.as(), op_info.broadcast_size); } cc->emit(inst_id, dst, src2, src1v); return; } + // dst = a - (floor(a / b) * b). + case UniOpVVV::kModF32S: + case UniOpVVV::kModF64S: + case UniOpVVV::kModF32: + case UniOpVVV::kModF64: { + FloatMode fm = FloatMode(op_info.float_mode); + UniOpVV trunc_op = translate_op(op, UniOpVVV::kModF32S, UniOpVV::kTruncF32); + const FloatInst& fi = avx_float_inst[fm]; + + x86::Vec tmp = new_similar_reg(dst, "@mod_tmp"); + cc->emit(fi.fdiv, tmp, src1v, src2); + emit_2v(trunc_op, tmp, tmp); + cc->emit(fi.fmul, tmp, tmp, src2); + cc->emit(fi.fsub, dst, src1v, tmp); + + return; + } + case UniOpVVV::kMulU64: { // Native operation requires AVX512, which is not supported by the target. if (src2.is_mem()) { - src2 = UniCompiler_loadNew(this, dst, src2.as(), op_info.broadcast_size); + src2 = UniCompiler_load_new(*this, dst, src2.as(), op_info.broadcast_size); } Vec src2v = src2.as().clone_as(dst); @@ -5789,7 +6021,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src Vec tmp = new_similar_reg(dst.as(), "@tmp"); if (has_avx512()) { - Vec msk = simd_vec_const(&ct.p_FFFFFFFF00000000, Bcst::k64, dst); + Vec msk = simd_vec_const(&ct().p_FFFFFFFF00000000, Bcst::k64, dst); cc->emit(Inst::kIdVpandnq, tmp, msk, src2); cc->emit(Inst::kIdVpmullq, dst, src1v, tmp); } @@ -5807,7 +6039,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kMaxI64: { // Native operation requires AVX512, which is not supported by the target. 
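The new kModF32/kModF64 expansion is a - trunc(a/b) * b; despite the "floor" wording in the comment, the sequence uses the truncating round op, so the remainder keeps the sign of the dividend like C's fmod(). A scalar sketch (not part of the patch; unlike a true fmod(), the division step can lose precision for very large quotients):

```cpp
#include <cassert>
#include <cmath>

static float mod_via_trunc(float a, float b) {
  return a - std::truncf(a / b) * b;  // fdiv, trunc, fmul, fsub
}

int main() {
  assert(mod_via_trunc(7.5f, 2.0f) == 1.5f);
  assert(mod_via_trunc(-7.5f, 2.0f) == -1.5f);  // sign follows the dividend
  assert(mod_via_trunc(7.5f, -2.0f) == 1.5f);
  return 0;
}
```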
if (src2.is_mem()) { - src2 = UniCompiler_loadNew(this, dst, src2.as(), op_info.broadcast_size); + src2 = UniCompiler_load_new(*this, dst, src2.as(), op_info.broadcast_size); } ASMJIT_ASSERT(src2.is_vec()); @@ -5829,7 +6061,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kMinU64: case UniOpVVV::kMaxU64: { if (src2.is_mem()) { - src2 = UniCompiler_loadNew(this, dst, src2.as(), op_info.broadcast_size); + src2 = UniCompiler_load_new(*this, dst, src2.as(), op_info.broadcast_size); } ASMJIT_ASSERT(src2.is_vec()); @@ -5842,8 +6074,8 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src tmp1 = new_similar_reg(dst, "@tmp1"); } - avx_isign_flip(this, tmp1, src1v, ElementSize::k64); - avx_isign_flip(this, tmp2, src2v, ElementSize::k64); + avx_isign_flip(*this, tmp1, src1v, ElementSize::k64); + avx_isign_flip(*this, tmp2, src2v, ElementSize::k64); cc->vpcmpgtq(tmp1, tmp1, tmp2); // tmp1 = src1 > src2 if (op == UniOpVVV::kMinU64) @@ -5867,19 +6099,19 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->emit(inst.pmin, dst, src1v, src2); cc->emit(inst.peq, dst, dst, src1v); } - avx_bit_not(this, dst, dst); + avx_bit_not(*this, dst, dst); return; } case UniOpVVV::kCmpGtU64: case UniOpVVV::kCmpLeU64: { Vec tmp = new_similar_reg(dst, "@tmp"); - avx_isign_flip(this, tmp, src2, ElementSize::k64); - avx_isign_flip(this, dst, src1v, ElementSize::k64); + avx_isign_flip(*this, tmp, src2, ElementSize::k64); + avx_isign_flip(*this, dst, src1v, ElementSize::k64); cc->emit(Inst::kIdVpcmpgtq, dst, dst, tmp); if (op == UniOpVVV::kCmpLeU64) { - avx_bit_not(this, dst, dst); + avx_bit_not(*this, dst, dst); } return; } @@ -5918,7 +6150,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpGeI64: { if (!src2.is_reg()) { Vec tmp = new_similar_reg(dst, "@tmp"); - avx_mov(this, tmp, src2); + avx_mov(*this, tmp, src2); src2 = tmp; } @@ -5926,7 +6158,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->emit(inst.pgt, dst, src2, src1v); if (op == UniOpVVV::kCmpGeI64) { - avx_bit_not(this, dst, dst); + avx_bit_not(*this, dst, dst); } return; } @@ -5937,14 +6169,14 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpLtU64: case UniOpVVV::kCmpGeU64: { Vec tmp = new_similar_reg(dst, "@tmp"); - avx_isign_flip(this, tmp, src2, ElementSize(op_info.element_size)); - avx_isign_flip(this, dst, src1v, ElementSize(op_info.element_size)); + avx_isign_flip(*this, tmp, src2, ElementSize(op_info.element_size)); + avx_isign_flip(*this, dst, src1v, ElementSize(op_info.element_size)); CmpMinMaxInst inst = avx_cmp_min_max[(size_t(op) - size_t(UniOpVVV::kCmpLtI8)) & 0x7u]; cc->emit(inst.pgt, dst, tmp, dst); if (op == UniOpVVV::kCmpGeU64) { - avx_bit_not(this, dst, dst); + avx_bit_not(*this, dst, dst); } return; } @@ -5979,7 +6211,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpLeI64: { cc->emit(Inst::kIdVpcmpgtq, dst, src1v, src2); - avx_bit_not(this, dst, dst); + avx_bit_not(*this, dst, dst); return; } @@ -6004,7 +6236,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src // Intrinsic - dst = {src1.u64[0], src2.64[1]} - combining low part of src1 and high part of src1. 
if (!src2.is_reg()) { Vec tmp = new_similar_reg(dst, "@tmp"); - avx_mov(this, tmp, src2); + avx_mov(*this, tmp, src2); src2 = tmp; } @@ -6026,7 +6258,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src if (!src2.is_reg()) { Vec tmp = new_similar_reg(dst, "@tmp"); - avx_mov(this, tmp, src2); + avx_mov(*this, tmp, src2); src2 = tmp; } @@ -6072,7 +6304,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src return; case SameVecOp::kSrc: - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); return; default: @@ -6086,11 +6318,11 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src if (!is_same_vec(dst, src1v)) { if (is_same_vec(dst, src2)) { Vec tmp = new_similar_reg(dst, "tmp"); - sse_mov(this, tmp, src2); + sse_mov(*this, tmp, src2); src2 = tmp; } - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); } if (op_info.use_imm) @@ -6112,15 +6344,37 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src if (is_same_vec(dst, src1v)) { Vec tmp = new_similar_reg(dst); - sse_mov(this, tmp, src1v); + sse_mov(*this, tmp, src1v); src1v = tmp; } - sse_mov(this, dst, src2); + sse_mov(*this, dst, src2); cc->emit(inst_id, dst, src1v); return; } + // dst = a - (floor(a / b) * b). + case UniOpVVV::kModF32S: + case UniOpVVV::kModF64S: + case UniOpVVV::kModF32: + case UniOpVVV::kModF64: { + FloatMode fm = FloatMode(op_info.float_mode); + UniOpVV trunc_op = translate_op(op, UniOpVVV::kModF32S, UniOpVV::kTruncF32); + const FloatInst& fi = sse_float_inst[fm]; + + x86::Vec tmp = new_similar_reg(dst, "@mod_tmp"); + + cc->emit(fi.fmova, tmp, src1v); + cc->emit(fi.fdiv, tmp, src2); + + emit_2v(trunc_op, tmp, tmp); + cc->emit(fi.fmul, tmp, src2); + + sse_fmov(*this, dst, src1v, fm); + cc->emit(fi.fsub, dst, tmp); + return; + } + case UniOpVVV::kMulU32: { // Native operation requires SSE4.1, which is not supported by the target. 
Vec tmp1 = new_similar_reg(dst, "tmp1"); @@ -6130,7 +6384,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->emit(Inst::kIdPshufd, tmp2, src2, x86::shuffle_imm(3, 3, 1, 1)); cc->emit(Inst::kIdPmuludq, tmp1, tmp2); - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPmuludq, dst, src2); cc->emit(Inst::kIdShufps, dst, tmp1, x86::shuffle_imm(2, 0, 2, 0)); cc->emit(Inst::kIdPshufd, dst, dst, x86::shuffle_imm(3, 1, 2, 0)); @@ -6149,7 +6403,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->emit(Inst::kIdPmuludq, ah_bl, src2); cc->emit(Inst::kIdPaddq, al_bh, ah_bl); - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPmuludq, dst, src2); cc->emit(Inst::kIdPsllq, al_bh, 32); cc->emit(Inst::kIdPaddq, dst, al_bh); @@ -6166,7 +6420,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->emit(Inst::kIdPmuludq, dst, src1v); } else { - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPmuludq, dst, src2); } cc->emit(Inst::kIdPsllq, tmp, 32); @@ -6179,8 +6433,8 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kMinI64: if (!has_sse4_2()) { Vec msk = new_vec128("@msk"); - sse_cmp_gt_i64(this, msk, src2, src1v); - sse_select(this, dst, src1v, src2, msk); + sse_cmp_gt_i64(*this, msk, src2, src1v); + sse_select(*this, dst, src1v, src2, msk); return; } [[fallthrough]]; @@ -6193,7 +6447,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src Vec msk = new_vec128("@msk"); cc->emit(Inst::kIdMovaps, msk, src2); cc->emit(cmp_inst_id, msk, src1v); - sse_select(this, dst, src1v, src2, msk); + sse_select(*this, dst, src1v, src2, msk); return; } @@ -6201,8 +6455,8 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src // Native operation requires AVX512, which is not supported by the target. if (!has_sse4_2()) { Vec msk = new_vec128("@msk"); - sse_cmp_gt_i64(this, msk, src1v, src2); - sse_select(this, dst, src1v, src2, msk); + sse_cmp_gt_i64(*this, msk, src1v, src2); + sse_select(*this, dst, src1v, src2, msk); return; } [[fallthrough]]; @@ -6215,7 +6469,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src Vec msk = new_vec128("@msk"); cc->emit(Inst::kIdMovaps, msk, src1v); cc->emit(cmp_inst_id, msk, src2); - sse_select(this, dst, src1v, src2, msk); + sse_select(*this, dst, src1v, src2, msk); return; } @@ -6224,14 +6478,14 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src Vec tmp = new_vec128("@tmp"); cc->emit(Inst::kIdMovaps, tmp, src1v); cc->emit(Inst::kIdPsubusw, tmp, src2); - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPsubw, dst, tmp); return; } case UniOpVVV::kMaxU16: { // Native operation requires SSE4.1, which is not supported by the target. - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPsubusw, dst, src2); cc->emit(Inst::kIdPaddw, dst, src2); return; @@ -6240,47 +6494,47 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kMinU32: case UniOpVVV::kMaxU32: { // Native operation requires SSE4.1, which is not supported by the target. 
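The kMulU64 fallback above assembles the low 64 bits of a 64x64 product from three PMULUDQ-style 32x32->64 multiplies: lo*lo + ((lo*hi + hi*lo) << 32). A scalar sketch of that decomposition (not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

static uint64_t mul64_from_32x32(uint64_t a, uint64_t b) {
  uint64_t al = a & 0xFFFFFFFFu, ah = a >> 32;
  uint64_t bl = b & 0xFFFFFFFFu, bh = b >> 32;
  uint64_t al_bh = al * bh;   // PMULUDQ with swapped halves
  uint64_t ah_bl = ah * bl;
  uint64_t al_bl = al * bl;
  return al_bl + ((al_bh + ah_bl) << 32);  // ah*bh*2^64 falls out of the low half
}

int main() {
  assert(mul64_from_32x32(0xDEADBEEFCAFEBABEull, 0x0123456789ABCDEFull)
         == 0xDEADBEEFCAFEBABEull * 0x0123456789ABCDEFull);
  return 0;
}
```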
- Operand flip_mask = simd_const(&ct.p_8000000080000000, Bcst::kNA, dst); + Operand flip_mask = simd_const(&ct().p_8000000080000000, Bcst::kNA, dst); Vec tmp1 = new_similar_reg(dst, "@tmp1"); Vec tmp2 = new_similar_reg(dst, "@tmp2"); if (op == UniOpVVV::kMinU32) { - sse_mov(this, tmp1, src2); - sse_mov(this, tmp2, src1v); + sse_mov(*this, tmp1, src2); + sse_mov(*this, tmp2, src1v); } else { - sse_mov(this, tmp1, src1v); - sse_mov(this, tmp2, src2); + sse_mov(*this, tmp1, src1v); + sse_mov(*this, tmp2, src2); } cc->emit(Inst::kIdPxor, tmp1, flip_mask); cc->emit(Inst::kIdPxor, tmp2, flip_mask); cc->emit(Inst::kIdPcmpgtd, tmp1, tmp2); - sse_select(this, dst, src1v, src2, tmp1); + sse_select(*this, dst, src1v, src2, tmp1); return; } case UniOpVVV::kMinU64: { // Native operation requires AVX512, which is not supported by the target. Vec msk = new_similar_reg(dst, "@tmp1"); - sse_cmp_gt_u64(this, msk, src2, src1v); - sse_select(this, dst, src1v, src2, msk); + sse_cmp_gt_u64(*this, msk, src2, src1v); + sse_select(*this, dst, src1v, src2, msk); return; } case UniOpVVV::kMaxU64: { // Native operation requires AVX512, which is not supported by the target. Vec msk = new_similar_reg(dst, "@tmp1"); - sse_cmp_gt_u64(this, msk, src1v, src2); - sse_select(this, dst, src1v, src2, msk); + sse_cmp_gt_u64(*this, msk, src1v, src2); + sse_select(*this, dst, src1v, src2, msk); return; } case UniOpVVV::kCmpEqU64: { // Native operation requires SSE4.1, which is not supported by the target. Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPcmpeqd, dst, src2); cc->emit(Inst::kIdPshufd, tmp, dst, x86::shuffle_imm(2, 3, 0, 1)); cc->emit(Inst::kIdPand, dst, tmp); @@ -6289,7 +6543,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpGtI64: { // Native operation requires SSE4.2, which is not supported by the target. - sse_cmp_gt_i64(this, dst, src1v, src2); + sse_cmp_gt_i64(*this, dst, src1v, src2); return; } @@ -6315,20 +6569,20 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->emit(inst.peq, dst, src1v); } - sse_bit_not(this, dst, dst); + sse_bit_not(*this, dst, dst); return; } Vec tmp = new_similar_reg(dst, "@tmp"); - sse_msb_flip(this, tmp, src2, ElementSize(op_info.element_size)); - sse_msb_flip(this, dst, src1v, ElementSize(op_info.element_size)); + sse_msb_flip(*this, tmp, src2, ElementSize(op_info.element_size)); + sse_msb_flip(*this, dst, src1v, ElementSize(op_info.element_size)); cc->emit(inst.pgt, dst, tmp); return; } case UniOpVVV::kCmpGtU64: { // Native operation requires AVX512, which is not supported by the target. - sse_cmp_gt_u64(this, dst, src1v, src2); + sse_cmp_gt_u64(*this, dst, src1v, src2); return; } @@ -6363,11 +6617,11 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src if (op == UniOpVVV::kCmpGeU16) { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src1v); + sse_mov(*this, tmp, src1v); cc->emit(Inst::kIdPsubusw, tmp, src2); cc->emit(Inst::kIdPaddw, tmp, src2); - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPcmpeqw, dst, tmp); return; } @@ -6378,7 +6632,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src // Native operation requires AVX512, which is not supported by the target. 
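The kCmpEqU64 fallback above uses only PCMPEQD: a 64-bit lane is equal iff both of its dwords are equal, so the dword masks are intersected after a PSHUFD(2,3,0,1) swap. A scalar sketch (not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

static uint64_t cmpeq64_via_cmpeq32(uint64_t a, uint64_t b) {
  uint32_t eq_lo = (uint32_t(a) == uint32_t(b)) ? 0xFFFFFFFFu : 0u;  // PCMPEQD
  uint32_t eq_hi = ((a >> 32) == (b >> 32))     ? 0xFFFFFFFFu : 0u;
  uint32_t both  = eq_lo & eq_hi;                                    // PSHUFD swap + PAND
  return (uint64_t(both) << 32) | both;
}

int main() {
  assert(cmpeq64_via_cmpeq32(42, 42) == ~uint64_t(0));
  assert(cmpeq64_via_cmpeq32(42, 43) == 0);
  assert(cmpeq64_via_cmpeq32(0x1234567800000000ull, 0x9999999900000000ull) == 0);
  return 0;
}
```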
if (src2.is_mem()) { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src2); + sse_mov(*this, tmp, src2); src2 = tmp; } @@ -6393,7 +6647,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src ASMJIT_NOT_REACHED(); } - sse_bit_not(this, dst, dst); + sse_bit_not(*this, dst, dst); return; case UniOpVVV::kCmpLtI8: @@ -6401,11 +6655,11 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpLtI32: { if (is_same_vec(dst, src1v)) { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src1v); + sse_mov(*this, tmp, src1v); src1v = tmp; } - sse_mov(this, dst, src2); + sse_mov(*this, dst, src2); cc->emit(inst_id, dst, src1v); return; } @@ -6414,36 +6668,36 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpLtU16: case UniOpVVV::kCmpLtU32: { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src1v); - sse_msb_flip(this, tmp, src1v, ElementSize(op_info.element_size)); - sse_msb_flip(this, dst, src2, ElementSize(op_info.element_size)); + sse_mov(*this, tmp, src1v); + sse_msb_flip(*this, tmp, src1v, ElementSize(op_info.element_size)); + sse_msb_flip(*this, dst, src2, ElementSize(op_info.element_size)); cc->emit(inst_id, dst, tmp); return; } case UniOpVVV::kCmpLtI64: { // Native operation requires AVX512, which is not supported by the target. - sse_cmp_gt_i64(this, dst, src2, src1v); + sse_cmp_gt_i64(*this, dst, src2, src1v); return; } case UniOpVVV::kCmpLtU64: { // Native operation requires AVX512, which is not supported by the target. - sse_cmp_gt_u64(this, dst, src2, src1v); + sse_cmp_gt_u64(*this, dst, src2, src1v); return; } case UniOpVVV::kCmpLeU8: { if (is_same_vec(dst, src2)) { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src2); + sse_mov(*this, tmp, src2); src2 = tmp; } - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPsubusb, dst, src2); - Vec zeros = simd_vec_const(&ct.p_0000000000000000, Bcst::k32, dst); + Vec zeros = simd_vec_const(&ct().p_0000000000000000, Bcst::k32, dst); cc->emit(Inst::kIdPcmpeqb, dst, zeros); return; } @@ -6489,7 +6743,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src ASMJIT_NOT_REACHED(); } - sse_bit_not(this, dst, dst); + sse_bit_not(*this, dst, dst); return; case UniOpVVV::kCmpLtF32S: @@ -6506,8 +6760,8 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src // Unfortunately we have to do two moves, because there are no predicates that // we could use in case of reversed operands (AVX is much better in this regard). 
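The unsigned-compare fallbacks above hinge on saturating subtraction: kCmpLeU8 tests PSUBUSB against zero (a <= b iff the saturated difference is zero), and kCmpGeU16 rebuilds max(a, b) via PSUBUSW + PADDW and compares it with a. A scalar sketch (not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

static bool le_u8(uint8_t a, uint8_t b) {
  uint8_t sat = uint8_t(a > b ? a - b : 0);  // PSUBUSB
  return sat == 0;                           // PCMPEQB against zero
}

static bool ge_u16(uint16_t a, uint16_t b) {
  uint16_t sat    = uint16_t(a > b ? a - b : 0);  // PSUBUSW
  uint16_t max_ab = uint16_t(sat + b);            // PADDW -> max(a, b)
  return max_ab == a;                             // PCMPEQW
}

int main() {
  for (unsigned a = 0; a < 256; a++)
    for (unsigned b = 0; b < 256; b++) {
      assert(le_u8(uint8_t(a), uint8_t(b)) == (a <= b));
      assert(ge_u16(uint16_t(a), uint16_t(b)) == (a >= b));
    }
  return 0;
}
```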
Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src2); - sse_mov(this, dst, src1v); + sse_mov(*this, tmp, src2); + sse_mov(*this, dst, src1v); cc->emit(inst_id, dst, tmp, pred); return; } @@ -6530,7 +6784,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpUnordF32: case UniOpVVV::kCmpUnordF64: { uint8_t pred = sse_fcmp_imm_table[(size_t(op) - size_t(UniOpVVV::kCmpEqF32S)) / 4u]; - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(inst_id, dst, src2, pred); return; } @@ -6547,14 +6801,14 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src uint8_t pred = sse_fcmp_imm_table[(size_t(op) - size_t(UniOpVVV::kCmpEqF32S)) / 4u]; if (dst.id() != src1v.id()) { - sse_mov(this, dst, src2); + sse_mov(*this, dst, src2); cc->emit(inst_id, dst, src1v, pred); } else { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src2); + sse_mov(*this, tmp, src2); cc->emit(inst_id, tmp, src1v, pred); - sse_mov(this, dst, tmp); + sse_mov(*this, dst, tmp); } return; } @@ -6579,7 +6833,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src if (src2.is_mem()) { Mem m(src2.as()); - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); v_swap_f64(tmp, dst); cc->movhpd(dst, m); @@ -6588,16 +6842,16 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->addpd(dst, tmp); } else if (is_same_vec(dst, src2)) { - sse_mov(this, tmp, src1v); + sse_mov(*this, tmp, src1v); cc->unpcklpd(tmp, src2.as()); cc->movhlps(dst, src1v); cc->addpd(dst, tmp); } else { - sse_mov(this, tmp, src1v); + sse_mov(*this, tmp, src1v); cc->unpckhpd(tmp, src2.as()); - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->unpcklpd(dst, src2.as()); cc->addpd(dst, tmp.as()); @@ -6642,7 +6896,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCombineHiLoF64: { // Intrinsic - dst = {src1.u64[1], src2.64[0]} - combining high part of src1 and low part of src2. if (src2.is_mem()) { - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdMovlpd, dst, src2); } else if (is_same_vec(dst, src2)) { @@ -6651,7 +6905,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } else { // dst = {src1.u64[1], src2.u64[0]} - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdMovsd, dst, src2); } return; @@ -6669,15 +6923,15 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src // In general, if you hit this code-path (not having SSE4.1 and still needing exactly this instruction) I would // recommend using a different strategy in this case, completely avoiding this code path. Usually, inputs are not // arbitrary and knowing the range could help a lot to reduce the approach to use a native 'packssdw' instruction. 
- Operand bias = simd_const(&ct.p_0000800000008000, Bcst::kNA, dst); - Operand unbias = simd_const(&ct.p_8000800080008000, Bcst::kNA, dst); + Operand bias = simd_const(&ct().p_0000800000008000, Bcst::kNA, dst); + Operand unbias = simd_const(&ct().p_8000800080008000, Bcst::kNA, dst); if (is_same_vec(src1v, src2)) { Vec tmp = dst; if (is_same_vec(dst, src1v)) tmp = new_similar_reg(dst, "@tmp1"); - sse_mov(this, tmp, src1v); + sse_mov(*this, tmp, src1v); cc->emit(Inst::kIdPsrad, tmp, 31); cc->emit(Inst::kIdPandn, tmp, src1v); @@ -6685,14 +6939,14 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->emit(Inst::kIdPackssdw, tmp, tmp); cc->emit(Inst::kIdPaddw, tmp, unbias); - sse_mov(this, dst, tmp); + sse_mov(*this, dst, tmp); } else { Vec tmp1 = new_similar_reg(dst, "@tmp1"); Vec tmp2 = new_similar_reg(dst, "@tmp2"); - sse_mov(this, tmp1, src1v); - sse_mov(this, tmp2, src2); + sse_mov(*this, tmp1, src1v); + sse_mov(*this, tmp2, src2); cc->emit(Inst::kIdPsrad, tmp1, 31); cc->emit(Inst::kIdPsrad, tmp2, 31); @@ -6703,7 +6957,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->emit(Inst::kIdPackssdw, tmp1, tmp2); cc->emit(Inst::kIdPaddw, tmp1, unbias); - sse_mov(this, dst, tmp1); + sse_mov(*this, dst, tmp1); } return; } @@ -6723,9 +6977,9 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src // The trick is to AND all indexes by 0x0F and then to do unsigned minimum so all indexes are in [0, 17) range, // where index 16 maps to zero. Vec tmp = new_similar_reg(dst, "@tmp"); - cc->vmovaps(tmp, simd_mem_const(&ct.p_0F0F0F0F0F0F0F0F, Bcst::kNA, tmp)); + cc->vmovaps(tmp, simd_mem_const(&ct().p_0F0F0F0F0F0F0F0F, Bcst::kNA, tmp)); cc->pand(tmp, src2.as()); - cc->pminub(tmp, simd_mem_const(&ct.p_1010101010101010, Bcst::kNA, tmp)); + cc->pminub(tmp, simd_mem_const(&ct().p_1010101010101010, Bcst::kNA, tmp)); cc->movaps(m_pred, tmp); cc->mov(m_data.clone_adjusted(16), 0); @@ -6758,12 +7012,12 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } } -void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_) noexcept { emit_3v_t(this, op, dst_, src1_, src2_); } -void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_) noexcept { emit_3v_t(this, op, dst_, src1_, src2_); } -void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_) noexcept { emit_3v_t(this, op, dst_, src1_, src2_); } +void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_) noexcept { emit_3v_t(*this, op, dst_, src1_, src2_); } +void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_) noexcept { emit_3v_t(*this, op, dst_, src1_, src2_); } +void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_) noexcept { emit_3v_t(*this, op, dst_, src1_, src2_); } -// ujit::UniCompiler - Vector Instructions - Emit 3VI -// =================================================== +// ujit::UniCompiler - Vector Instructions - Emit 3VI +// ================================================== void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& src1_, const Operand_& src2_, uint32_t imm) noexcept { ASMJIT_ASSERT(dst_.is_vec()); @@ -6791,7 +7045,7 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s // 
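The biased PACKSSDW sequence above maps unsigned saturation onto the signed saturation SSE2 actually provides: clamp negatives to zero (PSRAD 31 + PANDN), subtract 0x8000, pack with signed saturation, add 0x8000 back. A scalar sketch of one lane (not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

static uint16_t pack_u16_sat_via_packssdw(int32_t x) {
  int32_t non_neg = x < 0 ? 0 : x;                    // PSRAD 31 + PANDN
  int32_t biased  = non_neg - 0x8000;                 // PSUBD with p_0000800000008000
  int32_t packed  = biased < -32768 ? -32768
                  : biased >  32767 ?  32767 : biased; // PACKSSDW (signed clamp)
  return uint16_t(uint32_t(packed) + 0x8000u);        // PADDW with p_8000800080008000
}

int main() {
  assert(pack_u16_sat_via_packssdw(-5) == 0);
  assert(pack_u16_sat_via_packssdw(100) == 100);
  assert(pack_u16_sat_via_packssdw(65535) == 65535);
  assert(pack_u16_sat_via_packssdw(70000) == 65535);
  return 0;
}
```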
Intrin - short-circuit if possible based on the predicate. case UniOpVVVI::kAlignr_U128: { if (imm == 0) { - avx_mov(this, dst, src2); + avx_mov(*this, dst, src2); return; } @@ -6896,7 +7150,7 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s // Intrin - short-circuit if possible based on the predicate. case UniOpVVVI::kAlignr_U128: { if (imm == 0) { - sse_mov(this, dst, src2); + sse_mov(*this, dst, src2); return; } @@ -6913,11 +7167,11 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s if (has_ssse3()) { if (is_same_vec(dst, src2) && !is_same_vec(dst, src1v)) { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src2); + sse_mov(*this, tmp, src2); src2 = tmp; } - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPalignr, dst, src2, imm); return; } @@ -6927,13 +7181,13 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s uint32_t src2_shift = imm; if (is_same_vec(dst, src1v)) { - sse_mov(this, tmp, src2); + sse_mov(*this, tmp, src2); cc->emit(Inst::kIdPsrldq, tmp, src2_shift); cc->emit(Inst::kIdPslldq, dst, src1_shift); } else { - sse_mov(this, tmp, src1v); - sse_mov(this, dst, src2); + sse_mov(*this, tmp, src1v); + sse_mov(*this, dst, src2); cc->emit(Inst::kIdPslldq, tmp, src1_shift); cc->emit(Inst::kIdPsrldq, dst, src2_shift); } @@ -6956,7 +7210,7 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s shuf_imm = shuf_imm2_from_swizzle(Swizzle2{imm}); if (is_same_vec(src1v, src2)) { - UniOpVVI vvi_op = UniOpVVI(uint32_t(UniOpVVI::kSwizzleU32x4) + (uint32_t(op) - uint32_t(UniOpVVVI::kInterleaveShuffleU32x4))); + UniOpVVI vvi_op = translate_op(op, UniOpVVVI::kInterleaveShuffleU32x4, UniOpVVI::kSwizzleU32x4); emit_2vi(vvi_op, dst, src1v, imm); return; } @@ -6975,7 +7229,7 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s cc->emit(Inst::kIdPshufd, dst, dst, x86::shuffle_imm(1, 0, 3, 2)); } else { - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(inst_id, dst, src2, shuf_imm); } return; @@ -6998,12 +7252,12 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s } } -void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, uint32_t imm) noexcept { emit_3vi_t(this, op, dst_, src1_, src2_, imm); } -void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, uint32_t imm) noexcept { emit_3vi_t(this, op, dst_, src1_, src2_, imm); } -void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, uint32_t imm) noexcept { emit_3vi_t(this, op, dst_, src1_, src2_, imm); } +void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, uint32_t imm) noexcept { emit_3vi_t(*this, op, dst_, src1_, src2_, imm); } +void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, uint32_t imm) noexcept { emit_3vi_t(*this, op, dst_, src1_, src2_, imm); } +void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, uint32_t imm) noexcept { emit_3vi_t(*this, op, dst_, src1_, src2_, imm); } -// ujit::UniCompiler - Vector Instructions - Emit 4V -// ================================================== +// ujit::UniCompiler - Vector Instructions - Emit 4V +// ================================================= void 
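The kAlignr_U128 fallback above builds PALIGNR from byte shifts: dst = (src2 >> 8*imm) | (src1 << 8*(16-imm)), i.e. the PSRLDQ/PSLLDQ pair it emits. A scalar sketch, modeling a 128-bit lane as a byte array with index 0 as the lowest byte (not part of the patch):

```cpp
#include <array>
#include <cassert>
#include <cstdint>

using Bytes16 = std::array<uint8_t, 16>;

static Bytes16 alignr_emulated(const Bytes16& src1, const Bytes16& src2, unsigned imm) {
  Bytes16 out{};
  for (unsigned i = 0; i < 16; i++) {
    unsigned j = i + imm;                      // index into the src1:src2 concatenation
    out[i] = j < 16 ? src2[j] : src1[j - 16];  // PSRLDQ(src2, imm) | PSLLDQ(src1, 16-imm)
  }
  return out;
}

int main() {
  Bytes16 a{}, b{};
  for (unsigned i = 0; i < 16; i++) { a[i] = uint8_t(0xA0 + i); b[i] = uint8_t(i); }
  Bytes16 r = alignr_emulated(a, b, 4);
  assert(r[0] == 4 && r[11] == 15 && r[12] == 0xA0 && r[15] == 0xA3);
  return 0;
}
```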
UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& src1_, const Operand_& src2_, const Operand_& src3_) noexcept { ASMJIT_ASSERT(dst_.is_vec()); @@ -7035,7 +7289,7 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr switch (op) { case UniOpVVVV::kBlendV_U8: { // Blend(a, b, cond) == (a & ~cond) | (b & cond) - avx_make_vec(this, src3, dst, "msk"); + avx_make_vec(*this, src3, dst, "msk"); cc->emit(op_info.avx_inst_id, dst, src1, src2, src3); return; } @@ -7100,7 +7354,7 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr size_t fma_id = size_t(op) - size_t(UniOpVVVV::kMAddF32S); FloatMode fm = FloatMode(op_info.float_mode); - if (fm == FloatMode::kF32S || fm == FloatMode::kF64S) { + if (is_scalar_fp_op(fm)) { dst.set_signature(signature_of_xmm_ymm_zmm[0]); src1.set_signature(signature_of_xmm_ymm_zmm[0]); @@ -7150,7 +7404,7 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr cc->emit(fma_bc_add_a[fma_id], dst, src1, src2); } else { - avx_mov(this, dst, src1); + avx_mov(*this, dst, src1); if (!src2.is_reg()) cc->emit(fma_ac_add_b[fma_id], dst, src3, src2); else if (!src3.is_reg()) @@ -7183,7 +7437,7 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr else { // NMAdd or NMSub Operation. Vec tmp = new_similar_reg(dst, "@tmp"); - avx_fsign_flip(this, tmp, src1, fm); + avx_fsign_flip(*this, tmp, src1, fm); cc->emit(fi.fmul, tmp, tmp, src2); cc->emit(fi_facc, dst, tmp, src3); @@ -7205,8 +7459,8 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr // Blend(a, b, cond) == (a & ~cond) | (b & cond) if (has_sse4_1()) { if (is_same_vec(dst, src1) || (!is_same_vec(dst, src2) && !is_same_vec(dst, src3))) { - sse_make_vec(this, src3, "tmp"); - sse_mov(this, dst, src1); + sse_make_vec(*this, src3, "tmp"); + sse_mov(*this, dst, src1); cc->emit(op_info.sse_inst_id, dst, src2, src3); return; } @@ -7277,7 +7531,7 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr if (is_same_vec(dst, src2)) { // Unfortunately, to follow the FMA behavior in scalar case, we have to copy. if (fm <= FloatMode::kF64S) - src2 = sse_copy(this, src2.as(), "@copy_src2"); + src2 = sse_copy(*this, src2.as(), "@copy_src2"); else std::swap(src1, src2.as()); } @@ -7288,11 +7542,11 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr if (is_same_vec(dst, src3)) { if (fm <= FloatMode::kF64S || !mul_add) { // Copy if we couldn't avoid the extra move. - src3 = sse_copy(this, src3.as(), "@copy_src3"); + src3 = sse_copy(*this, src3.as(), "@copy_src3"); } else { Vec tmp = cc->new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src1); + sse_mov(*this, tmp, src1); cc->emit(fi.fmul, tmp, src2); cc->emit(neg_mul ? 
fi.fsub : fi.fadd, dst, tmp); return; @@ -7300,9 +7554,9 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr } if (neg_mul) - sse_fsign_flip(this, dst, src1, fm); + sse_fsign_flip(*this, dst, src1, fm); else - sse_mov(this, dst, src1); + sse_mov(*this, dst, src1); cc->emit(fi.fmul, dst, src2); cc->emit(fi_facc, dst, src3); @@ -7315,13 +7569,13 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr } } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const Operand_& src2_, const OpArray& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, const Operand& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, const OpArray& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, const Operand& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, const OpArray& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const Operand& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const OpArray& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } +void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const Operand_& src2_, const OpArray& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); } +void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, const Operand& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); } +void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, const OpArray& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); } +void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, const Operand& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); } +void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, const OpArray& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); } +void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const Operand& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); } +void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const OpArray& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); } ASMJIT_END_SUB_NAMESPACE diff --git a/src/asmjit/ujit/unicondition.h b/src/asmjit/ujit/unicondition.h new file mode 100644 index 0000000..3e6ee81 --- /dev/null +++ b/src/asmjit/ujit/unicondition.h @@ -0,0 +1,293 @@ +// This file is part of AsmJit project +// +// See or LICENSE.md for license and copyright information +// SPDX-License-Identifier: Zlib + +#ifndef ASMJIT_UJIT_UNICONDITION_H_INCLUDED +#define 
ASMJIT_UJIT_UNICONDITION_H_INCLUDED + +#include "ujitbase.h" +#include "uniop.h" + +#if !defined(ASMJIT_NO_UJIT) + +ASMJIT_BEGIN_SUB_NAMESPACE(ujit) + +//! \addtogroup asmjit_ujit +//! \{ + +//! Condition represents either a condition or an assignment operation that can be checked. +class UniCondition { +public: + //! \name Members + //! \{ + + UniOpCond op; + CondCode cond; + Operand a; + Operand b; + + //! \} + + //! \name Construction & Destruction + //! \{ + + ASMJIT_INLINE_NODEBUG UniCondition(UniOpCond op, CondCode cond, const Operand& a, const Operand& b) noexcept + : op(op), + cond(cond), + a(a), + b(b) {} + + ASMJIT_INLINE_NODEBUG UniCondition(const UniCondition& other) noexcept = default; + + //! \} + + //! \name Overloaded Operators + //! \{ + + ASMJIT_INLINE_NODEBUG UniCondition& operator=(const UniCondition& other) noexcept = default; + + //! \} +}; + +//! Constructs a condition that would be `true` when `a = (a & b)` becomes zero. +static ASMJIT_INLINE UniCondition and_z(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignAnd, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a & b)` becomes zero. +static ASMJIT_INLINE UniCondition and_z(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignAnd, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a & b)` becomes zero. +static ASMJIT_INLINE UniCondition and_z(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignAnd, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a & b)` becomes non-zero. +static ASMJIT_INLINE UniCondition and_nz(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignAnd, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a & b)` becomes non-zero. +static ASMJIT_INLINE UniCondition and_nz(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignAnd, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a & b)` becomes non-zero. +static ASMJIT_INLINE UniCondition and_nz(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignAnd, CondCode::kNotZero, a, b); } + +//! Constructs a condition that would be `true` when `a = (a | b)` becomes zero. +static ASMJIT_INLINE UniCondition or_z(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignOr, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a | b)` becomes zero. +static ASMJIT_INLINE UniCondition or_z(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignOr, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a | b)` becomes zero. +static ASMJIT_INLINE UniCondition or_z(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignOr, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a | b)` becomes non-zero. +static ASMJIT_INLINE UniCondition or_nz(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignOr, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a | b)` becomes non-zero. +static ASMJIT_INLINE UniCondition or_nz(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignOr, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a | b)` becomes non-zero. 
+static ASMJIT_INLINE UniCondition or_nz(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignOr, CondCode::kNotZero, a, b); } + +//! Constructs a condition that would be `true` when `a = (a ^ b)` becomes zero. +static ASMJIT_INLINE UniCondition xor_z(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignXor, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a ^ b)` becomes zero. +static ASMJIT_INLINE UniCondition xor_z(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignXor, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a ^ b)` becomes zero. +static ASMJIT_INLINE UniCondition xor_z(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignXor, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a ^ b)` becomes non-zero. +static ASMJIT_INLINE UniCondition xor_nz(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignXor, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a ^ b)` becomes non-zero. +static ASMJIT_INLINE UniCondition xor_nz(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignXor, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a ^ b)` becomes non-zero. +static ASMJIT_INLINE UniCondition xor_nz(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignXor, CondCode::kNotZero, a, b); } + +//! Constructs a condition that would be `true` when `a = (a + b)` becomes zero. +static ASMJIT_INLINE UniCondition add_z(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` becomes zero. +static ASMJIT_INLINE UniCondition add_z(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` becomes zero. +static ASMJIT_INLINE UniCondition add_z(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` becomes non-zero. +static ASMJIT_INLINE UniCondition add_nz(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` becomes non-zero. +static ASMJIT_INLINE UniCondition add_nz(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` becomes non-zero. +static ASMJIT_INLINE UniCondition add_nz(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotZero, a, b); } + +//! Constructs a condition that would be `true` when `a = (a + b)` wraps (sets carry flag). +static ASMJIT_INLINE UniCondition add_c(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kCarry, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` wraps (sets carry flag). +static ASMJIT_INLINE UniCondition add_c(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kCarry, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` wraps (sets carry flag). 
+static ASMJIT_INLINE UniCondition add_c(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kCarry, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` doesn't wrap (doesn't set carry flag). +static ASMJIT_INLINE UniCondition add_nc(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotCarry, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` doesn't wrap (doesn't set carry flag). +static ASMJIT_INLINE UniCondition add_nc(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotCarry, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` doesn't wrap (doesn't set carry flag). +static ASMJIT_INLINE UniCondition add_nc(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotCarry, a, b); } + +//! Constructs a condition that would be `true` when `a = (a + b)` ends with the msb/sign bit set. +static ASMJIT_INLINE UniCondition add_s(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` ends with the msb/sign bit set. +static ASMJIT_INLINE UniCondition add_s(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` ends with the msb/sign bit set. +static ASMJIT_INLINE UniCondition add_s(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` ends with the msb/sign bit unset. +static ASMJIT_INLINE UniCondition add_ns(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` ends with the msb/sign bit unset. +static ASMJIT_INLINE UniCondition add_ns(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` ends with the msb/sign bit unset. +static ASMJIT_INLINE UniCondition add_ns(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotSign, a, b); } + +//! Constructs a condition that would be `true` when `a = (a - b)` becomes zero. +static ASMJIT_INLINE UniCondition sub_z(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` becomes zero. +static ASMJIT_INLINE UniCondition sub_z(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` becomes zero. +static ASMJIT_INLINE UniCondition sub_z(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` becomes non-zero. +static ASMJIT_INLINE UniCondition sub_nz(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` becomes non-zero. 
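+//!
+//! A minimal sketch of what the constructor records (the `counter` register name is only for
+//! illustration; how the condition is later materialized into instructions is up to the compiler):
+//!
+//! \code
+//! UniCondition c = sub_nz(counter, Imm(1));   // decrement-and-test pattern
+//! // c.op   == UniOpCond::kAssignSub
+//! // c.cond == CondCode::kNotZero
+//! // c.a    == counter, c.b == Imm(1)
+//! \endcode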
+static ASMJIT_INLINE UniCondition sub_nz(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` becomes non-zero. +static ASMJIT_INLINE UniCondition sub_nz(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kNotZero, a, b); } + +//! Constructs a condition that would be `true` when `a = (a - b)` wraps. +static ASMJIT_INLINE UniCondition sub_c(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedLT, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` wraps. +static ASMJIT_INLINE UniCondition sub_c(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedLT, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` wraps. +static ASMJIT_INLINE UniCondition sub_c(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedLT, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` doesn't wrap. +static ASMJIT_INLINE UniCondition sub_nc(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedGE, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` doesn't wrap. +static ASMJIT_INLINE UniCondition sub_nc(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedGE, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` doesn't wrap. +static ASMJIT_INLINE UniCondition sub_nc(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedGE, a, b); } + +//! Constructs a condition that would be `true` when `a = (a - b)` ends with the msb/sign bit set. +static ASMJIT_INLINE UniCondition sub_s(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` ends with the msb/sign bit set. +static ASMJIT_INLINE UniCondition sub_s(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` ends with the msb/sign bit set. +static ASMJIT_INLINE UniCondition sub_s(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` ends with the msb/sign bit unset. +static ASMJIT_INLINE UniCondition sub_ns(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kNotSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` ends with the msb/sign bit unset. +static ASMJIT_INLINE UniCondition sub_ns(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kNotSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` ends with the msb/sign bit unset. 
+static ASMJIT_INLINE UniCondition sub_ns(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kNotSign, a, b); }
+
+//! Constructs a condition that would be `true` when `a = (a - b)` is computed and the original `a` was greater than `b` (unsigned comparison).
+static ASMJIT_INLINE UniCondition sub_ugt(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedGT, a, b); }
+//! Constructs a condition that would be `true` when `a = (a - b)` is computed and the original `a` was greater than `b` (unsigned comparison).
+static ASMJIT_INLINE UniCondition sub_ugt(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedGT, a, b); }
+//! Constructs a condition that would be `true` when `a = (a - b)` is computed and the original `a` was greater than `b` (unsigned comparison).
+static ASMJIT_INLINE UniCondition sub_ugt(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedGT, a, b); }
+
+//! Constructs a condition that would be `true` when `a = (a >> b)` becomes zero.
+static ASMJIT_INLINE UniCondition shr_z(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignShr, CondCode::kZero, a, b); }
+//! Constructs a condition that would be `true` when `a = (a >> b)` becomes zero.
+static ASMJIT_INLINE UniCondition shr_z(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignShr, CondCode::kZero, a, b); }
+//! Constructs a condition that would be `true` when `a = (a >> b)` becomes zero.
+static ASMJIT_INLINE UniCondition shr_z(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignShr, CondCode::kZero, a, b); }
+//! Constructs a condition that would be `true` when `a = (a >> b)` becomes non-zero.
+static ASMJIT_INLINE UniCondition shr_nz(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignShr, CondCode::kNotZero, a, b); }
+//! Constructs a condition that would be `true` when `a = (a >> b)` becomes non-zero.
+static ASMJIT_INLINE UniCondition shr_nz(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignShr, CondCode::kNotZero, a, b); }
+//! Constructs a condition that would be `true` when `a = (a >> b)` becomes non-zero.
+static ASMJIT_INLINE UniCondition shr_nz(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignShr, CondCode::kNotZero, a, b); }
+
+//! Constructs a condition that would be `true` when `a == b`.
+static ASMJIT_INLINE UniCondition cmp_eq(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kEqual, a, b); }
+//! Constructs a condition that would be `true` when `a == b`.
+static ASMJIT_INLINE UniCondition cmp_eq(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kEqual, a, b); }
+//! Constructs a condition that would be `true` when `a == b`.
+static ASMJIT_INLINE UniCondition cmp_eq(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kEqual, a, b); }
+
+//! Constructs a condition that would be `true` when `a != b`.
+static ASMJIT_INLINE UniCondition cmp_ne(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kNotEqual, a, b); }
+//! Constructs a condition that would be `true` when `a != b`.
+static ASMJIT_INLINE UniCondition cmp_ne(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kNotEqual, a, b); }
+//! Constructs a condition that would be `true` when `a != b`.
+static ASMJIT_INLINE UniCondition cmp_ne(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kNotEqual, a, b); }
+
+//! Constructs a condition that would be `true` when `a < b` (signed comparison).
+static ASMJIT_INLINE UniCondition scmp_lt(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedLT, a, b); } +//! Constructs a condition that would be `true` when `a < b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_lt(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedLT, a, b); } +//! Constructs a condition that would be `true` when `a < b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_lt(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedLT, a, b); } + +//! Constructs a condition that would be `true` when `a <= b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_le(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedLE, a, b); } +//! Constructs a condition that would be `true` when `a <= b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_le(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedLE, a, b); } +//! Constructs a condition that would be `true` when `a <= b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_le(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedLE, a, b); } + +//! Constructs a condition that would be `true` when `a > b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_gt(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedGT, a, b); } +//! Constructs a condition that would be `true` when `a > b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_gt(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedGT, a, b); } +//! Constructs a condition that would be `true` when `a > b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_gt(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedGT, a, b); } + +//! Constructs a condition that would be `true` when `a >= b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_ge(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedGE, a, b); } +//! Constructs a condition that would be `true` when `a >= b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_ge(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedGE, a, b); } +//! Constructs a condition that would be `true` when `a >= b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_ge(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedGE, a, b); } + +//! Constructs a condition that would be `true` when `a < b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_lt(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedLT, a, b); } +//! Constructs a condition that would be `true` when `a < b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_lt(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedLT, a, b); } +//! Constructs a condition that would be `true` when `a < b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_lt(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedLT, a, b); } + +//! Constructs a condition that would be `true` when `a <= b` (unsigned comparison). 
+static ASMJIT_INLINE UniCondition ucmp_le(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedLE, a, b); } +//! Constructs a condition that would be `true` when `a <= b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_le(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedLE, a, b); } +//! Constructs a condition that would be `true` when `a <= b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_le(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedLE, a, b); } + +//! Constructs a condition that would be `true` when `a > b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_gt(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedGT, a, b); } +//! Constructs a condition that would be `true` when `a > b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_gt(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedGT, a, b); } +//! Constructs a condition that would be `true` when `a > b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_gt(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedGT, a, b); } + +//! Constructs a condition that would be `true` when `a >= b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_ge(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedGE, a, b); } +//! Constructs a condition that would be `true` when `a >= b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_ge(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedGE, a, b); } +//! Constructs a condition that would be `true` when `a >= b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_ge(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedGE, a, b); } + +//! Constructs a condition that would be `true` when `a` is zero. +static ASMJIT_INLINE UniCondition test_z(const Gp& a) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kEqual, a, Imm(0)); } +//! Constructs a condition that would be `true` when `a` is non-zero. +static ASMJIT_INLINE UniCondition test_nz(const Gp& a) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kNotEqual, a, Imm(0)); } + +//! Constructs a condition that would be `true` when `a & b` is zero. +static ASMJIT_INLINE UniCondition test_z(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kTest, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a & b` is zero. +static ASMJIT_INLINE UniCondition test_z(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kTest, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a & b` is zero. +static ASMJIT_INLINE UniCondition test_z(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kTest, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a & b` is non-zero. +static ASMJIT_INLINE UniCondition test_nz(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kTest, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a & b` is non-zero. 
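+//!
+//! As a sketch (the `flags` register and the `kDirtyBit` constant are purely illustrative), the
+//! test flavor does not modify `a`; it only records a flag test for the compiler to emit later:
+//!
+//! \code
+//! UniCondition c = test_nz(flags, Imm(kDirtyBit));
+//! // c.op   == UniOpCond::kTest
+//! // c.cond == CondCode::kNotZero
+//! \endcode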
+static ASMJIT_INLINE UniCondition test_nz(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kTest, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a & b` is non-zero. +static ASMJIT_INLINE UniCondition test_nz(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kTest, CondCode::kNotZero, a, b); } + +//! Constructs a condition that would be `true` when a bit in `a` at `b` is zero (`((a >> b) & 1) == 0`). +static ASMJIT_INLINE UniCondition bt_z(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kBitTest, CondCode::kBTZero, a, b); } +//! Constructs a condition that would be `true` when a bit in `a` at `b` is zero (`((a >> b) & 1) == 0`). +static ASMJIT_INLINE UniCondition bt_z(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kBitTest, CondCode::kBTZero, a, b); } +//! Constructs a condition that would be `true` when a bit in `a` at `b` is zero (`((a >> b) & 1) == 0`). +static ASMJIT_INLINE UniCondition bt_z(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kBitTest, CondCode::kBTZero, a, b); } +//! Constructs a condition that would be `true` when a bit in `a` at `b` is non-zero (`((a >> b) & 1) == 1`). +static ASMJIT_INLINE UniCondition bt_nz(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kBitTest, CondCode::kBTNotZero, a, b); } +//! Constructs a condition that would be `true` when a bit in `a` at `b` is non-zero (`((a >> b) & 1) == 1`). +static ASMJIT_INLINE UniCondition bt_nz(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kBitTest, CondCode::kBTNotZero, a, b); } +//! Constructs a condition that would be `true` when a bit in `a` at `b` is non-zero (`((a >> b) & 1) == 1`). +static ASMJIT_INLINE UniCondition bt_nz(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kBitTest, CondCode::kBTNotZero, a, b); } + +//! \} + +ASMJIT_END_SUB_NAMESPACE + +#endif // !ASMJIT_NO_UJIT +#endif // ASMJIT_UJIT_UNICONDITION_H_INCLUDED diff --git a/src/asmjit/ujit/uniop.h b/src/asmjit/ujit/uniop.h index d7a3697..8e18de9 100644 --- a/src/asmjit/ujit/uniop.h +++ b/src/asmjit/ujit/uniop.h @@ -15,241 +15,264 @@ ASMJIT_BEGIN_SUB_NAMESPACE(ujit) //! \addtogroup asmjit_ujit //! \{ +//! Instruction that can be used by \ref UniCondition. enum class UniOpCond : uint32_t { - kAssignAnd, - kAssignOr, - kAssignXor, - kAssignAdd, - kAssignSub, - kAssignShr, - kTest, - kBitTest, - kCompare, + kAssignAnd, //!< Assign-and `a &= b`. + kAssignOr, //!< Assign-or `a |= b`. + kAssignXor, //!< Assign-xor `a ^= b`. + kAssignAdd, //!< Assign-add `a += b`. + kAssignSub, //!< Assign-sub `a -= b`. + kAssignShr, //!< Assign-shr `a >>= b`. + kTest, //!< Test `a & b`. + kBitTest, //!< Bit-test `a & (1 << b)`. + kCompare, //!< Compare `a <=> b`. kMaxValue = kCompare }; +//! Instruction with a single memory operand. enum class UniOpM : uint32_t { - kStoreZeroReg, - kStoreZeroU8, - kStoreZeroU16, - kStoreZeroU32, - kStoreZeroU64 + kStoreZeroReg, //!< Store zero (data-width depends on register size). + kStoreZeroU8, //!< Store zero (8-bit). + kStoreZeroU16, //!< Store zero (16-bit). + kStoreZeroU32, //!< Store zero (32-bit). + kStoreZeroU64 //!< Store zero (64-bit). }; +//! Instruction with `[reg, mem]` operands. 
enum class UniOpRM : uint32_t {
- kLoadReg,
- kLoadI8,
- kLoadU8,
- kLoadI16,
- kLoadU16,
- kLoadI32,
- kLoadU32,
- kLoadI64,
- kLoadU64,
- kLoadMergeU8,
- kLoadShiftU8,
- kLoadMergeU16,
- kLoadShiftU16
+ kLoadReg, //!< N-bit load (the size depends on the register size).
+ kLoadI8, //!< 8-bit load, sign extended.
+ kLoadU8, //!< 8-bit load, zero extended.
+ kLoadI16, //!< 16-bit load, sign extended.
+ kLoadU16, //!< 16-bit load, zero extended.
+ kLoadI32, //!< 32-bit load, sign extended.
+ kLoadU32, //!< 32-bit load, zero extended.
+ kLoadI64, //!< 64-bit load.
+ kLoadU64, //!< 64-bit load.
+ kLoadMergeU8, //!< 8-bit load and merge.
+ kLoadShiftU8, //!< 8-bit load, shift, and merge.
+ kLoadMergeU16, //!< 16-bit load and merge.
+ kLoadShiftU16 //!< 16-bit load, shift, and merge.
};
+//! Instruction with `[mem, reg]` operands.
enum class UniOpMR : uint32_t {
- kStoreReg,
- kStoreU8,
- kStoreU16,
- kStoreU32,
- kStoreU64,
- kAddReg,
- kAddU8,
- kAddU16,
- kAddU32,
- kAddU64
+ kStoreReg, //!< N-bit store (the size depends on the register size).
+ kStoreU8, //!< 8-bit store.
+ kStoreU16, //!< 16-bit store.
+ kStoreU32, //!< 32-bit store.
+ kStoreU64, //!< 64-bit store.
+ kAddReg, //!< N-bit load+add+store (the size depends on the register size).
+ kAddU8, //!< 8-bit load+add+store.
+ kAddU16, //!< 16-bit load+add+store.
+ kAddU32, //!< 32-bit load+add+store.
+ kAddU64 //!< 64-bit load+add+store.
};
-//! Arithmetic operation having 2 operands (dst, src).
+//! Instruction with `[reg, reg]` operands.
+//!
+//! Arithmetic operations having 2 operands (dst, src).
+//!
+//! \note For convenience, the second operand can be register, memory, or immediate value.
enum class UniOpRR : uint32_t {
- kAbs,
- kNeg,
- kNot,
- kBSwap,
- kCLZ,
- kCTZ,
- kReflect,
+ kAbs, //!< Absolute value of a signed integer - `dst = abs(src)`.
+ kNeg, //!< Arithmetic negation - `dst = -src` (`dst = ~src + 1`).
+ kNot, //!< Bitwise-not - `dst = ~src`.
+ kBSwap, //!< Byteswap - `dst = bswap(src)`.
+ kCLZ, //!< Count leading zeros - `dst = clz(src)`.
+ kCTZ, //!< Count trailing zeros - `dst = ctz(src)`.
+ kReflect, //!< Integer reflection.
+
+ kMaxValue = kReflect
};
+//! Instruction with `[reg, reg, reg]` operands.
+//!
+//! Arithmetic operation having 3 operands (dst, src1, src2).
+//!
+//! \note For convenience, the third operand can be register, memory, or immediate value.
enum class UniOpRRR : uint32_t {
- kAnd,
- kOr,
- kXor,
- kBic,
- kAdd,
- kSub,
- kMul,
- kUDiv,
- kUMod,
- kSMin,
- kSMax,
- kUMin,
- kUMax,
- kSll,
- kSrl,
- kSra,
- kRol,
- kRor,
- kSBound,
+ kAnd, //!< Bitwise AND `dst = src1 & src2`.
+ kOr, //!< Bitwise OR `dst = src1 | src2`.
+ kXor, //!< Bitwise XOR `dst = src1 ^ src2`.
+ kBic, //!< Bitwise BIC `dst = src1 & ~src2`.
+ kAdd, //!< Add `dst = src1 + src2`.
+ kSub, //!< Subtract `dst = src1 - src2`.
+ kMul, //!< Multiply `dst = src1 * src2`.
+ kUDiv, //!< Unsigned divide `dst = src1 / src2`.
+ kUMod, //!< Unsigned modulo `dst = src1 % src2`.
+ kSMin, //!< Signed minimum `dst = smin(src1, src2)`.
+ kSMax, //!< Signed maximum `dst = smax(src1, src2)`.
+ kUMin, //!< Unsigned minimum `dst = umin(src1, src2)`.
+ kUMax, //!< Unsigned maximum `dst = umax(src1, src2)`.
+ kSll, //!< Shift left logical `dst = src1 << src2`.
+ kSrl, //!< Shift right logical `dst = src1 >> src2`.
+ kSra, //!< Shift right arithmetic `dst = sra(src1, src2)`.
+ kRol, //!< Rotate left `dst = (src1 << src2) | (src1 >> (N_BITS - src2))`.
+ kRor, //!< Rotate right `dst = (src1 >> src2) | (src1 << (N_BITS - src2))`.
+ kSBound, //!< Signed bounds.
kMaxValue = kSBound
};
+//! Instruction with `[vec, reg]` operands.
+//!
+//! Describes instructions where a general-purpose register is either moved, converted, or
+//! inserted into a vector register.
enum class UniOpVR : uint32_t {
- kMov,
- kMovU32,
- kMovU64,
- kInsertU8,
- kInsertU16,
- kInsertU32,
- kInsertU64,
- kExtractU8,
- kExtractU16,
- kExtractU32,
- kExtractU64,
- kCvtIntToF32,
- kCvtIntToF64,
- kCvtTruncF32ToInt,
- kCvtRoundF32ToInt,
- kCvtTruncF64ToInt,
- kCvtRoundF64ToInt,
+ kMov, //!< N-bit move into a vector register (the size depends on source register width).
+ kMovU32, //!< 32-bit move into a vector register.
+ kMovU64, //!< 64-bit move into a vector register.
+ kInsertU8, //!< 8-bit insertion into a vector register.
+ kInsertU16, //!< 16-bit insertion into a vector register.
+ kInsertU32, //!< 32-bit insertion into a vector register.
+ kInsertU64, //!< 64-bit insertion into a vector register.
+ kExtractU8, //!< 8-bit extraction from a vector register.
+ kExtractU16, //!< 16-bit extraction from a vector register.
+ kExtractU32, //!< 32-bit extraction from a vector register.
+ kExtractU64, //!< 64-bit extraction from a vector register.
+ kCvtIntToF32, //!< Int to float32 conversion.
+ kCvtIntToF64, //!< Int to float64 conversion.
+ kCvtTruncF32ToInt, //!< Float32 to int conversion with truncation semantics.
+ kCvtRoundF32ToInt, //!< Float32 to int conversion with round-to-even semantics.
+ kCvtTruncF64ToInt, //!< Float64 to int conversion with truncation semantics.
+ kCvtRoundF64ToInt, //!< Float64 to int conversion with round-to-even semantics.
kMaxValue = kCvtRoundF64ToInt
};
+//! Instruction with `[vec, mem]` operands.
+//!
+//! Describes load, convert, and insert instructions.
enum class UniOpVM : uint32_t {
- kLoad8,
- kLoad16_U16,
- kLoad32_U32,
- kLoad32_F32,
+ kLoad8, //!< 8-bit load into a vector register (the rest is cleared).
+ kLoad16_U16, //!< 16-bit load into a vector register (the rest is cleared).
+ kLoad32_U32, //!< 32-bit load (int) into a vector register (the rest is cleared).
+ kLoad32_F32, //!< 32-bit load (f32) into a vector register (the rest is cleared).
- kLoad64_U32,
- kLoad64_U64,
- kLoad64_F32,
- kLoad64_F64,
+ kLoad64_U32, //!< 64-bit load (int) into a vector register (the rest is cleared).
+ kLoad64_U64, //!< 64-bit load (int) into a vector register (the rest is cleared).
+ kLoad64_F32, //!< 64-bit load (f32) into a vector register (the rest is cleared).
+ kLoad64_F64, //!< 64-bit load (f64) into a vector register (the rest is cleared).
- kLoad128_U32,
- kLoad128_U64,
- kLoad128_F32,
- kLoad128_F64,
+ kLoad128_U32, //!< 128-bit load (int) into a vector register (the rest is cleared).
+ kLoad128_U64, //!< 128-bit load (int) into a vector register (the rest is cleared).
+ kLoad128_F32, //!< 128-bit load (f32) into a vector register (the rest is cleared).
+ kLoad128_F64, //!< 128-bit load (f64) into a vector register (the rest is cleared).
- kLoad256_U32,
- kLoad256_U64,
- kLoad256_F32,
- kLoad256_F64,
+ kLoad256_U32, //!< 256-bit load (int) into a vector register (the rest is cleared).
+ kLoad256_U64, //!< 256-bit load (int) into a vector register (the rest is cleared).
+ kLoad256_F32, //!< 256-bit load (f32) into a vector register (the rest is cleared).
+ kLoad256_F64, //!< 256-bit load (f64) into a vector register (the rest is cleared).
- kLoad512_U32,
- kLoad512_U64,
- kLoad512_F32,
- kLoad512_F64,
+ kLoad512_U32, //!< 512-bit load (int) into a vector register (the rest is cleared).
+ kLoad512_U64, //!< 512-bit load (int) into a vector register (the rest is cleared). + kLoad512_F32, //!< 512-bit load (f32) into a vector register (the rest is cleared). + kLoad512_F64, //!< 512-bit load (f64) into a vector register (the rest is cleared). - kLoadN_U32, - kLoadN_U64, - kLoadN_F32, - kLoadN_F64, + kLoadN_U32, //!< N-bit load (int) into a vector register (the size depends on the vector width). + kLoadN_U64, //!< N-bit load (int) into a vector register (the size depends on the vector width). + kLoadN_F32, //!< N-bit load (f32) into a vector register (the size depends on the vector width). + kLoadN_F64, //!< N-bit load (f64) into a vector register (the size depends on the vector width). - kLoadCvt16_U8ToU64, - kLoadCvt32_U8ToU64, - kLoadCvt64_U8ToU64, + kLoadCvt16_U8ToU64, //!< 16-bit load into a vector register with 8-bit to 64-bit zero extension (128-bit result). + kLoadCvt32_U8ToU64, //!< 32-bit load into a vector register with 8-bit to 64-bit zero extension (256-bit result). + kLoadCvt64_U8ToU64, //!< 64-bit load into a vector register with 8-bit to 64-bit zero extension (512-bit result). - kLoadCvt32_I8ToI16, - kLoadCvt32_U8ToU16, - kLoadCvt32_I8ToI32, - kLoadCvt32_U8ToU32, - kLoadCvt32_I16ToI32, - kLoadCvt32_U16ToU32, - kLoadCvt32_I32ToI64, - kLoadCvt32_U32ToU64, + kLoadCvt32_I8ToI16, //!< 32-bit load into a vector register with 8-bit to 16-bit sign extension (64-bit result). + kLoadCvt32_U8ToU16, //!< 32-bit load into a vector register with 8-bit to 16-bit zero extension (64-bit result). + kLoadCvt32_I8ToI32, //!< 32-bit load into a vector register with 8-bit to 32-bit sign extension (128-bit result). + kLoadCvt32_U8ToU32, //!< 32-bit load into a vector register with 8-bit to 32-bit zero extension (128-bit result). + kLoadCvt32_I16ToI32, //!< 32-bit load into a vector register with 16-bit to 32-bit sign extension (64-bit result). + kLoadCvt32_U16ToU32, //!< 32-bit load into a vector register with 16-bit to 32-bit zero extension (64-bit result). + kLoadCvt32_I32ToI64, //!< 32-bit load into a vector register with 32-bit to 64-bit sign extension (64-bit result). + kLoadCvt32_U32ToU64, //!< 32-bit load into a vector register with 32-bit to 64-bit zero extension (64-bit result). - kLoadCvt64_I8ToI16, - kLoadCvt64_U8ToU16, - kLoadCvt64_I8ToI32, - kLoadCvt64_U8ToU32, - kLoadCvt64_I16ToI32, - kLoadCvt64_U16ToU32, - kLoadCvt64_I32ToI64, - kLoadCvt64_U32ToU64, + kLoadCvt64_I8ToI16, //!< 64-bit load into a vector register with 8-bit to 16-bit sign extension (128-bit result). + kLoadCvt64_U8ToU16, //!< 64-bit load into a vector register with 8-bit to 16-bit zero extension (128-bit result). + kLoadCvt64_I8ToI32, //!< 64-bit load into a vector register with 8-bit to 32-bit sign extension (256-bit result). + kLoadCvt64_U8ToU32, //!< 64-bit load into a vector register with 8-bit to 32-bit zero extension (256-bit result). + kLoadCvt64_I16ToI32, //!< 64-bit load into a vector register with 16-bit to 32-bit sign extension (128-bit result). + kLoadCvt64_U16ToU32, //!< 64-bit load into a vector register with 16-bit to 32-bit zero extension (128-bit result). + kLoadCvt64_I32ToI64, //!< 64-bit load into a vector register with 32-bit to 64-bit sign extension (128-bit result). + kLoadCvt64_U32ToU64, //!< 64-bit load into a vector register with 32-bit to 64-bit zero extension (128-bit result). 
- kLoadCvt128_I8ToI16, - kLoadCvt128_U8ToU16, - kLoadCvt128_I8ToI32, - kLoadCvt128_U8ToU32, - kLoadCvt128_I16ToI32, - kLoadCvt128_U16ToU32, - kLoadCvt128_I32ToI64, - kLoadCvt128_U32ToU64, + kLoadCvt128_I8ToI16, //!< 128-bit load into a vector register with 8-bit to 16-bit sign extension (256-bit result). + kLoadCvt128_U8ToU16, //!< 128-bit load into a vector register with 8-bit to 16-bit zero extension (256-bit result). + kLoadCvt128_I8ToI32, //!< 128-bit load into a vector register with 8-bit to 32-bit sign extension (512-bit result). + kLoadCvt128_U8ToU32, //!< 128-bit load into a vector register with 8-bit to 32-bit zero extension (512-bit result). + kLoadCvt128_I16ToI32, //!< 128-bit load into a vector register with 16-bit to 32-bit sign extension (256-bit result). + kLoadCvt128_U16ToU32, //!< 128-bit load into a vector register with 16-bit to 32-bit zero extension (256-bit result). + kLoadCvt128_I32ToI64, //!< 128-bit load into a vector register with 32-bit to 64-bit sign extension (256-bit result). + kLoadCvt128_U32ToU64, //!< 128-bit load into a vector register with 32-bit to 64-bit zero extension (256-bit result). - kLoadCvt256_I8ToI16, - kLoadCvt256_U8ToU16, - kLoadCvt256_I16ToI32, - kLoadCvt256_U16ToU32, - kLoadCvt256_I32ToI64, - kLoadCvt256_U32ToU64, + kLoadCvt256_I8ToI16, //!< 256-bit load into a vector register with 8-bit to 16-bit sign extension (512-bit result). + kLoadCvt256_U8ToU16, //!< 256-bit load into a vector register with 8-bit to 16-bit zero extension (512-bit result). + kLoadCvt256_I16ToI32, //!< 256-bit load into a vector register with 16-bit to 32-bit sign extension (512-bit result). + kLoadCvt256_U16ToU32, //!< 256-bit load into a vector register with 16-bit to 32-bit zero extension (512-bit result). + kLoadCvt256_I32ToI64, //!< 256-bit load into a vector register with 32-bit to 64-bit sign extension (512-bit result). + kLoadCvt256_U32ToU64, //!< 256-bit load into a vector register with 32-bit to 64-bit zero extension (512-bit result). - kLoadCvtN_U8ToU64, + kLoadCvtN_U8ToU64, //!< N-bit load with 8-bit to 64-bit zero extension (the size depends on the vector width). - kLoadCvtN_I8ToI16, - kLoadCvtN_U8ToU16, - kLoadCvtN_I8ToI32, - kLoadCvtN_U8ToU32, - kLoadCvtN_I16ToI32, - kLoadCvtN_U16ToU32, - kLoadCvtN_I32ToI64, - kLoadCvtN_U32ToU64, + kLoadCvtN_I8ToI16, //!< N-bit load with 8-bit to 16-bit sign extension (the size depends on the vector width). + kLoadCvtN_U8ToU16, //!< N-bit load with 8-bit to 16-bit zero extension (the size depends on the vector width). + kLoadCvtN_I8ToI32, //!< N-bit load with 8-bit to 32-bit sign extension (the size depends on the vector width). + kLoadCvtN_U8ToU32, //!< N-bit load with 8-bit to 32-bit zero extension (the size depends on the vector width). + kLoadCvtN_I16ToI32, //!< N-bit load with 16-bit to 32-bit sign extension (the size depends on the vector width). + kLoadCvtN_U16ToU32, //!< N-bit load with 16-bit to 32-bit zero extension (the size depends on the vector width). + kLoadCvtN_I32ToI64, //!< N-bit load with 32-bit to 64-bit sign extension (the size depends on the vector width). + kLoadCvtN_U32ToU64, //!< N-bit load with 32-bit to 64-bit zero extension (the size depends on the vector width). - kLoadInsertU8, - kLoadInsertU16, - kLoadInsertU32, - kLoadInsertU64, - kLoadInsertF32, - kLoadInsertF32x2, - kLoadInsertF64, + kLoadInsertU8, //!< 8-bit insert (int) into a vector register from memory. + kLoadInsertU16, //!< 16-bit insert (int) into a vector register from memory. 
+ kLoadInsertU32, //!< 32-bit insert (int) into a vector register from memory.
+ kLoadInsertU64, //!< 64-bit insert (int) into a vector register from memory.
+ kLoadInsertF32, //!< 32-bit insert (f32) into a vector register from memory.
+ kLoadInsertF32x2, //!< 64-bit insert (f32x2) into a vector register from memory.
+ kLoadInsertF64, //!< 64-bit insert (f64) into a vector register from memory.
kMaxValue = kLoadInsertF64
};
+//! Instruction with `[mem, vec]` operands.
+//!
+//! Describes store and extract instructions.
enum class UniOpMV : uint32_t {
- kStore8,
- kStore16_U16,
- kStore32_U32,
- kStore32_F32,
+ kStore8, //!< 8-bit store (int) of a vector register.
+ kStore16_U16, //!< 16-bit store (int) of a vector register.
+ kStore32_U32, //!< 32-bit store (int) of a vector register.
+ kStore32_F32, //!< 32-bit store (f32) of a vector register.
- kStore64_U32,
- kStore64_U64,
- kStore64_F32,
- kStore64_F64,
+ kStore64_U32, //!< 64-bit store (int) of a vector register.
+ kStore64_U64, //!< 64-bit store (int) of a vector register.
+ kStore64_F32, //!< 64-bit store (f32) of a vector register.
+ kStore64_F64, //!< 64-bit store (f64) of a vector register.
- kStore128_U32,
- kStore128_U64,
- kStore128_F32,
- kStore128_F64,
+ kStore128_U32, //!< 128-bit store (int) of a vector register.
+ kStore128_U64, //!< 128-bit store (int) of a vector register.
+ kStore128_F32, //!< 128-bit store (f32) of a vector register.
+ kStore128_F64, //!< 128-bit store (f64) of a vector register.
- kStore256_U32,
- kStore256_U64,
- kStore256_F32,
- kStore256_F64,
+ kStore256_U32, //!< 256-bit store (int) of a vector register.
+ kStore256_U64, //!< 256-bit store (int) of a vector register.
+ kStore256_F32, //!< 256-bit store (f32) of a vector register.
+ kStore256_F64, //!< 256-bit store (f64) of a vector register.
- kStore512_U32,
- kStore512_U64,
- kStore512_F32,
- kStore512_F64,
+ kStore512_U32, //!< 512-bit store (int) of a vector register.
+ kStore512_U64, //!< 512-bit store (int) of a vector register.
+ kStore512_F32, //!< 512-bit store (f32) of a vector register.
+ kStore512_F64, //!< 512-bit store (f64) of a vector register.
- kStoreN_U32,
- kStoreN_U64,
- kStoreN_F32,
- kStoreN_F64,
+ kStoreN_U32, //!< N-bit store (int) of a vector register (the size depends on the vector width).
+ kStoreN_U64, //!< N-bit store (int) of a vector register (the size depends on the vector width).
+ kStoreN_F32, //!< N-bit store (f32) of a vector register (the size depends on the vector width).
+ kStoreN_F64, //!< N-bit store (f64) of a vector register (the size depends on the vector width).
- kStoreExtractU16,
- kStoreExtractU32,
- kStoreExtractU64,
+ kStoreExtractU16, //!< 16-bit extract from lane and store.
+ kStoreExtractU32, //!< 32-bit extract from lane and store.
+ kStoreExtractU64, //!< 64-bit extract from lane and store.
/*
kStoreCvtz64_U16ToU8,
@@ -300,86 +323,105 @@ enum class UniOpMV : uint32_t {
kMaxValue = kStoreExtractU64
};
+//! Instruction with `[vec, vec]` operands.
+//!
+//! Describes vector arithmetic that has one destination and one source.
+//!
+//! \note For convenience, the second operand can be register, memory, or immediate value.
enum class UniOpVV : uint32_t {
- kMov,
- kMovU64,
+ kMov, //!< Vector move.
+ kMovU64, //!< Vector move of the low 64-bit data, the rest is set to zero.
- kBroadcastU8Z, - kBroadcastU16Z, - kBroadcastU8, - kBroadcastU16, - kBroadcastU32, - kBroadcastU64, - kBroadcastF32, - kBroadcastF64, - kBroadcastV128_U32, - kBroadcastV128_U64, - kBroadcastV128_F32, - kBroadcastV128_F64, - kBroadcastV256_U32, - kBroadcastV256_U64, - kBroadcastV256_F32, - kBroadcastV256_F64, + kBroadcastU8Z, //!< Vector u8 broadcast with an assumption that the rest of the source vector is zero. + kBroadcastU16Z, //!< Vector u16 broadcast with an assumption that the rest of the source vector is zero. + kBroadcastU8, //!< Vector u8 broadcast to all lanes. + kBroadcastU16, //!< Vector u16 broadcast to all lanes. + kBroadcastU32, //!< Vector u32 broadcast to all lanes. + kBroadcastU64, //!< Vector u64 broadcast to all lanes. + kBroadcastF32, //!< Vector f32 broadcast to all lanes. + kBroadcastF64, //!< Vector f64 broadcast to all lanes. + kBroadcastV128_U32, //!< Vector broadcast of 128-bit lanes. + kBroadcastV128_U64, //!< Vector broadcast of 128-bit lanes. + kBroadcastV128_F32, //!< Vector broadcast of 128-bit lanes. + kBroadcastV128_F64, //!< Vector broadcast of 128-bit lanes. + kBroadcastV256_U32, //!< Vector broadcast of 256-bit lanes. + kBroadcastV256_U64, //!< Vector broadcast of 256-bit lanes. + kBroadcastV256_F32, //!< Vector broadcast of 256-bit lanes. + kBroadcastV256_F64, //!< Vector broadcast of 256-bit lanes. - kAbsI8, - kAbsI16, - kAbsI32, - kAbsI64, + kAbsI8, //!< Vector i8 absolute value - `dst = abs(src)`. + kAbsI16, //!< Vector i16 absolute value - `dst = abs(src)`. + kAbsI32, //!< Vector i32 absolute value - `dst = abs(src)`. + kAbsI64, //!< Vector i64 absolute value - `dst = abs(src)`. - kNotU32, - kNotU64, + kNotU32, //!< Vector u32 bitwise NOT - `dst = ~src`. + kNotU64, //!< Vector u64 bitwise NOT - `dst = ~src`. - kCvtI8LoToI16, - kCvtI8HiToI16, - kCvtU8LoToU16, - kCvtU8HiToU16, - kCvtI8ToI32, - kCvtU8ToU32, - kCvtI16LoToI32, - kCvtI16HiToI32, - kCvtU16LoToU32, - kCvtU16HiToU32, - kCvtI32LoToI64, - kCvtI32HiToI64, - kCvtU32LoToU64, - kCvtU32HiToU64, + kCvtI8LoToI16, //!< Vector sign extend low i8 to i16. + kCvtI8HiToI16, //!< Vector sign extend high i8 to i16. + kCvtU8LoToU16, //!< Vector zero extend low u8 to u16. + kCvtU8HiToU16, //!< Vector zero extend high u8 to u16. + kCvtI8ToI32, //!< Vector zero extend low i8 to i32. + kCvtU8ToU32, //!< Vector zero extend high u8 to u32. + kCvtI16LoToI32, //!< Vector sign extend low i16 to i32. + kCvtI16HiToI32, //!< Vector sign extend high i16 to i32. + kCvtU16LoToU32, //!< Vector zero extend low u16 to u32. + kCvtU16HiToU32, //!< Vector zero extend high u16 to u32. + kCvtI32LoToI64, //!< Vector sign extend low i32 to i64. + kCvtI32HiToI64, //!< Vector sign extend high i32 to i64. + kCvtU32LoToU64, //!< Vector zero extend low u32 to u64. + kCvtU32HiToU64, //!< Vector zero extend high u32 to u64. - kAbsF32, - kAbsF64, + kAbsF32S, //!< Scalar f32 absolute value. + kAbsF64S, //!< Scalar f64 absolute value. + kAbsF32, //!< Vector f32 absolute value. + kAbsF64, //!< Vector f64 absolute value. - kNegF32, - kNegF64, + kNegF32S, //!< Scalar f32 negate. + kNegF64S, //!< Scalar f64 negate. + kNegF32, //!< Vector f32 negate. + kNegF64, //!< Vector f64 negate. - kNotF32, - kNotF64, + kNotF32, //!< Vector f32 bitwise NOT. + kNotF64, //!< Vector f64 bitwise NOT. - kTruncF32S, - kTruncF64S, - kTruncF32, - kTruncF64, + kTruncF32S, //!< Scalar f32 truncate. + kTruncF64S, //!< Scalar f64 truncate. + kTruncF32, //!< Vector f32 truncate. + kTruncF64, //!< Vector f64 truncate. 
- kFloorF32S, - kFloorF64S, - kFloorF32, - kFloorF64, + kFloorF32S, //!< Scalar f32 floor. + kFloorF64S, //!< Scalar f64 floor. + kFloorF32, //!< Vector f32 floor. + kFloorF64, //!< Vector f64 floor. - kCeilF32S, - kCeilF64S, - kCeilF32, - kCeilF64, + kCeilF32S, //!< Scalar f32 ceil. + kCeilF64S, //!< Scalar f64 ceil. + kCeilF32, //!< Vector f32 ceil. + kCeilF64, //!< Vector f64 ceil. - kRoundF32S, - kRoundF64S, - kRoundF32, - kRoundF64, + kRoundEvenF32S, //!< Scalar f32 round-even. + kRoundEvenF64S, //!< Scalar f64 round-even. + kRoundEvenF32, //!< Vector f32 round-even. + kRoundEvenF64, //!< Vector f64 round-even. - kRcpF32, - kRcpF64, + kRoundHalfAwayF32S, //!< Scalar f32 round-half-away (0.5 and greater fraction rounds away from zero). + kRoundHalfAwayF64S, //!< Scalar f64 round-half-away (0.5 and greater fraction rounds away from zero). + kRoundHalfAwayF32, //!< Vector f32 round-half-away (0.5 and greater fraction rounds away from zero). + kRoundHalfAwayF64, //!< Vector f64 round-half-away (0.5 and greater fraction rounds away from zero). - kSqrtF32S, - kSqrtF64S, - kSqrtF32, - kSqrtF64, + kRoundHalfUpF32S, //!< Scalar f32 round-half-up (0.5 and greater fraction rounds up). + kRoundHalfUpF64S, //!< Scalar f64 round-half-up (0.5 and greater fraction rounds up). + kRoundHalfUpF32, //!< Vector f32 round-half-up (0.5 and greater fraction rounds up). + kRoundHalfUpF64, //!< Vector f64 round-half-up (0.5 and greater fraction rounds up). + + kRcpF32, //!< Vector f32 reciprocal - `dst = 1.0 / src`. + kRcpF64, //!< Vector f64 reciprocal - `dst = 1.0 / src`. + + kSqrtF32S, //!< Scalar f32 square root. + kSqrtF64S, //!< Scalar f64 square root. + kSqrtF32, //!< Vector f32 square root. + kSqrtF64, //!< Vector f64 square root. kCvtF32ToF64S, kCvtF64ToF32S, @@ -400,35 +442,40 @@ enum class UniOpVV : uint32_t { kMaxValue = kCvtRoundF64ToI32Hi }; +//! Instruction with `[vec, vec, imm]` operands. +//! +//! Describes vector arithmetic that has one destination, one source, and one immediate. +//! +//! \note For convenience, the second operand can be register, memory, or immediate value. enum class UniOpVVI : uint32_t { - kSllU16, - kSllU32, - kSllU64, - kSrlU16, - kSrlU32, - kSrlU64, - kSraI16, - kSraI32, - kSraI64, - kSllbU128, - kSrlbU128, - kSwizzleU16x4, - kSwizzleLoU16x4, - kSwizzleHiU16x4, - kSwizzleU32x4, - kSwizzleU64x2, - kSwizzleF32x4, - kSwizzleF64x2, - kSwizzleU64x4, - kSwizzleF64x4, - kExtractV128_I32, - kExtractV128_I64, - kExtractV128_F32, - kExtractV128_F64, - kExtractV256_I32, - kExtractV256_I64, - kExtractV256_F32, - kExtractV256_F64, + kSllU16, //!< Vector u16 shift left logical. + kSllU32, //!< Vector u32 shift left logical. + kSllU64, //!< Vector u64 shift left logical. + kSrlU16, //!< Vector u16 shift right logical. + kSrlU32, //!< Vector u32 shift right logical. + kSrlU64, //!< Vector u64 shift right logical. + kSraI16, //!< Vector u16 shift right arithmetic. + kSraI32, //!< Vector u32 shift right arithmetic. + kSraI64, //!< Vector u64 shift right arithmetic. + kSllbU128, //!< Vector shift bytes (128-bit lanes). + kSrlbU128, //!< Vector shift bytes (128-bit lanes). + kSwizzleU16x4, //!< Vector swizzle u16x4 (128-bit lanes). + kSwizzleLoU16x4, //!< Vector swizzle u16x4 (low 64-bit lanes). + kSwizzleHiU16x4, //!< Vector swizzle u16x4 (high 64-bit lanes) + kSwizzleU32x4, //!< Vector swizzle u32x4 (128-bit lanes). + kSwizzleU64x2, //!< Vector swizzle u64x2 (128-bit lanes). + kSwizzleF32x4, //!< Vector swizzle f32x4 (128-bit lanes). 
+ kSwizzleF64x2, //!< Vector swizzle f64x2 (128-bit lanes). + kSwizzleU64x4, //!< Vector swizzle u64x4 (256-bit lanes). + kSwizzleF64x4, //!< Vector swizzle f64x4 (256-bit lanes). + kExtractV128_I32, //!< Vector extract 128-bit lane from 256-bit or 512-bit vector. + kExtractV128_I64, //!< Vector extract 128-bit lane from 256-bit or 512-bit vector. + kExtractV128_F32, //!< Vector extract 128-bit lane from 256-bit or 512-bit vector. + kExtractV128_F64, //!< Vector extract 128-bit lane from 256-bit or 512-bit vector. + kExtractV256_I32, //!< Vector extract 256-bit lane from 512-bit vector. + kExtractV256_I64, //!< Vector extract 256-bit lane from 512-bit vector. + kExtractV256_F32, //!< Vector extract 256-bit lane from 512-bit vector. + kExtractV256_F64, //!< Vector extract 256-bit lane from 512-bit vector. #if defined(ASMJIT_UJIT_AARCH64) kSrlRndU16, @@ -466,188 +513,197 @@ enum class UniOpVVI : uint32_t { #endif // ASMJIT_UJIT_AARCH64 }; +//! Instruction with `[vec, vec, vec]` operands. +//! +//! Describes vector arithmetic that has one destination and two sources. +//! +//! \note For convenience, the third operand can be register, memory, or immediate value. enum class UniOpVVV : uint32_t { - kAndU32, - kAndU64, - kOrU32, - kOrU64, - kXorU32, - kXorU64, - kAndnU32, - kAndnU64, - kBicU32, - kBicU64, - kAvgrU8, - kAvgrU16, - kAddU8, - kAddU16, - kAddU32, - kAddU64, - kSubU8, - kSubU16, - kSubU32, - kSubU64, - kAddsI8, - kAddsU8, - kAddsI16, - kAddsU16, - kSubsI8, - kSubsU8, - kSubsI16, - kSubsU16, - kMulU16, - kMulU32, - kMulU64, - kMulhI16, - kMulhU16, - kMulU64_LoU32, - kMHAddI16_I32, - kMinI8, - kMinU8, - kMinI16, - kMinU16, - kMinI32, - kMinU32, - kMinI64, - kMinU64, - kMaxI8, - kMaxU8, - kMaxI16, - kMaxU16, - kMaxI32, - kMaxU32, - kMaxI64, - kMaxU64, - kCmpEqU8, - kCmpEqU16, - kCmpEqU32, - kCmpEqU64, - kCmpGtI8, - kCmpGtU8, - kCmpGtI16, - kCmpGtU16, - kCmpGtI32, - kCmpGtU32, - kCmpGtI64, - kCmpGtU64, - kCmpGeI8, - kCmpGeU8, - kCmpGeI16, - kCmpGeU16, - kCmpGeI32, - kCmpGeU32, - kCmpGeI64, - kCmpGeU64, - kCmpLtI8, - kCmpLtU8, - kCmpLtI16, - kCmpLtU16, - kCmpLtI32, - kCmpLtU32, - kCmpLtI64, - kCmpLtU64, - kCmpLeI8, - kCmpLeU8, - kCmpLeI16, - kCmpLeU16, - kCmpLeI32, - kCmpLeU32, - kCmpLeI64, - kCmpLeU64, + kAndU32, //!< Vector u32 bitwise AND - `dst = src1 & src2`. + kAndU64, //!< Vector u64 bitwise AND - `dst = src1 & src2`. + kOrU32, //!< Vector u32 bitwise OR - `dst = src1 | src2`. + kOrU64, //!< Vector u64 bitwise OR - `dst = src1 | src2`. + kXorU32, //!< Vector u32 bitwise XOR - `dst = src1 ^ src2`. + kXorU64, //!< Vector u64 bitwise XOR - `dst = src1 ^ src2`. + kAndnU32, //!< Vector u32 bitwise ANDN - `dst = ~src1 & src2`. + kAndnU64, //!< Vector u64 bitwise ANDN - `dst = ~src1 & src2`. + kBicU32, //!< Vector u32 bitwise BIC - `dst = src1 & ~src2`. + kBicU64, //!< Vector u64 bitwise BIC - `dst = src1 & ~src2`. + kAvgrU8, //!< Vector u8 average rounded half up `dst = (src1 + src2 + 1) >> 1`. + kAvgrU16, //!< Vector u16 average rounded half up `dst = (src1 + src2 + 1) >> 1`. + kAddU8, //!< Vector u8 add. + kAddU16, //!< Vector u16 add. + kAddU32, //!< Vector u32 add. + kAddU64, //!< Vector u64 add. + kSubU8, //!< Vector u8 sub. + kSubU16, //!< Vector u16 sub. + kSubU32, //!< Vector u32 sub. + kSubU64, //!< Vector u64 sub. + kAddsI8, //!< Vector i8 add with saturation (signed). + kAddsU8, //!< Vector u8 add with saturation (unsigned). + kAddsI16, //!< Vector i16 add with saturation (signed). + kAddsU16, //!< Vector u16 add with saturation (unsigned). 
+ kSubsI8, //!< Vector i8 sub with saturation (signed). + kSubsU8, //!< Vector u8 sub with saturation (unsigned). + kSubsI16, //!< Vector i16 sub with saturation (signed). + kSubsU16, //!< Vector u16 sub with saturation (unsigned). + kMulU16, //!< Vector u16 multiply. + kMulU32, //!< Vector u32 multiply. + kMulU64, //!< Vector u64 multiply. + kMulhI16, //!< Vector i16 multiply high - `dst = (src1 * src2) >> 16`. + kMulhU16, //!< Vector u16 multiply high - `dst = (src1 * src2) >> 16`. + kMulU64_LoU32, //!< Vector u64xu32 multiply. + kMHAddI16_I32, //!< Vector i16 multiply with horizontal widening add to form a 32-bit result. + kMinI8, //!< Vector i8 minimum. + kMinU8, //!< Vector u8 minimum. + kMinI16, //!< Vector i16 minimum. + kMinU16, //!< Vector u16 minimum. + kMinI32, //!< Vector i32 minimum. + kMinU32, //!< Vector u32 minimum. + kMinI64, //!< Vector i64 minimum. + kMinU64, //!< Vector u64 minimum. + kMaxI8, //!< Vector i8 maximum. + kMaxU8, //!< Vector u8 maximum. + kMaxI16, //!< Vector i16 maximum. + kMaxU16, //!< Vector u16 maximum. + kMaxI32, //!< Vector i32 maximum. + kMaxU32, //!< Vector u32 maximum. + kMaxI64, //!< Vector i64 maximum. + kMaxU64, //!< Vector u64 maximum. + kCmpEqU8, //!< Vector u8 compare equal. + kCmpEqU16, //!< Vector u16 compare equal. + kCmpEqU32, //!< Vector u32 compare equal. + kCmpEqU64, //!< Vector u64 compare equal. + kCmpGtI8, //!< Vector i8 compare greater-than. + kCmpGtU8, //!< Vector u8 compare greater-than. + kCmpGtI16, //!< Vector i16 compare greater-than. + kCmpGtU16, //!< Vector u16 compare greater-than. + kCmpGtI32, //!< Vector i32 compare greater-than. + kCmpGtU32, //!< Vector u32 compare greater-than. + kCmpGtI64, //!< Vector i64 compare greater-than. + kCmpGtU64, //!< Vector u64 compare greater-than. + kCmpGeI8, //!< Vector i8 compare greater-or-equal. + kCmpGeU8, //!< Vector u8 compare greater-or-equal. + kCmpGeI16, //!< Vector i16 compare greater-or-equal. + kCmpGeU16, //!< Vector u16 compare greater-or-equal. + kCmpGeI32, //!< Vector i32 compare greater-or-equal. + kCmpGeU32, //!< Vector u32 compare greater-or-equal. + kCmpGeI64, //!< Vector i64 compare greater-or-equal. + kCmpGeU64, //!< Vector u64 compare greater-or-equal. + kCmpLtI8, //!< Vector i8 compare lesser-than. + kCmpLtU8, //!< Vector u8 compare lesser-than. + kCmpLtI16, //!< Vector i16 compare lesser-than. + kCmpLtU16, //!< Vector u16 compare lesser-than. + kCmpLtI32, //!< Vector i32 compare lesser-than. + kCmpLtU32, //!< Vector u32 compare lesser-than. + kCmpLtI64, //!< Vector i64 compare lesser-than. + kCmpLtU64, //!< Vector u64 compare lesser-than. + kCmpLeI8, //!< Vector i8 compare lesser-or-equal. + kCmpLeU8, //!< Vector u8 compare lesser-or-equal. + kCmpLeI16, //!< Vector i16 compare lesser-or-equal. + kCmpLeU16, //!< Vector u16 compare lesser-or-equal. + kCmpLeI32, //!< Vector i32 compare lesser-or-equal. + kCmpLeU32, //!< Vector u32 compare lesser-or-equal. + kCmpLeI64, //!< Vector i64 compare lesser-or-equal. + kCmpLeU64, //!< Vector u64 compare lesser-or-equal. 
- kAndF32, - kAndF64, - kOrF32, - kOrF64, - kXorF32, - kXorF64, - kAndnF32, - kAndnF64, - kBicF32, - kBicF64, - kAddF32S, - kAddF64S, - kAddF32, - kAddF64, - kSubF32S, - kSubF64S, - kSubF32, - kSubF64, - kMulF32S, - kMulF64S, - kMulF32, - kMulF64, - kDivF32S, - kDivF64S, - kDivF32, - kDivF64, - kMinF32S, - kMinF64S, - kMinF32, - kMinF64, - kMaxF32S, - kMaxF64S, - kMaxF32, - kMaxF64, - kCmpEqF32S, - kCmpEqF64S, - kCmpEqF32, - kCmpEqF64, - kCmpNeF32S, - kCmpNeF64S, - kCmpNeF32, - kCmpNeF64, - kCmpGtF32S, - kCmpGtF64S, - kCmpGtF32, - kCmpGtF64, - kCmpGeF32S, - kCmpGeF64S, - kCmpGeF32, - kCmpGeF64, - kCmpLtF32S, - kCmpLtF64S, - kCmpLtF32, - kCmpLtF64, - kCmpLeF32S, - kCmpLeF64S, - kCmpLeF32, - kCmpLeF64, - kCmpOrdF32S, - kCmpOrdF64S, - kCmpOrdF32, - kCmpOrdF64, - kCmpUnordF32S, - kCmpUnordF64S, - kCmpUnordF32, - kCmpUnordF64, + kAndF32, //!< Vector f32 bitwise AND - `dst = src1 & src2`. + kAndF64, //!< Vector f64 bitwise AND - `dst = src1 & src2`. + kOrF32, //!< Vector f32 bitwise OR - `dst = src1 | src2`. + kOrF64, //!< Vector f64 bitwise OR - `dst = src1 | src2`. + kXorF32, //!< Vector f32 bitwise XOR - `dst = src1 ^ src2`. + kXorF64, //!< Vector f64 bitwise XOR - `dst = src1 ^ src2`. + kAndnF32, //!< Vector f32 bitwise ANDN - `dst = ~src1 & src2`. + kAndnF64, //!< Vector f64 bitwise ANDN - `dst = ~src1 & src2`. + kBicF32, //!< Vector f32 bitwise BIC - `dst = src1 & ~src2`. + kBicF64, //!< Vector f64 bitwise BIC - `dst = src1 & ~src2`. + kAddF32S, //!< Scalar f32 add. + kAddF64S, //!< Scalar f64 add. + kAddF32, //!< Vector f32 add. + kAddF64, //!< Vector f64 add. + kSubF32S, //!< Scalar f32 sub. + kSubF64S, //!< Scalar f64 sub. + kSubF32, //!< Vector f32 sub. + kSubF64, //!< Vector f64 sub. + kMulF32S, //!< Scalar f32 mul. + kMulF64S, //!< Scalar f64 mul. + kMulF32, //!< Vector f32 mul. + kMulF64, //!< Vector f64 mul. + kDivF32S, //!< Scalar f32 div. + kDivF64S, //!< Scalar f64 div. + kDivF32, //!< Vector f32 div. + kDivF64, //!< Vector f64 div. + kModF32S, //!< Scalar f32 modulo. + kModF64S, //!< Scalar f64 modulo. + kModF32, //!< Vector f32 modulo. + kModF64, //!< Vector f64 modulo. + kMinF32S, //!< Scalar f32 minimum. + kMinF64S, //!< Scalar f64 minimum. + kMinF32, //!< Vector f32 minimum. + kMinF64, //!< Vector f64 minimum. + kMaxF32S, //!< Scalar f32 maximum. + kMaxF64S, //!< Scalar f64 maximum. + kMaxF32, //!< Vector f32 maximum. + kMaxF64, //!< Vector f64 maximum. + kCmpEqF32S, //!< Scalar f32 compare equal (ordered). + kCmpEqF64S, //!< Scalar f64 compare equal (ordered). + kCmpEqF32, //!< Vector f32 compare equal (ordered). + kCmpEqF64, //!< Vector f64 compare equal (ordered). 
+ kCmpNeF32S, //!< Scalar f32 compare not-equal (ordered).
+ kCmpNeF64S, //!< Scalar f64 compare not-equal (ordered).
+ kCmpNeF32, //!< Vector f32 compare not-equal (ordered).
+ kCmpNeF64, //!< Vector f64 compare not-equal (ordered).
+ kCmpGtF32S, //!< Scalar f32 compare greater-than (ordered).
+ kCmpGtF64S, //!< Scalar f64 compare greater-than (ordered).
+ kCmpGtF32, //!< Vector f32 compare greater-than (ordered).
+ kCmpGtF64, //!< Vector f64 compare greater-than (ordered).
+ kCmpGeF32S, //!< Scalar f32 compare greater-or-equal (ordered).
+ kCmpGeF64S, //!< Scalar f64 compare greater-or-equal (ordered).
+ kCmpGeF32, //!< Vector f32 compare greater-or-equal (ordered).
+ kCmpGeF64, //!< Vector f64 compare greater-or-equal (ordered).
+ kCmpLtF32S, //!< Scalar f32 compare lesser-than (ordered).
+ kCmpLtF64S, //!< Scalar f64 compare lesser-than (ordered).
+ kCmpLtF32, //!< Vector f32 compare lesser-than (ordered).
+ kCmpLtF64, //!< Vector f64 compare lesser-than (ordered).
+ kCmpLeF32S, //!< Scalar f32 compare lesser-or-equal (ordered).
+ kCmpLeF64S, //!< Scalar f64 compare lesser-or-equal (ordered).
+ kCmpLeF32, //!< Vector f32 compare lesser-or-equal (ordered).
+ kCmpLeF64, //!< Vector f64 compare lesser-or-equal (ordered).
+ kCmpOrdF32S, //!< Scalar f32 compare ordered.
+ kCmpOrdF64S, //!< Scalar f64 compare ordered.
+ kCmpOrdF32, //!< Vector f32 compare ordered.
+ kCmpOrdF64, //!< Vector f64 compare ordered.
+ kCmpUnordF32S, //!< Scalar f32 compare unordered.
+ kCmpUnordF64S, //!< Scalar f64 compare unordered.
+ kCmpUnordF32, //!< Vector f32 compare unordered.
+ kCmpUnordF64, //!< Vector f64 compare unordered.

- kHAddF64,
+ kHAddF64, //!< Vector f64 horizontal-add.

- kCombineLoHiU64,
- kCombineLoHiF64,
- kCombineHiLoU64,
- kCombineHiLoF64,
+ kCombineLoHiU64, //!< Combine low and high u64 lanes.
+ kCombineLoHiF64, //!< Combine low and high f64 lanes.
+ kCombineHiLoU64, //!< Combine high and low u64 lanes.
+ kCombineHiLoF64, //!< Combine high and low f64 lanes.

- kInterleaveLoU8,
- kInterleaveHiU8,
- kInterleaveLoU16,
- kInterleaveHiU16,
- kInterleaveLoU32,
- kInterleaveHiU32,
- kInterleaveLoU64,
- kInterleaveHiU64,
- kInterleaveLoF32,
- kInterleaveHiF32,
- kInterleaveLoF64,
- kInterleaveHiF64,
+ kInterleaveLoU8, //!< Interleave low u8 lanes.
+ kInterleaveHiU8, //!< Interleave high u8 lanes.
+ kInterleaveLoU16, //!< Interleave low u16 lanes.
+ kInterleaveHiU16, //!< Interleave high u16 lanes.
+ kInterleaveLoU32, //!< Interleave low u32 lanes.
+ kInterleaveHiU32, //!< Interleave high u32 lanes.
+ kInterleaveLoU64, //!< Interleave low u64 lanes.
+ kInterleaveHiU64, //!< Interleave high u64 lanes.
+ kInterleaveLoF32, //!< Interleave low f32 lanes.
+ kInterleaveHiF32, //!< Interleave high f32 lanes.
+ kInterleaveLoF64, //!< Interleave low f64 lanes.
+ kInterleaveHiF64, //!< Interleave high f64 lanes.

- kPacksI16_I8,
- kPacksI16_U8,
- kPacksI32_I16,
- kPacksI32_U16,
+ kPacksI16_I8, //!< Pack i16 to i8 with saturation.
+ kPacksI16_U8, //!< Pack i16 to u8 with saturation.
+ kPacksI32_I16, //!< Pack i32 to i16 with saturation.
+ kPacksI32_U16, //!< Pack i32 to u16 with saturation.

- kSwizzlev_U8,
+ kSwizzlev_U8, //!< Swizzle 16xu8 elements in each 128-bit lane.

 #if defined(ASMJIT_UJIT_AARCH64)
@@ -681,10 +737,10 @@ enum class UniOpVVV : uint32_t {

 #elif defined(ASMJIT_UJIT_X86)

- kPermuteU8,
- kPermuteU16,
- kPermuteU32,
- kPermuteU64,
+ kPermuteU8, //!< Permute u8 elements across the vector.
+ kPermuteU16, //!< Permute u16 elements across the vector.
+ kPermuteU32, //!< Permute u32 elements across the vector.
+ kPermuteU64, //!< Permute u64 elements across the vector. kMaxValue = kPermuteU64 @@ -695,86 +751,65 @@ enum class UniOpVVV : uint32_t { #endif // ASMJIT_UJIT_AARCH64 }; +//! Instruction with `[vec, vec, vec, imm]` operands. +//! +//! Describes vector arithmetic that has one destination, two sources, and immediate. +//! +//! \note For convenience, the third operand can be register, memory, or immediate value. enum class UniOpVVVI : uint32_t { - kAlignr_U128, - kInterleaveShuffleU32x4, - kInterleaveShuffleU64x2, - kInterleaveShuffleF32x4, - kInterleaveShuffleF64x2, - kInsertV128_U32, - kInsertV128_F32, - kInsertV128_U64, - kInsertV128_F64, - kInsertV256_U32, - kInsertV256_F32, - kInsertV256_U64, - kInsertV256_F64, + kAlignr_U128, //!< Align-right 8-bit elements in 128-bit. + kInterleaveShuffleU32x4, //!< Interleaved u32x4 shuffle. + kInterleaveShuffleU64x2, //!< Interleaved u64x2 shuffle. + kInterleaveShuffleF32x4, //!< Interleaved f32x4 shuffle. + kInterleaveShuffleF64x2, //!< Interleaved f64x2 shuffle. + kInsertV128_U32, //!< Insert a 128-bit lane (u32) into 256-bit or 512-bit vector. + kInsertV128_F32, //!< Insert a 128-bit lane (f32) into 256-bit or 512-bit vector. + kInsertV128_U64, //!< Insert a 128-bit lane (u64) into 256-bit or 512-bit vector. + kInsertV128_F64, //!< Insert a 128-bit lane (f64) into 256-bit or 512-bit vector. + kInsertV256_U32, //!< Insert a 256-bit lane (u32) into 512-bit vector. + kInsertV256_F32, //!< Insert a 256-bit lane (f32) into 512-bit vector. + kInsertV256_U64, //!< Insert a 256-bit lane (u64) into 512-bit vector. + kInsertV256_F64, //!< Insert a 256-bit lane (f64) into 512-bit vector. kMaxValue = kInsertV256_F64 }; +//! Instruction with `[vec, vec, vec, vec]` operands. +//! +//! Describes vector arithmetic that has one destination and three sources. +//! +//! \note For convenience, the fourth operand can be register, memory, or immediate value. +//! +//! \remarks For FMA functionality, check also \ref FMAddOpBehavior. enum class UniOpVVVV : uint32_t { kBlendV_U8, - kMAddU16, - kMAddU32, + kMAddU16, //!< Vector u16 multiply-add. + kMAddU32, //!< Vector u32 multiply-add. - kMAddF32S, - kMAddF64S, - kMAddF32, - kMAddF64, + kMAddF32S, //!< Scalar f32 multiply-add (FMA if available, or separate MUL+ADD if not). + kMAddF64S, //!< Scalar f64 multiply-add (FMA if available, or separate MUL+ADD if not). + kMAddF32, //!< Vector f32 multiply-add (FMA if available, or separate MUL+ADD if not). + kMAddF64, //!< Vector f64 multiply-add (FMA if available, or separate MUL+ADD if not). - kMSubF32S, - kMSubF64S, - kMSubF32, - kMSubF64, + kMSubF32S, //!< Scalar f32 multiply-sub (FMA if available, or separate MUL+ADD if not). + kMSubF64S, //!< Scalar f64 multiply-sub (FMA if available, or separate MUL+ADD if not). + kMSubF32, //!< Vector f32 multiply-sub (FMA if available, or separate MUL+ADD if not). + kMSubF64, //!< Vector f64 multiply-sub (FMA if available, or separate MUL+ADD if not). - kNMAddF32S, - kNMAddF64S, - kNMAddF32, - kNMAddF64, + kNMAddF32S, //!< Scalar f32 negated-multiply-add (FMA if available, or separate MUL+ADD if not) + kNMAddF64S, //!< Scalar f64 negated-multiply-add (FMA if available, or separate MUL+ADD if not) + kNMAddF32, //!< Vector f32 negated-multiply-add (FMA if available, or separate MUL+ADD if not). + kNMAddF64, //!< Vector f64 negated-multiply-add (FMA if available, or separate MUL+ADD if not). 
- kNMSubF32S, - kNMSubF64S, - kNMSubF32, - kNMSubF64, + kNMSubF32S, //!< Scalar f32 negated-multiply-sub (FMA if available, or separate MUL+ADD if not). + kNMSubF64S, //!< Scalar f64 negated-multiply-sub (FMA if available, or separate MUL+ADD if not). + kNMSubF32, //!< Vector f32 negated-multiply-sub (FMA if available, or separate MUL+ADD if not). + kNMSubF64, //!< Vector f64 negated-multiply-sub (FMA if available, or separate MUL+ADD if not). kMaxValue = kNMSubF64 }; -//! Pipeline optimization flags used by \ref UniCompiler. -enum class UniOptFlags : uint32_t { - //! No flags. - kNone = 0x0u, - - //! CPU has instructions that can perform 8-bit masked loads and stores. - kMaskOps8Bit = 0x00000001u, - - //! CPU has instructions that can perform 16-bit masked loads and stores. - kMaskOps16Bit = 0x00000002u, - - //! CPU has instructions that can perform 32-bit masked loads and stores. - kMaskOps32Bit = 0x00000004u, - - //! CPU has instructions that can perform 64-bit masked loads and stores. - kMaskOps64Bit = 0x00000008u, - - //! CPU provides low-latency 32-bit multiplication (AMD CPUs). - kFastVpmulld = 0x00000010u, - - //! CPU provides low-latency 64-bit multiplication (AMD CPUs). - kFastVpmullq = 0x00000020u, - - //! CPU performs hardware gathers faster than a sequence of loads and packing. - kFastGather = 0x00000040u, - - //! CPU has fast stores with mask. - //! - //! \note This is a hint to the compiler to emit a masked store instead of a sequence having branches. - kFastStoreWithMask = 0x00000080u -}; -ASMJIT_DEFINE_ENUM_FLAGS(UniOptFlags) - //! \} ASMJIT_END_SUB_NAMESPACE diff --git a/src/asmjit/ujit/vecconsttable.h b/src/asmjit/ujit/vecconsttable.h index 7444aa6..b0779ff 100644 --- a/src/asmjit/ujit/vecconsttable.h +++ b/src/asmjit/ujit/vecconsttable.h @@ -6,7 +6,7 @@ #ifndef ASMJIT_UJIT_VECCONSTTABLE_H_INCLUDED #define ASMJIT_UJIT_VECCONSTTABLE_H_INCLUDED -#include "ujitbase.h" +#include "../core/globals.h" #if !defined(ASMJIT_NO_UJIT) @@ -18,6 +18,8 @@ ASMJIT_BEGIN_SUB_NAMESPACE(ujit) template struct VecConst; +//! \cond + //! A 64-bit vector constant of type `T` aligned to 64 bits. template struct ASMJIT_MAY_ALIAS ASMJIT_ALIGNAS(8) VecConst { @@ -30,6 +32,12 @@ struct ASMJIT_MAY_ALIAS ASMJIT_ALIGNAS(8) VecConst { static_assert(kElementCount > 0u, "Vector constant must have at least one element"); ElementType data[kElementCount]; + + template + ASMJIT_INLINE_NODEBUG const DstT& as() const noexcept { + static_assert(sizeof(DstT) <= sizeof(*this), "Size of the destination type DstT must be <= 8"); + return *static_cast(static_cast(this)); + } }; //! A 128-bit vector constant of type `T` aligned to 128 bits. @@ -44,6 +52,12 @@ struct ASMJIT_MAY_ALIAS ASMJIT_ALIGNAS(16) VecConst { static_assert(kElementCount > 0u, "Vector constant must have at least one element"); ElementType data[kElementCount]; + + template + ASMJIT_INLINE_NODEBUG const DstT& as() const noexcept { + static_assert(sizeof(DstT) <= sizeof(*this), "Size of the destination type DstT must be <= 16"); + return *static_cast(static_cast(this)); + } }; //! A 256-bit vector constant of type `T` aligned to 256 bits. @@ -58,6 +72,12 @@ struct ASMJIT_MAY_ALIAS ASMJIT_ALIGNAS(32) VecConst { static_assert(kElementCount > 0u, "Vector constant must have at least one element"); ElementType data[kElementCount]; + + template + ASMJIT_INLINE_NODEBUG const DstT& as() const noexcept { + static_assert(sizeof(DstT) <= sizeof(*this), "Size of the destination type DstT must be <= 32"); + return *static_cast(static_cast(this)); + } }; //! 
A 512-bit vector constant of type `T` aligned to 512 bits. @@ -72,8 +92,16 @@ struct ASMJIT_MAY_ALIAS ASMJIT_ALIGNAS(64) VecConst { static_assert(kElementCount > 0u, "Vector constant must have at least one element"); ElementType data[kElementCount]; + + template + ASMJIT_INLINE_NODEBUG const DstT& as() const noexcept { + static_assert(sizeof(DstT) <= sizeof(*this), "Size of the destination type DstT must be <= 64"); + return *static_cast(static_cast(this)); + } }; +//! \endcond + template using VecConst64 = VecConst; template using VecConst128 = VecConst; template using VecConst256 = VecConst; @@ -397,18 +425,27 @@ struct VecConstTable { VecConstNative p_0000800000008000 = make_const>(uint64_t(0x0000800000008000u)); - VecConst128 sign32_scalar = make_const>(0u, 0u, 0u, uint32_t(0x80000000u)); - VecConst128 sign64_scalar = make_const>(uint64_t(0u), uint64_t(0x8000000000000000u)); + VecConst128 sign32_scalar = make_const>(0u, 0u, 0u, uint32_t(0x80000000u)); + VecConst128 sign64_scalar = make_const>(uint64_t(0u), uint64_t(0x8000000000000000u)); - VecConstNative f32_1 = make_const>(1.0f); - VecConstNative f32_round_max = make_const>(8388608.0f); + VecConstNative f32_0_5_minus_1ulp = make_const>(0x3EFFFFFF3EFFFFFFu); // 0.49999997 (0.5f - 1ulp) + VecConstNative f32_0_5 = make_const>(0.5f); + VecConstNative f32_1 = make_const>(1.0f); + VecConstNative f32_round_magic = make_const>(8388608.0f); - VecConstNative f64_1 = make_const>(1.0); - VecConstNative f64_round_max = make_const>(4503599627370496.0); + VecConstNative f64_0_5_minus_1ulp = make_const>(0x3FDFFFFFFFFFFFFFu); // 0.49999999999999994 (0.5 - 1ulp). + VecConstNative f64_0_5 = make_const>(0.5); + VecConstNative f64_1 = make_const>(1.0); + VecConstNative f64_round_magic = make_const>(4503599627370496.0); }; ASMJIT_VARAPI const VecConstTable vec_const_table; +struct VecConstTableRef { + const VecConstTable& table; + size_t size; +}; + //! \} ASMJIT_END_SUB_NAMESPACE diff --git a/src/asmjit/x86/x86compiler.h b/src/asmjit/x86/x86compiler.h index 219a418..ce4eabe 100644 --- a/src/asmjit/x86/x86compiler.h +++ b/src/asmjit/x86/x86compiler.h @@ -511,99 +511,182 @@ public: //! \name Virtual Registers //! \{ -#ifndef ASMJIT_NO_LOGGING -# define ASMJIT_NEW_REG_FMT(OUT, PARAM, FORMAT, ARGS) \ - _new_reg_fmt(Out{OUT}, PARAM, FORMAT, ARGS) -#else -# define ASMJIT_NEW_REG_FMT(OUT, PARAM, FORMAT, ARGS) \ - Support::maybe_unused(FORMAT); \ - Support::maybe_unused(std::forward(args)...); \ - _new_reg(Out{OUT}, PARAM) -#endif + //! Creates a new general-purpose register with `type_id` type and optional name passed via `args`. + //! + //! \note Using \ref TypeId is too generic. In general it's recommended to use \ref new_gp8(), + //! \ref new_gp16(), \ref new_gp32(), \ref new_gp64(), and \ref new_gpz() or \ref new_gp_ptr(). + template + ASMJIT_INLINE_NODEBUG Gp new_gp(TypeId type_id, Args&&... args) { return new_reg(type_id, std::forward(args)...); } -#define ASMJIT_NEW_REG_CUSTOM(FUNC, REG) \ - ASMJIT_INLINE_NODEBUG REG FUNC(TypeId type_id) { \ - REG reg(Globals::NoInit); \ - _new_reg(Out{reg}, type_id); \ - return reg; \ - } \ - \ - template \ - ASMJIT_INLINE_NODEBUG REG FUNC(TypeId type_id, const char* fmt, Args&&... args) {\ - REG reg(Globals::NoInit); \ - ASMJIT_NEW_REG_FMT(reg, type_id, fmt, std::forward(args)...); \ - return reg; \ - } + //! Creates a new vector register with `type_id` type and optional name passed via `args`. + //! + //! \note Using \ref TypeId is too generic. In general it's recommended to use \ref new_vec128(), + //! 
\ref new_vec256(), \ref new_vec512(), or alternatively \ref new_xmm(), \ref new_ymm(), and \ref new_zmm(). + template + ASMJIT_INLINE_NODEBUG Vec new_vec(TypeId type_id, Args&&... args) { return new_reg(type_id, std::forward(args)...); } -#define ASMJIT_NEW_REG_TYPED(FUNC, REG, TYPE_ID) \ - ASMJIT_INLINE_NODEBUG REG FUNC() { \ - REG reg(Globals::NoInit); \ - _new_reg(Out{reg}, TYPE_ID); \ - return reg; \ - } \ - \ - template \ - ASMJIT_INLINE_NODEBUG REG FUNC(const char* fmt, Args&&... args) { \ - REG reg(Globals::NoInit); \ - ASMJIT_NEW_REG_FMT(reg, TYPE_ID, fmt, std::forward(args)...); \ - return reg; \ - } + //! Creates a new mask register with `type_id` type and optional name passed via `args`. + template + ASMJIT_INLINE_NODEBUG KReg new_k(TypeId type_id, Args&&... args) { return new_reg(type_id, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG RegT new_similar_reg(const RegT& ref) { - RegT reg(Globals::NoInit); - _new_reg(Out(reg), ref); - return reg; - } + //! Creates a new 8-bit general purpose register mapped to low 8 bits of a full register. + //! + //! \note Using 8-bit registers is not recommended, use at least 32-bit registers in portable code. + template + ASMJIT_INLINE_NODEBUG Gp new_gp8(Args&&... args) { return new_reg(TypeId::kUInt8, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG RegT new_similar_reg(const RegT& ref, const char* fmt, Args&&... args) { - RegT reg(Globals::NoInit); - ASMJIT_NEW_REG_FMT(reg, ref, fmt, std::forward(args)...); - return reg; - } + //! Creates a new 16-bit general purpose register mapped to low 16 bits of a full register. + //! + //! \note Using 16-bit registers is not recommended, use at least 32-bit registers in portable code. + template + ASMJIT_INLINE_NODEBUG Gp new_gp16(Args&&... args) { return new_reg(TypeId::kUInt16, std::forward(args)...); } - ASMJIT_NEW_REG_CUSTOM(new_reg , Reg ) - ASMJIT_NEW_REG_CUSTOM(new_gp , Gp ) - ASMJIT_NEW_REG_CUSTOM(new_vec , Vec ) - ASMJIT_NEW_REG_CUSTOM(new_kreg , KReg) + //! Creates a new 32-bit general purpose register mapped to low 32 bits of a full register (on 64-bit targets). + template + ASMJIT_INLINE_NODEBUG Gp new_gp32(Args&&... args) { return new_reg(TypeId::kUInt32, std::forward(args)...); } - ASMJIT_NEW_REG_TYPED(new_gp8 , Gp , TypeId::kUInt8) - ASMJIT_NEW_REG_TYPED(new_gp16 , Gp , TypeId::kUInt16) - ASMJIT_NEW_REG_TYPED(new_gp32 , Gp , TypeId::kUInt32) - ASMJIT_NEW_REG_TYPED(new_gp64 , Gp , TypeId::kUInt64) + //! Creates a new 64-bit general purpose register. + //! + //! \warning The target must be 64-bit in order to create 64-bit registers. + template + ASMJIT_INLINE_NODEBUG Gp new_gp64(Args&&... args) { return new_reg(TypeId::kUInt64, std::forward(args)...); } - ASMJIT_NEW_REG_TYPED(new_gpb , Gp , TypeId::kUInt8) - ASMJIT_NEW_REG_TYPED(new_gpw , Gp , TypeId::kUInt16) - ASMJIT_NEW_REG_TYPED(new_gpd , Gp , TypeId::kUInt32) - ASMJIT_NEW_REG_TYPED(new_gpq , Gp , TypeId::kUInt64) - ASMJIT_NEW_REG_TYPED(new_gpz , Gp , TypeId::kUIntPtr) - ASMJIT_NEW_REG_TYPED(new_gp_ptr, Gp , TypeId::kUIntPtr) + //! Creates a new 32-bit or 64-bit general purpose register depending on the target register width. + template + ASMJIT_INLINE_NODEBUG Gp new_gpz(Args&&... 
args) { return new_reg(TypeId::kUIntPtr, std::forward(args)...); } - ASMJIT_NEW_REG_TYPED(new_xmm , Vec , TypeId::kInt32x4) - ASMJIT_NEW_REG_TYPED(new_xmm_ss, Vec , TypeId::kFloat32x1) - ASMJIT_NEW_REG_TYPED(new_xmm_sd, Vec , TypeId::kFloat64x1) - ASMJIT_NEW_REG_TYPED(new_xmm_ps, Vec , TypeId::kFloat32x4) - ASMJIT_NEW_REG_TYPED(new_xmm_pd, Vec , TypeId::kFloat64x2) + //! Creates a new 32-bit or 64-bit general purpose register depending on the target register width. + //! + //! \note This is just an alternative name that maps more closely to C's `uintptr_t`, it's the same function as + //! \ref new_gpz(). + template + ASMJIT_INLINE_NODEBUG Gp new_gp_ptr(Args&&... args) { return new_reg(TypeId::kUIntPtr, std::forward(args)...); } - ASMJIT_NEW_REG_TYPED(new_ymm , Vec , TypeId::kInt32x8) - ASMJIT_NEW_REG_TYPED(new_ymm_ps, Vec , TypeId::kFloat32x8) - ASMJIT_NEW_REG_TYPED(new_ymm_pd, Vec , TypeId::kFloat64x4) + //! Creates a new 128-bit vector register (XMM). + template + ASMJIT_INLINE_NODEBUG Vec new_vec128(Args&&... args) { return new_reg(TypeId::kInt32x4, std::forward(args)...); } - ASMJIT_NEW_REG_TYPED(new_zmm , Vec , TypeId::kInt32x16) - ASMJIT_NEW_REG_TYPED(new_zmm_ps, Vec , TypeId::kFloat32x16) - ASMJIT_NEW_REG_TYPED(new_zmm_pd, Vec , TypeId::kFloat64x8) + //! Creates a new 128-bit vector register (XMM) that will be used for scalar 32-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec128_f32x1(Args&&... args) { return new_reg(TypeId::kFloat32x1, std::forward(args)...); } - ASMJIT_NEW_REG_TYPED(new_mm , Mm , TypeId::kMmx64) + //! Creates a new 128-bit vector register (XMM) that will be used for scalar 64-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec128_f64x1(Args&&... args) { return new_reg(TypeId::kFloat64x1, std::forward(args)...); } - ASMJIT_NEW_REG_TYPED(new_kb , KReg, TypeId::kMask8) - ASMJIT_NEW_REG_TYPED(new_kw , KReg, TypeId::kMask16) - ASMJIT_NEW_REG_TYPED(new_kd , KReg, TypeId::kMask32) - ASMJIT_NEW_REG_TYPED(new_kq , KReg, TypeId::kMask64) + //! Creates a new 128-bit vector register (XMM) that will be used for packed 32-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec128_f32x4(Args&&... args) { return new_reg(TypeId::kFloat32x4, std::forward(args)...); } -#undef ASMJIT_NEW_REG_TYPED -#undef ASMJIT_NEW_REG_CUSTOM -#undef ASMJIT_NEW_REG_FMT + //! Creates a new 128-bit vector register (XMM) that will be used for packed 64-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec128_f64x2(Args&&... args) { return new_reg(TypeId::kFloat64x2, std::forward(args)...); } + + //! Creates a new 256-bit vector register (YMM). + template + ASMJIT_INLINE_NODEBUG Vec new_vec256(Args&&... args) { return new_reg(TypeId::kInt32x8, std::forward(args)...); } + + //! Creates a new 256-bit vector register (YMM) that will be used for packed 32-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec256_f32x8(Args&&... args) { return new_reg(TypeId::kFloat32x8, std::forward(args)...); } + + //! Creates a new 256-bit vector register (YMM) that will be used for packed 64-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec256_f64x4(Args&&... args) { return new_reg(TypeId::kFloat64x4, std::forward(args)...); } + + //! Creates a new 512-bit vector register (ZMM). + template + ASMJIT_INLINE_NODEBUG Vec new_vec512(Args&&... args) { return new_reg(TypeId::kInt32x16, std::forward(args)...); } + + //! 
Creates a new 512-bit vector register (ZMM) that will be used for packed 32-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec512_f32x16(Args&&... args) { return new_reg(TypeId::kFloat32x16, std::forward(args)...); } + + //! Creates a new 512-bit vector register (ZMM) that will be used for packed 64-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec512_f64x8(Args&&... args) { return new_reg(TypeId::kFloat64x8, std::forward(args)...); } + + //! Alias of \ref new_vec128() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_xmm(Args&&... args) { return new_reg(TypeId::kInt32x4, std::forward(args)...); } + + //! Alias of \ref new_vec128_f32x1() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_xmm_ss(Args&&... args) { return new_reg(TypeId::kFloat32x1, std::forward(args)...); } + + //! Alias of \ref new_vec128_f64x1() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_xmm_sd(Args&&... args) { return new_reg(TypeId::kFloat64x1, std::forward(args)...); } + + //! Alias of \ref new_vec128_f32x4() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_xmm_ps(Args&&... args) { return new_reg(TypeId::kFloat32x4, std::forward(args)...); } + + //! Alias of \ref new_vec128_f64x2() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_xmm_pd(Args&&... args) { return new_reg(TypeId::kFloat64x2, std::forward(args)...); } + + //! Alias of \ref new_vec256() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_ymm(Args&&... args) { return new_reg(TypeId::kInt32x8, std::forward(args)...); } + + //! Alias of \ref new_vec256_f32x8() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_ymm_ps(Args&&... args) { return new_reg(TypeId::kFloat32x8, std::forward(args)...); } + + //! Alias of \ref new_vec256_f64x4() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_ymm_pd(Args&&... args) { return new_reg(TypeId::kFloat64x4, std::forward(args)...); } + + //! Alias of \ref new_vec512() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_zmm(Args&&... args) { return new_reg(TypeId::kInt32x16, std::forward(args)...); } + + //! Alias of \ref new_vec512_f32x16() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_zmm_ps(Args&&... args) { return new_reg(TypeId::kFloat32x16, std::forward(args)...); } + + //! Alias of \ref new_vec512_f64x8() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_zmm_pd(Args&&... args) { return new_reg(TypeId::kFloat64x8, std::forward(args)...); } + + //! Creates a new 64-bit MMX register. + //! + //! \note MMX ISA is generally deprecated by the X86 architecture. + template + ASMJIT_INLINE_NODEBUG Mm new_mm(Args&&... args) { return new_reg(TypeId::kMmx64, std::forward(args)...); } + + //! Creates a new 8-bit mask (K) register. + template + ASMJIT_INLINE_NODEBUG KReg new_k8(Args&&... args) { return new_reg(TypeId::kMask8, std::forward(args)...); } + + //! Creates a new 16-bit mask (K) register. + template + ASMJIT_INLINE_NODEBUG KReg new_k16(Args&&... args) { return new_reg(TypeId::kMask16, std::forward(args)...); } + + //! Creates a new 32-bit mask (K) register. + template + ASMJIT_INLINE_NODEBUG KReg new_k32(Args&&... 
args) { return new_reg(TypeId::kMask32, std::forward(args)...); } + + //! Creates a new 64-bit mask (K) register. + template + ASMJIT_INLINE_NODEBUG KReg new_k64(Args&&... args) { return new_reg(TypeId::kMask64, std::forward(args)...); } + + //! Creates a new 8-bit mask (K) register, alias of \ref new_k8(). + template + ASMJIT_INLINE_NODEBUG KReg new_kb(Args&&... args) { return new_reg(TypeId::kMask8, std::forward(args)...); } + + //! Creates a new 16-bit mask (K) register, alias of \ref new_k16(). + template + ASMJIT_INLINE_NODEBUG KReg new_kw(Args&&... args) { return new_reg(TypeId::kMask16, std::forward(args)...); } + + //! Creates a new 32-bit mask (K) register, alias of \ref new_k32(). + template + ASMJIT_INLINE_NODEBUG KReg new_kd(Args&&... args) { return new_reg(TypeId::kMask32, std::forward(args)...); } + + //! Creates a new 64-bit mask (K) register, alias of \ref new_k64(). + template + ASMJIT_INLINE_NODEBUG KReg new_kq(Args&&... args) { return new_reg(TypeId::kMask64, std::forward(args)...); } //! \} diff --git a/src/asmjit/x86/x86formatter.cpp b/src/asmjit/x86/x86formatter.cpp index 1235a82..39c554c 100644 --- a/src/asmjit/x86/x86formatter.cpp +++ b/src/asmjit/x86/x86formatter.cpp @@ -561,7 +561,7 @@ struct ImmBits { char text[48 - 3]; }; -ASMJIT_FAVOR_SIZE static Error FormatterInternal_formatImmShuf(String& sb, uint32_t imm8, uint32_t bits, uint32_t count) noexcept { +ASMJIT_FAVOR_SIZE static Error FormatterInternal_format_imm_shuf(String& sb, uint32_t imm8, uint32_t bits, uint32_t count) noexcept { uint32_t mask = (1 << bits) - 1; uint32_t last_predicate_shift = bits * (count - 1u); @@ -576,7 +576,7 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_formatImmShuf(String& sb, uint3 return Error::kOk; } -ASMJIT_FAVOR_SIZE static Error FormatterInternal_formatImmBits(String& sb, uint32_t imm8, const ImmBits* bits, uint32_t count) noexcept { +ASMJIT_FAVOR_SIZE static Error FormatterInternal_format_imm_bits(String& sb, uint32_t imm8, const ImmBits* bits, uint32_t count) noexcept { uint32_t n = 0; char buf[64]; @@ -615,7 +615,7 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_formatImmBits(String& sb, uint3 return Error::kOk; } -ASMJIT_FAVOR_SIZE static Error FormatterInternal_formatImmText(String& sb, uint32_t imm8, uint32_t bits, uint32_t advance, const char* text, uint32_t count = 1) noexcept { +ASMJIT_FAVOR_SIZE static Error FormatterInternal_format_imm_text(String& sb, uint32_t imm8, uint32_t bits, uint32_t advance, const char* text, uint32_t count = 1) noexcept { uint32_t mask = (1u << bits) - 1; uint32_t pos = 0; @@ -628,7 +628,7 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_formatImmText(String& sb, uint3 return sb.append(kImmCharEnd); } -ASMJIT_FAVOR_SIZE static Error FormatterInternal_explainConst( +ASMJIT_FAVOR_SIZE static Error FormatterInternal_explain_const( String& sb, FormatFlags format_flags, InstId inst_id, @@ -700,55 +700,55 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_explainConst( }; static const ImmBits vroundxx[] = { - { 0x07u, 0, ImmBits::kModeLookup, "ROUND\0" "FLOOR\0" "CEIL\0" "TRUNC\0" "\0" "\0" "\0" "\0" }, - { 0x08u, 3, ImmBits::kModeLookup, "\0" "INEXACT\0" } + { 0x07u, 0, ImmBits::kModeLookup, "ROUND\0" "FLOOR\0" "CEIL\0" "TRUNC\0" "CURRENT\0" "\0" "\0" "\0" }, + { 0x08u, 3, ImmBits::kModeLookup, "\0" "SUPPRESS\0" } }; uint32_t u8 = imm.value_as(); switch (inst_id) { case Inst::kIdVblendpd: case Inst::kIdBlendpd: - return FormatterInternal_formatImmShuf(sb, u8, 1, vec_size / 8); + return FormatterInternal_format_imm_shuf(sb, u8, 1, 
vec_size / 8); case Inst::kIdVblendps: case Inst::kIdBlendps: - return FormatterInternal_formatImmShuf(sb, u8, 1, vec_size / 4); + return FormatterInternal_format_imm_shuf(sb, u8, 1, vec_size / 4); case Inst::kIdVcmppd: case Inst::kIdVcmpps: case Inst::kIdVcmpsd: case Inst::kIdVcmpss: - return FormatterInternal_formatImmText(sb, u8, 5, 0, vcmpx); + return FormatterInternal_format_imm_text(sb, u8, 5, 0, vcmpx); case Inst::kIdCmppd: case Inst::kIdCmpps: case Inst::kIdCmpsd: case Inst::kIdCmpss: - return FormatterInternal_formatImmText(sb, u8, 3, 0, vcmpx); + return FormatterInternal_format_imm_text(sb, u8, 3, 0, vcmpx); case Inst::kIdVdbpsadbw: - return FormatterInternal_formatImmShuf(sb, u8, 2, 4); + return FormatterInternal_format_imm_shuf(sb, u8, 2, 4); case Inst::kIdVdppd: case Inst::kIdVdpps: case Inst::kIdDppd: case Inst::kIdDpps: - return FormatterInternal_formatImmShuf(sb, u8, 1, 8); + return FormatterInternal_format_imm_shuf(sb, u8, 1, 8); case Inst::kIdVmpsadbw: case Inst::kIdMpsadbw: - return FormatterInternal_formatImmBits(sb, u8, vmpsadbw, Support::min(vec_size / 8, 4)); + return FormatterInternal_format_imm_bits(sb, u8, vmpsadbw, Support::min(vec_size / 8, 4)); case Inst::kIdVpblendw: case Inst::kIdPblendw: - return FormatterInternal_formatImmShuf(sb, u8, 1, 8); + return FormatterInternal_format_imm_shuf(sb, u8, 1, 8); case Inst::kIdVpblendd: - return FormatterInternal_formatImmShuf(sb, u8, 1, Support::min(vec_size / 4, 8)); + return FormatterInternal_format_imm_shuf(sb, u8, 1, Support::min(vec_size / 4, 8)); case Inst::kIdVpclmulqdq: case Inst::kIdPclmulqdq: - return FormatterInternal_formatImmBits(sb, u8, vpclmulqdq, ASMJIT_ARRAY_SIZE(vpclmulqdq)); + return FormatterInternal_format_imm_bits(sb, u8, vpclmulqdq, ASMJIT_ARRAY_SIZE(vpclmulqdq)); case Inst::kIdVroundpd: case Inst::kIdVroundps: @@ -758,57 +758,57 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_explainConst( case Inst::kIdRoundps: case Inst::kIdRoundsd: case Inst::kIdRoundss: - return FormatterInternal_formatImmBits(sb, u8, vroundxx, ASMJIT_ARRAY_SIZE(vroundxx)); + return FormatterInternal_format_imm_bits(sb, u8, vroundxx, ASMJIT_ARRAY_SIZE(vroundxx)); case Inst::kIdVshufpd: case Inst::kIdShufpd: - return FormatterInternal_formatImmText(sb, u8, 1, 2, vshufpd, Support::min(vec_size / 8, 8)); + return FormatterInternal_format_imm_text(sb, u8, 1, 2, vshufpd, Support::min(vec_size / 8, 8)); case Inst::kIdVshufps: case Inst::kIdShufps: - return FormatterInternal_formatImmText(sb, u8, 2, 4, vshufps, 4); + return FormatterInternal_format_imm_text(sb, u8, 2, 4, vshufps, 4); case Inst::kIdVcvtps2ph: - return FormatterInternal_formatImmBits(sb, u8, vroundxx, 1); + return FormatterInternal_format_imm_bits(sb, u8, vroundxx, 1); case Inst::kIdVperm2f128: case Inst::kIdVperm2i128: - return FormatterInternal_formatImmBits(sb, u8, vperm2x128, ASMJIT_ARRAY_SIZE(vperm2x128)); + return FormatterInternal_format_imm_bits(sb, u8, vperm2x128, ASMJIT_ARRAY_SIZE(vperm2x128)); case Inst::kIdVpermilpd: - return FormatterInternal_formatImmShuf(sb, u8, 1, vec_size / 8); + return FormatterInternal_format_imm_shuf(sb, u8, 1, vec_size / 8); case Inst::kIdVpermilps: - return FormatterInternal_formatImmShuf(sb, u8, 2, 4); + return FormatterInternal_format_imm_shuf(sb, u8, 2, 4); case Inst::kIdVpshufd: case Inst::kIdPshufd: - return FormatterInternal_formatImmShuf(sb, u8, 2, 4); + return FormatterInternal_format_imm_shuf(sb, u8, 2, 4); case Inst::kIdVpshufhw: case Inst::kIdVpshuflw: case Inst::kIdPshufhw: case Inst::kIdPshuflw: case 
Inst::kIdPshufw: - return FormatterInternal_formatImmShuf(sb, u8, 2, 4); + return FormatterInternal_format_imm_shuf(sb, u8, 2, 4); case Inst::kIdVfixupimmpd: case Inst::kIdVfixupimmps: case Inst::kIdVfixupimmsd: case Inst::kIdVfixupimmss: - return FormatterInternal_formatImmBits(sb, u8, vfixupimmxx, ASMJIT_ARRAY_SIZE(vfixupimmxx)); + return FormatterInternal_format_imm_bits(sb, u8, vfixupimmxx, ASMJIT_ARRAY_SIZE(vfixupimmxx)); case Inst::kIdVfpclasspd: case Inst::kIdVfpclassps: case Inst::kIdVfpclasssd: case Inst::kIdVfpclassss: - return FormatterInternal_formatImmBits(sb, u8, vfpclassxx, ASMJIT_ARRAY_SIZE(vfpclassxx)); + return FormatterInternal_format_imm_bits(sb, u8, vfpclassxx, ASMJIT_ARRAY_SIZE(vfpclassxx)); case Inst::kIdVgetmantpd: case Inst::kIdVgetmantps: case Inst::kIdVgetmantsd: case Inst::kIdVgetmantss: - return FormatterInternal_formatImmBits(sb, u8, vgetmantxx, ASMJIT_ARRAY_SIZE(vgetmantxx)); + return FormatterInternal_format_imm_bits(sb, u8, vgetmantxx, ASMJIT_ARRAY_SIZE(vgetmantxx)); case Inst::kIdVpcmpb: case Inst::kIdVpcmpd: @@ -818,7 +818,7 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_explainConst( case Inst::kIdVpcmpud: case Inst::kIdVpcmpuq: case Inst::kIdVpcmpuw: - return FormatterInternal_formatImmText(sb, u8, 3, 0, vpcmpx); + return FormatterInternal_format_imm_text(sb, u8, 3, 0, vpcmpx); case Inst::kIdVpcomb: case Inst::kIdVpcomd: @@ -828,21 +828,21 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_explainConst( case Inst::kIdVpcomud: case Inst::kIdVpcomuq: case Inst::kIdVpcomuw: - return FormatterInternal_formatImmText(sb, u8, 3, 0, vpcomx); + return FormatterInternal_format_imm_text(sb, u8, 3, 0, vpcomx); case Inst::kIdVpermq: case Inst::kIdVpermpd: - return FormatterInternal_formatImmShuf(sb, u8, 2, 4); + return FormatterInternal_format_imm_shuf(sb, u8, 2, 4); case Inst::kIdVpternlogd: case Inst::kIdVpternlogq: - return FormatterInternal_formatImmShuf(sb, u8, 1, 8); + return FormatterInternal_format_imm_shuf(sb, u8, 1, 8); case Inst::kIdVrangepd: case Inst::kIdVrangeps: case Inst::kIdVrangesd: case Inst::kIdVrangess: - return FormatterInternal_formatImmBits(sb, u8, vrangexx, ASMJIT_ARRAY_SIZE(vrangexx)); + return FormatterInternal_format_imm_bits(sb, u8, vrangexx, ASMJIT_ARRAY_SIZE(vrangexx)); case Inst::kIdVreducepd: case Inst::kIdVreduceps: @@ -852,7 +852,7 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_explainConst( case Inst::kIdVrndscaleps: case Inst::kIdVrndscalesd: case Inst::kIdVrndscaless: - return FormatterInternal_formatImmBits(sb, u8, vreducexx_vrndscalexx, ASMJIT_ARRAY_SIZE(vreducexx_vrndscalexx)); + return FormatterInternal_format_imm_bits(sb, u8, vreducexx_vrndscalexx, ASMJIT_ARRAY_SIZE(vreducexx_vrndscalexx)); case Inst::kIdVshuff32x4: case Inst::kIdVshuff64x2: @@ -860,7 +860,7 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_explainConst( case Inst::kIdVshufi64x2: { uint32_t count = Support::max(vec_size / 16, 2u); uint32_t bits = count <= 2 ? 1u : 2u; - return FormatterInternal_formatImmShuf(sb, u8, bits, count); + return FormatterInternal_format_imm_shuf(sb, u8, bits, count); } default: @@ -969,7 +969,7 @@ ASMJIT_FAVOR_SIZE Error FormatterInternal::format_instruction( vec_size = Support::max(vec_size, operands[j].as().size()); } } - ASMJIT_PROPAGATE(FormatterInternal_explainConst(sb, format_flags, inst_id, vec_size, op.as())); + ASMJIT_PROPAGATE(FormatterInternal_explain_const(sb, format_flags, inst_id, vec_size, op.as())); } // Support AVX-512 masking - {k}{z}. 
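
Before the test-suite reorganization below, a short usage sketch of the x86::Compiler register factories introduced in the x86compiler.h hunk above (new_gp32(), new_gp_ptr(), new_vec128_f32x4(), new_k32(), and friends). Only the factory names come from this patch; the surrounding function, register names, and the emitted instruction sequence are illustrative assumptions, not part of the change:

  #include <asmjit/x86.h>

  using namespace asmjit;

  // Minimal sketch: create typed virtual registers using the new width-based factories.
  static void emit_example(x86::Compiler& cc) {
    x86::Gp acc  = cc.new_gp32("acc");         // 32-bit general purpose register.
    x86::Gp ptr  = cc.new_gp_ptr("ptr");       // Pointer-sized (32-bit or 64-bit) register.
    x86::Vec v0  = cc.new_vec128_f32x4("v0");  // 128-bit XMM register typed as packed f32.
    x86::KReg k0 = cc.new_k32("k0");           // 32-bit AVX-512 mask register.

    cc.xor_(acc, acc);                         // acc = 0.
    cc.mov(ptr, 0);                            // ptr = nullptr (placeholder value).
    cc.xorps(v0, v0);                          // v0 = {0.0f, 0.0f, 0.0f, 0.0f}.
    Support::maybe_unused(k0);                 // The mask register is left unused in this sketch.
  }

The apparent benefit of replacing the ASMJIT_NEW_REG_* macros with plain documented templates is that the factories remain behaviorally identical while becoming visible to documentation generators and IDE completion.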
diff --git a/test/asmjit_bench_codegen.cpp b/testing/bench/asmjit_bench_codegen.cpp similarity index 97% rename from test/asmjit_bench_codegen.cpp rename to testing/bench/asmjit_bench_codegen.cpp index b11503b..3f4933a 100644 --- a/test/asmjit_bench_codegen.cpp +++ b/testing/bench/asmjit_bench_codegen.cpp @@ -8,8 +8,8 @@ #include #include -#include "asmjitutils.h" -#include "cmdline.h" +#include "../commons/asmjitutils.h" +#include "../commons/cmdline.h" using namespace asmjit; diff --git a/test/asmjit_bench_codegen.h b/testing/bench/asmjit_bench_codegen.h similarity index 97% rename from test/asmjit_bench_codegen.h rename to testing/bench/asmjit_bench_codegen.h index cecbdc4..9df289c 100644 --- a/test/asmjit_bench_codegen.h +++ b/testing/bench/asmjit_bench_codegen.h @@ -7,8 +7,9 @@ #define ASMJIT_TEST_PERF_H_INCLUDED #include -#include "asmjitutils.h" -#include "performancetimer.h" + +#include "../commons/asmjitutils.h" +#include "../commons/performancetimer.h" namespace asmjit_perf_utils { diff --git a/test/asmjit_bench_codegen_a64.cpp b/testing/bench/asmjit_bench_codegen_a64.cpp similarity index 100% rename from test/asmjit_bench_codegen_a64.cpp rename to testing/bench/asmjit_bench_codegen_a64.cpp diff --git a/test/asmjit_bench_codegen_x86.cpp b/testing/bench/asmjit_bench_codegen_x86.cpp similarity index 99% rename from test/asmjit_bench_codegen_x86.cpp rename to testing/bench/asmjit_bench_codegen_x86.cpp index 775be71..0620931 100644 --- a/test/asmjit_bench_codegen_x86.cpp +++ b/testing/bench/asmjit_bench_codegen_x86.cpp @@ -13,7 +13,7 @@ #include #include "asmjit_bench_codegen.h" -#include "asmjit_test_misc.h" +#include "../tests/asmjit_test_misc.h" using namespace asmjit; diff --git a/test/asmjit_bench_overhead.cpp b/testing/bench/asmjit_bench_overhead.cpp similarity index 99% rename from test/asmjit_bench_overhead.cpp rename to testing/bench/asmjit_bench_overhead.cpp index 6b7d0ca..a964c30 100644 --- a/test/asmjit_bench_overhead.cpp +++ b/testing/bench/asmjit_bench_overhead.cpp @@ -1,9 +1,9 @@ #include #include -#include "asmjitutils.h" -#include "cmdline.h" -#include "performancetimer.h" +#include "../commons/asmjitutils.h" +#include "../commons/cmdline.h" +#include "../commons/performancetimer.h" using namespace asmjit; diff --git a/test/asmjit_bench_regalloc.cpp b/testing/bench/asmjit_bench_regalloc.cpp similarity index 98% rename from test/asmjit_bench_regalloc.cpp rename to testing/bench/asmjit_bench_regalloc.cpp index f3ac0bd..32379cd 100644 --- a/test/asmjit_bench_regalloc.cpp +++ b/testing/bench/asmjit_bench_regalloc.cpp @@ -20,13 +20,12 @@ #include #include -#include "asmjitutils.h" +#include "../commons/asmjitutils.h" #if !defined(ASMJIT_NO_COMPILER) - #include "cmdline.h" - #include "performancetimer.h" - #include "asmjit_test_compiler.h" - #include "asmjit_test_random.h" + #include "../commons/cmdline.h" + #include "../commons/performancetimer.h" + #include "../commons/random.h" #endif using namespace asmjit; diff --git a/test/asmjitutils.h b/testing/commons/asmjitutils.h similarity index 89% rename from test/asmjitutils.h rename to testing/commons/asmjitutils.h index 791faef..87c05c1 100644 --- a/test/asmjitutils.h +++ b/testing/commons/asmjitutils.h @@ -88,6 +88,8 @@ static void print_cpu_info() noexcept { // CPU Features // ------------ + using asmjit::CpuHints; + #ifndef ASMJIT_NO_LOGGING printf("CPU Features:\n"); asmjit::CpuFeatures::Iterator it(cpu.features().iterator()); @@ -99,6 +101,27 @@ static void print_cpu_info() noexcept { }; printf("\n"); #endif // 
!ASMJIT_NO_LOGGING + + // CPU Hints + // --------- + + printf("CPU Hints:\n"); + auto print_hint = [&](CpuHints hint, const char* name) { + if ((cpu.hints() & hint) != CpuHints::kNone) { + printf(" %s\n", name); + } + }; + + print_hint(CpuHints::kVecMaskedOps8 , "VecMaskedOps8" ); + print_hint(CpuHints::kVecMaskedOps16 , "VecMaskedOps16" ); + print_hint(CpuHints::kVecMaskedOps32 , "VecMaskedOps32" ); + print_hint(CpuHints::kVecMaskedOps64 , "VecMaskedOps64" ); + print_hint(CpuHints::kVecFastIntMul32, "VecFastIntMul32"); + print_hint(CpuHints::kVecFastIntMul64, "VecFastIntMul64"); + print_hint(CpuHints::kVecFastGather , "VecFastGather" ); + print_hint(CpuHints::kVecMaskedStore , "VecMaskedStore" ); + + printf("\n"); } [[maybe_unused]] diff --git a/test/cmdline.h b/testing/commons/cmdline.h similarity index 100% rename from test/cmdline.h rename to testing/commons/cmdline.h diff --git a/test/performancetimer.h b/testing/commons/performancetimer.h similarity index 100% rename from test/performancetimer.h rename to testing/commons/performancetimer.h diff --git a/test/asmjit_test_random.h b/testing/commons/random.h similarity index 93% rename from test/asmjit_test_random.h rename to testing/commons/random.h index d085ab3..27f99f6 100644 --- a/test/asmjit_test_random.h +++ b/testing/commons/random.h @@ -3,8 +3,8 @@ // See or LICENSE.md for license and copyright information // SPDX-License-Identifier: Zlib -#ifndef ASMJIT_TEST_RANDOM_H_INCLUDED -#define ASMJIT_TEST_RANDOM_H_INCLUDED +#ifndef TESTING_COMMONS_RANDOM_H_INCLUDED +#define TESTING_COMMONS_RANDOM_H_INCLUDED #include #include @@ -74,4 +74,4 @@ public: } // {anonymous} } // {TestUtils} -#endif // ASMJIT_TEST_RANDOM_H_INCLUDED +#endif // TESTING_COMMONS_RANDOM_H_INCLUDED diff --git a/test/asmjit_test_assembler.cpp b/testing/tests/asmjit_test_assembler.cpp similarity index 97% rename from test/asmjit_test_assembler.cpp rename to testing/tests/asmjit_test_assembler.cpp index d92a2b8..32dfd1e 100644 --- a/test/asmjit_test_assembler.cpp +++ b/testing/tests/asmjit_test_assembler.cpp @@ -8,11 +8,11 @@ #include #include -#include "asmjitutils.h" -#include "cmdline.h" - #include "asmjit_test_assembler.h" +#include "../commons/asmjitutils.h" +#include "../commons/cmdline.h" + using namespace asmjit; #if !defined(ASMJIT_NO_X86) diff --git a/test/asmjit_test_assembler.h b/testing/tests/asmjit_test_assembler.h similarity index 100% rename from test/asmjit_test_assembler.h rename to testing/tests/asmjit_test_assembler.h diff --git a/test/asmjit_test_assembler_a64.cpp b/testing/tests/asmjit_test_assembler_a64.cpp similarity index 99% rename from test/asmjit_test_assembler_a64.cpp rename to testing/tests/asmjit_test_assembler_a64.cpp index 566f927..7354291 100644 --- a/test/asmjit_test_assembler_a64.cpp +++ b/testing/tests/asmjit_test_assembler_a64.cpp @@ -12,7 +12,6 @@ #include #include "asmjit_test_assembler.h" -#include "cmdline.h" using namespace asmjit; diff --git a/test/asmjit_test_assembler_x64.cpp b/testing/tests/asmjit_test_assembler_x64.cpp similarity index 99% rename from test/asmjit_test_assembler_x64.cpp rename to testing/tests/asmjit_test_assembler_x64.cpp index 7aaae91..7de00e2 100644 --- a/test/asmjit_test_assembler_x64.cpp +++ b/testing/tests/asmjit_test_assembler_x64.cpp @@ -12,7 +12,6 @@ #include #include "asmjit_test_assembler.h" -#include "cmdline.h" using namespace asmjit; diff --git a/test/asmjit_test_assembler_x86.cpp b/testing/tests/asmjit_test_assembler_x86.cpp similarity index 99% rename from test/asmjit_test_assembler_x86.cpp 
rename to testing/tests/asmjit_test_assembler_x86.cpp index b855d99..0362060 100644 --- a/test/asmjit_test_assembler_x86.cpp +++ b/testing/tests/asmjit_test_assembler_x86.cpp @@ -12,7 +12,6 @@ #include #include "asmjit_test_assembler.h" -#include "cmdline.h" using namespace asmjit; diff --git a/test/asmjit_test_compiler.cpp b/testing/tests/asmjit_test_compiler.cpp similarity index 99% rename from test/asmjit_test_compiler.cpp rename to testing/tests/asmjit_test_compiler.cpp index be4cbbb..b5940b0 100644 --- a/test/asmjit_test_compiler.cpp +++ b/testing/tests/asmjit_test_compiler.cpp @@ -15,10 +15,10 @@ #if !defined(ASMJIT_NO_COMPILER) -#include "cmdline.h" -#include "performancetimer.h" +#include "../commons/asmjitutils.h" +#include "../commons/cmdline.h" +#include "../commons/performancetimer.h" -#include "asmjitutils.h" #include "asmjit_test_compiler.h" #if !defined(ASMJIT_NO_X86) diff --git a/test/asmjit_test_compiler.h b/testing/tests/asmjit_test_compiler.h similarity index 100% rename from test/asmjit_test_compiler.h rename to testing/tests/asmjit_test_compiler.h diff --git a/test/asmjit_test_compiler_a64.cpp b/testing/tests/asmjit_test_compiler_a64.cpp similarity index 99% rename from test/asmjit_test_compiler_a64.cpp rename to testing/tests/asmjit_test_compiler_a64.cpp index 188e857..460999f 100644 --- a/test/asmjit_test_compiler_a64.cpp +++ b/testing/tests/asmjit_test_compiler_a64.cpp @@ -11,7 +11,7 @@ #include #include -#include "./asmjit_test_compiler.h" +#include "asmjit_test_compiler.h" using namespace asmjit; diff --git a/test/asmjit_test_compiler_x86.cpp b/testing/tests/asmjit_test_compiler_x86.cpp similarity index 100% rename from test/asmjit_test_compiler_x86.cpp rename to testing/tests/asmjit_test_compiler_x86.cpp diff --git a/test/asmjit_test_emitters.cpp b/testing/tests/asmjit_test_emitters.cpp similarity index 99% rename from test/asmjit_test_emitters.cpp rename to testing/tests/asmjit_test_emitters.cpp index ed3e27e..897436b 100644 --- a/test/asmjit_test_emitters.cpp +++ b/testing/tests/asmjit_test_emitters.cpp @@ -8,7 +8,7 @@ #include #include -#include "asmjitutils.h" +#include "../commons/asmjitutils.h" #if ASMJIT_ARCH_X86 != 0 #include diff --git a/test/asmjit_test_environment.cpp b/testing/tests/asmjit_test_environment.cpp similarity index 99% rename from test/asmjit_test_environment.cpp rename to testing/tests/asmjit_test_environment.cpp index 963d391..a3b86e1 100644 --- a/test/asmjit_test_environment.cpp +++ b/testing/tests/asmjit_test_environment.cpp @@ -13,7 +13,7 @@ #include #endif -#include "asmjitutils.h" +#include "../commons/asmjitutils.h" using namespace asmjit; diff --git a/test/asmjit_test_instinfo.cpp b/testing/tests/asmjit_test_instinfo.cpp similarity index 99% rename from test/asmjit_test_instinfo.cpp rename to testing/tests/asmjit_test_instinfo.cpp index c69664d..8c794c8 100644 --- a/test/asmjit_test_instinfo.cpp +++ b/testing/tests/asmjit_test_instinfo.cpp @@ -10,7 +10,8 @@ #endif #include -#include "asmjitutils.h" + +#include "../commons/asmjitutils.h" using namespace asmjit; diff --git a/test/asmjit_test_misc.h b/testing/tests/asmjit_test_misc.h similarity index 100% rename from test/asmjit_test_misc.h rename to testing/tests/asmjit_test_misc.h diff --git a/test/asmjit_test_unit.cpp b/testing/tests/asmjit_test_runner.cpp similarity index 97% rename from test/asmjit_test_unit.cpp rename to testing/tests/asmjit_test_runner.cpp index 0903947..ed831b4 100644 --- a/test/asmjit_test_unit.cpp +++ b/testing/tests/asmjit_test_runner.cpp @@ -6,15 +6,15 @@ 
#include #if !defined(ASMJIT_NO_X86) -#include + #include #endif #if !defined(ASMJIT_NO_AARCH64) -#include + #include #endif -#include "asmjitutils.h" #include "broken.h" +#include "../commons/asmjitutils.h" #if !defined(ASMJIT_NO_COMPILER) #include diff --git a/test/asmjit_test_unicompiler.cpp b/testing/tests/asmjit_test_unicompiler.cpp similarity index 92% rename from test/asmjit_test_unicompiler.cpp rename to testing/tests/asmjit_test_unicompiler.cpp index ba45f3e..501332f 100644 --- a/test/asmjit_test_unicompiler.cpp +++ b/testing/tests/asmjit_test_unicompiler.cpp @@ -9,8 +9,8 @@ #include #include -#include "asmjitutils.h" -#include "asmjit_test_random.h" +#include "../commons/asmjitutils.h" +#include "../commons/random.h" static void print_app_info() noexcept { printf("AsmJit UniCompiler Test Suite v%u.%u.%u [Arch=%s] [Mode=%s]\n\n", @@ -50,6 +50,7 @@ float fadd(float a, float b) noexcept; float fsub(float a, float b) noexcept; float fmul(float a, float b) noexcept; float fdiv(float a, float b) noexcept; +float fsqrt(float a) noexcept; float fmadd_nofma_ref(float a, float b, float c) noexcept; float fmadd_fma_ref(float a, float b, float c) noexcept; @@ -57,6 +58,7 @@ double fadd(double a, double b) noexcept; double fsub(double a, double b) noexcept; double fmul(double a, double b) noexcept; double fdiv(double a, double b) noexcept; +double fsqrt(double a) noexcept; double fmadd_nofma_ref(double a, double b, double c) noexcept; double fmadd_fma_ref(double a, double b, double c) noexcept; @@ -68,6 +70,7 @@ static ASMJIT_NOINLINE float fadd(float a, float b) noexcept { return a + b; } static ASMJIT_NOINLINE float fsub(float a, float b) noexcept { return a - b; } static ASMJIT_NOINLINE float fmul(float a, float b) noexcept { return a * b; } static ASMJIT_NOINLINE float fdiv(float a, float b) noexcept { return a / b; } +static ASMJIT_NOINLINE float fsqrt(float a) noexcept { return std::sqrt(a); } static ASMJIT_NOINLINE float fmadd_nofma_ref(float a, float b, float c) noexcept { return a * b + c; } static ASMJIT_NOINLINE float fmadd_fma_ref(float a, float b, float c) noexcept { return std::fma(a, b, c); } @@ -75,11 +78,18 @@ static ASMJIT_NOINLINE double fadd(double a, double b) noexcept { return a + b; static ASMJIT_NOINLINE double fsub(double a, double b) noexcept { return a - b; } static ASMJIT_NOINLINE double fmul(double a, double b) noexcept { return a * b; } static ASMJIT_NOINLINE double fdiv(double a, double b) noexcept { return a / b; } +static ASMJIT_NOINLINE double fsqrt(double a) noexcept { return std::sqrt(a); } static ASMJIT_NOINLINE double fmadd_nofma_ref(double a, double b, double c) noexcept { return fadd(fmul(a, b), c); } static ASMJIT_NOINLINE double fmadd_fma_ref(double a, double b, double c) noexcept { return std::fma(a, b, c); } #endif +static ASMJIT_INLINE float fsign(float a) noexcept { return Support::bit_cast(Support::bit_cast(a) & (uint32_t(1) << 31)); } +static ASMJIT_INLINE double fsign(double a) noexcept { return Support::bit_cast(Support::bit_cast(a) & (uint64_t(1) << 63)); } + +static ASMJIT_INLINE float fxor(float a, float b) noexcept { return Support::bit_cast(Support::bit_cast(a) ^ Support::bit_cast(b)); } +static ASMJIT_INLINE double fxor(double a, double b) noexcept { return Support::bit_cast(Support::bit_cast(a) ^ Support::bit_cast(b)); } + // ujit::UniCompiler - Tests - Types // ================================= @@ -115,14 +125,14 @@ typedef void (*TestVVVVFunc)(void* dst, const void* src1, const void* src2, cons // ujit::UniCompiler - Tests - JIT 
Context Error Handler // ===================================================== -class TestErrorHandler : public asmjit::ErrorHandler { +class TestErrorHandler : public ErrorHandler { public: TestErrorHandler() noexcept {} ~TestErrorHandler() noexcept override {} - void handle_error(asmjit::Error err, const char* message, asmjit::BaseEmitter* origin) override { + void handle_error(Error err, const char* message, BaseEmitter* origin) override { Support::maybe_unused(origin); - EXPECT_EQ(err, asmjit::Error::kOk) + EXPECT_EQ(err, Error::kOk) .message("AsmJit Error: %s", message); } }; @@ -131,16 +141,16 @@ public: class JitContext { public: - asmjit::JitRuntime rt; - asmjit::CpuFeatures features {}; - UniOptFlags opt_flags {}; + JitRuntime rt; + CpuFeatures features {}; + CpuHints cpu_hints {}; #if !defined(ASMJIT_NO_LOGGING) - asmjit::StringLogger logger; + StringLogger logger; #endif // !ASMJIT_NO_LOGGING TestErrorHandler eh; - asmjit::CodeHolder code; + CodeHolder code; BackendCompiler cc; void prepare() noexcept { @@ -154,16 +164,16 @@ public: #endif // !ASMJIT_NO_LOGGING code.attach(&cc); - cc.add_diagnostic_options(asmjit::DiagnosticOptions::kRAAnnotate); - cc.add_diagnostic_options(asmjit::DiagnosticOptions::kValidateAssembler); - cc.add_diagnostic_options(asmjit::DiagnosticOptions::kValidateIntermediate); + cc.add_diagnostic_options(DiagnosticOptions::kRAAnnotate); + cc.add_diagnostic_options(DiagnosticOptions::kValidateAssembler); + cc.add_diagnostic_options(DiagnosticOptions::kValidateIntermediate); } template Fn finish() noexcept { Fn fn; - EXPECT_EQ(cc.finalize(), asmjit::Error::kOk); - EXPECT_EQ(rt.add(&fn, &code), asmjit::Error::kOk); + EXPECT_EQ(cc.finalize(), Error::kOk); + EXPECT_EQ(rt.add(&fn, &code), Error::kOk); code.reset(); return fn; } @@ -180,9 +190,9 @@ public: static TestCondRRFunc create_func_cond_rr(JitContext& ctx, UniOpCond op, CondCode cond_code, uint32_t variation) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(VecWidth::k128); @@ -200,7 +210,7 @@ static TestCondRRFunc create_func_cond_rr(JitContext& ctx, UniOpCond op, CondCod // Test a conditional branch based on the given condition. 
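The JitContext refactor above drops the explicit asmjit:: qualification and swaps UniOptFlags for CpuHints, but the lifecycle stays the same: attach a Compiler to a CodeHolder, install an ErrorHandler that turns any emitted error into a test failure, then finalize and hand the code to the JitRuntime. A hedged sketch of that lifecycle, using only calls visible in this hunk (handle_error, finalize, rt.add, code.reset) and assuming the snake_case asmjit API used throughout this diff:

// Sketch of the error-handling + finish() pattern used by JitContext above.
// Assumes asmjit's snake_case API as shown in the diff; not a drop-in copy.
#include <asmjit/core.h>
#include <cstdio>
#include <cstdlib>

using namespace asmjit;

class FailFastErrorHandler : public ErrorHandler {
public:
  // Any error reported by an emitter aborts the run immediately.
  void handle_error(Error err, const char* message, BaseEmitter* origin) override {
    (void)err;
    (void)origin;
    std::fprintf(stderr, "AsmJit error: %s\n", message);
    std::abort();
  }
};

// Fn is expected to be a plain function-pointer type; CompilerT is the
// backend compiler (x86/a64), kept generic so the sketch stays portable.
template<typename Fn, typename CompilerT>
Fn finish(JitRuntime& rt, CodeHolder& code, CompilerT& cc) noexcept {
  Fn fn = nullptr;
  // finalize() serializes the compiled function into the CodeHolder,
  // rt.add() copies it into executable memory and returns a callable pointer.
  if (cc.finalize() != Error::kOk) return nullptr;
  if (rt.add(&fn, &code) != Error::kOk) return nullptr;
  code.reset(); // The CodeHolder can be reused for the next test function.
  return fn;
}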
Label done = pc.new_label(); pc.mov(result, 1); - pc.j(done, Condition(op, cond_code, a, b)); + pc.j(done, UniCondition(op, cond_code, a, b)); pc.mov(result, 0); pc.bind(done); break; @@ -211,7 +221,7 @@ static TestCondRRFunc create_func_cond_rr(JitContext& ctx, UniOpCond op, CondCod Gp true_value = pc.new_gp32("true_value"); pc.mov(result, 0); pc.mov(true_value, 1); - pc.cmov(result, true_value, Condition(op, cond_code, a, b)); + pc.cmov(result, true_value, UniCondition(op, cond_code, a, b)); break; } @@ -221,7 +231,7 @@ static TestCondRRFunc create_func_cond_rr(JitContext& ctx, UniOpCond op, CondCod Gp true_value = pc.new_gp32("true_value"); pc.mov(false_value, 0); pc.mov(true_value, 1); - pc.select(result, true_value, false_value, Condition(op, cond_code, a, b)); + pc.select(result, true_value, false_value, UniCondition(op, cond_code, a, b)); break; } } @@ -234,9 +244,9 @@ static TestCondRRFunc create_func_cond_rr(JitContext& ctx, UniOpCond op, CondCod static TestCondRIFunc create_func_cond_ri(JitContext& ctx, UniOpCond op, CondCode cond_code, Imm bImm) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(VecWidth::k128); @@ -248,7 +258,7 @@ static TestCondRIFunc create_func_cond_ri(JitContext& ctx, UniOpCond op, CondCod node->set_arg(0, a); pc.mov(result, 1); - pc.j(done, Condition(op, cond_code, a, bImm)); + pc.j(done, UniCondition(op, cond_code, a, bImm)); pc.mov(result, 0); pc.bind(done); ctx.cc.ret(result); @@ -552,9 +562,9 @@ static ASMJIT_NOINLINE void test_cond_ops(JitContext& ctx) noexcept { static TestMFunc create_func_m(JitContext& ctx, UniOpM op) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(VecWidth::k128); @@ -619,9 +629,9 @@ static ASMJIT_NOINLINE void test_m_ops(JitContext& ctx) noexcept { static TestRMFunc create_func_rm(JitContext& ctx, UniOpRM op) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(VecWidth::k128); @@ -735,9 +745,9 @@ static ASMJIT_NOINLINE void test_rm_ops(JitContext& ctx) noexcept { static TestMRFunc create_func_mr(JitContext& ctx, UniOpMR op) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(VecWidth::k128); @@ -849,9 +859,9 @@ static ASMJIT_NOINLINE void test_mr_ops(JitContext& ctx) noexcept { static TestRRFunc create_func_rr(JitContext& ctx, UniOpRR op) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = 
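create_func_cond_rr() exercises the same UniCondition through three materialization strategies: a conditional branch, a conditional move, and a select. In plain C++ the three variations compute the identical result; a small reference sketch with a hypothetical "a < b" condition (names are illustrative, not part of the suite):

// Reference semantics of the three variations tested above.
#include <cstdint>
#include <cstdio>

static uint32_t via_branch(uint32_t a, uint32_t b) {
  uint32_t result = 1;            // pc.mov(result, 1)
  if (a < b) goto done;           // pc.j(done, UniCondition(...))
  result = 0;                     // pc.mov(result, 0)
done:
  return result;                  // pc.bind(done); ret(result)
}

static uint32_t via_cmov(uint32_t a, uint32_t b) {
  uint32_t result = 0;            // pc.mov(result, 0)
  uint32_t true_value = 1;        // pc.mov(true_value, 1)
  if (a < b) result = true_value; // pc.cmov(result, true_value, UniCondition(...))
  return result;
}

static uint32_t via_select(uint32_t a, uint32_t b) {
  uint32_t false_value = 0;
  uint32_t true_value = 1;
  return (a < b) ? true_value : false_value; // pc.select(result, true_value, false_value, ...)
}

int main() {
  std::printf("%u %u %u\n", via_branch(1, 2), via_cmov(1, 2), via_select(2, 1)); // prints: 1 1 0
  return 0;
}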
ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(VecWidth::k128); @@ -929,9 +939,9 @@ static ASMJIT_NOINLINE void test_rr_ops(JitContext& ctx) noexcept { static TestRRRFunc create_func_rrr(JitContext& ctx, UniOpRRR op) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(VecWidth::k128); @@ -953,9 +963,9 @@ static TestRRRFunc create_func_rrr(JitContext& ctx, UniOpRRR op) noexcept { static TestRRIFunc create_func_rri(JitContext& ctx, UniOpRRR op, Imm bImm) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(VecWidth::k128); @@ -1161,9 +1171,9 @@ static constexpr uint32_t kNumVariationsVV_Broadcast = 4; static TestVVFunc create_func_vv(JitContext& ctx, VecWidth vw, UniOpVV op, Variation variation = Variation{0}) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(vw); @@ -1175,7 +1185,7 @@ static TestVVFunc create_func_vv(JitContext& ctx, VecWidth vw, UniOpVV op, Varia node->set_arg(0, dst_ptr); node->set_arg(1, src_ptr); - Vec dst_vec = pc.new_vec(vw, "dst_vec"); + Vec dst_vec = pc.new_vec_with_width(vw, "dst_vec"); // There are some instructions that fill the high part of the register, so just zero the destination to make // sure that we can test this function (that the low part is actually zeroed and doesn't contain garbage). @@ -1233,7 +1243,7 @@ static TestVVFunc create_func_vv(JitContext& ctx, VecWidth vw, UniOpVV op, Varia pc.emit_2v(op, dst_vec, dst_vec); } else { - Vec src_vec = pc.new_vec(vw, "src_vec"); + Vec src_vec = pc.new_vec_with_width(vw, "src_vec"); pc.v_loaduvec(src_vec, mem_ptr(src_ptr)); pc.emit_2v(op, dst_vec, src_vec); } @@ -1252,9 +1262,9 @@ static constexpr uint32_t kNumVariationsVVI = 3; static TestVVFunc create_func_vvi(JitContext& ctx, VecWidth vw, UniOpVVI op, uint32_t imm, Variation variation = Variation{0}) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(vw); @@ -1266,14 +1276,14 @@ static TestVVFunc create_func_vvi(JitContext& ctx, VecWidth vw, UniOpVVI op, uin node->set_arg(0, dst_ptr); node->set_arg(1, src_ptr); - Vec src_vec = pc.new_vec(vw, "src_vec"); + Vec src_vec = pc.new_vec_with_width(vw, "src_vec"); switch (variation.value) { default: case 0: { // There are some instructions that fill the high part of the register, so just zero the destination to make // sure that we can test this function (that the low part is actually zeroed and doesn't contain garbage). 
- Vec dst_vec = pc.new_vec(vw, "dst_vec"); + Vec dst_vec = pc.new_vec_with_width(vw, "dst_vec"); pc.v_zero_i(dst_vec); pc.v_loaduvec(src_vec, mem_ptr(src_ptr)); @@ -1291,7 +1301,7 @@ static TestVVFunc create_func_vvi(JitContext& ctx, VecWidth vw, UniOpVVI op, uin } case 2: { - Vec dst_vec = pc.new_vec(vw, "dst_vec"); + Vec dst_vec = pc.new_vec_with_width(vw, "dst_vec"); pc.emit_2vi(op, dst_vec, mem_ptr(src_ptr), imm); pc.v_storeuvec(mem_ptr(dst_ptr), dst_vec); break; @@ -1312,9 +1322,9 @@ static constexpr uint32_t kNumVariationsVVV = 5; static TestVVVFunc create_func_vvv(JitContext& ctx, VecWidth vw, UniOpVVV op, Variation variation = Variation{0}) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(vw); @@ -1328,15 +1338,15 @@ static TestVVVFunc create_func_vvv(JitContext& ctx, VecWidth vw, UniOpVVV op, Va node->set_arg(1, src1_ptr); node->set_arg(2, src2_ptr); - Vec src1_vec = pc.new_vec(vw, "src1_vec"); - Vec src2_vec = pc.new_vec(vw, "src2_vec"); + Vec src1_vec = pc.new_vec_with_width(vw, "src1_vec"); + Vec src2_vec = pc.new_vec_with_width(vw, "src2_vec"); switch (variation.value) { default: case 0: { // There are some instructions that fill the high part of the register, so just zero the destination to make // sure that we can test this function (that the low part is actually zeroed and doesn't contain garbage). - Vec dst_vec = pc.new_vec(vw, "dst_vec"); + Vec dst_vec = pc.new_vec_with_width(vw, "dst_vec"); pc.v_zero_i(dst_vec); pc.v_loaduvec(src1_vec, mem_ptr(src1_ptr)); @@ -1366,7 +1376,7 @@ static TestVVVFunc create_func_vvv(JitContext& ctx, VecWidth vw, UniOpVVV op, Va } case 3: { - Vec dst_vec = pc.new_vec(vw, "dst_vec"); + Vec dst_vec = pc.new_vec_with_width(vw, "dst_vec"); pc.v_zero_i(dst_vec); pc.v_loaduvec(src1_vec, mem_ptr(src1_ptr)); @@ -1393,9 +1403,9 @@ static constexpr uint32_t kNumVariationsVVVI = 5; static TestVVVFunc create_func_vvvi(JitContext& ctx, VecWidth vw, UniOpVVVI op, uint32_t imm, Variation variation = Variation{0}) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(vw); @@ -1409,15 +1419,15 @@ static TestVVVFunc create_func_vvvi(JitContext& ctx, VecWidth vw, UniOpVVVI op, node->set_arg(1, src1_ptr); node->set_arg(2, src2_ptr); - Vec src1_vec = pc.new_vec(vw, "src1_vec"); - Vec src2_vec = pc.new_vec(vw, "src2_vec"); + Vec src1_vec = pc.new_vec_with_width(vw, "src1_vec"); + Vec src2_vec = pc.new_vec_with_width(vw, "src2_vec"); switch (variation.value) { default: case 0: { // There are some instructions that fill the high part of the register, so just zero the destination to make // sure that we can test this function (that the low part is actually zeroed and doesn't contain garbage). 
- Vec dst_vec = pc.new_vec(vw, "dst_vec"); + Vec dst_vec = pc.new_vec_with_width(vw, "dst_vec"); pc.v_zero_i(dst_vec); pc.v_loaduvec(src1_vec, mem_ptr(src1_ptr)); @@ -1447,7 +1457,7 @@ static TestVVVFunc create_func_vvvi(JitContext& ctx, VecWidth vw, UniOpVVVI op, } case 3: { - Vec dst_vec = pc.new_vec(vw, "dst_vec"); + Vec dst_vec = pc.new_vec_with_width(vw, "dst_vec"); pc.v_zero_i(dst_vec); pc.v_loaduvec(src1_vec, mem_ptr(src1_ptr)); @@ -1479,9 +1489,9 @@ static constexpr uint32_t kNumVariationsVVVV = 4; static TestVVVVFunc create_func_vvvv(JitContext& ctx, VecWidth vw, UniOpVVVV op, Variation variation = Variation{0}) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(vw); @@ -1497,9 +1507,9 @@ static TestVVVVFunc create_func_vvvv(JitContext& ctx, VecWidth vw, UniOpVVVV op, node->set_arg(2, src2_ptr); node->set_arg(3, src3_ptr); - Vec src1_vec = pc.new_vec(vw, "src1_vec"); - Vec src2_vec = pc.new_vec(vw, "src2_vec"); - Vec src3_vec = pc.new_vec(vw, "src3_vec"); + Vec src1_vec = pc.new_vec_with_width(vw, "src1_vec"); + Vec src2_vec = pc.new_vec_with_width(vw, "src2_vec"); + Vec src3_vec = pc.new_vec_with_width(vw, "src3_vec"); pc.v_loaduvec(src1_vec, mem_ptr(src1_ptr)); pc.v_loaduvec(src2_vec, mem_ptr(src2_ptr)); @@ -1510,7 +1520,7 @@ static TestVVVVFunc create_func_vvvv(JitContext& ctx, VecWidth vw, UniOpVVVV op, case 0: { // There are some instructions that fill the high part of the register, so just zero the destination to make // sure that we can test this function (that the low part is actually zeroed and doesn't contain garbage). 
- Vec dst_vec = pc.new_vec(vw, "dst_vec"); + Vec dst_vec = pc.new_vec_with_width(vw, "dst_vec"); pc.v_zero_i(dst_vec); pc.emit_4v(op, dst_vec, src1_vec, src2_vec, src3_vec); @@ -1847,8 +1857,14 @@ static const char* vec_op_name_vv(UniOpVV op) noexcept { case UniOpVV::kCvtI32HiToI64 : return "v_cvt_i32_hi_to_i64"; case UniOpVV::kCvtU32LoToU64 : return "v_cvt_u32_lo_to_u64"; case UniOpVV::kCvtU32HiToU64 : return "v_cvt_u32_hi_to_u64"; + case UniOpVV::kAbsF32S : return "s_abs_f32"; + case UniOpVV::kAbsF64S : return "s_abs_f64"; case UniOpVV::kAbsF32 : return "v_abs_f32"; case UniOpVV::kAbsF64 : return "v_abs_f64"; + case UniOpVV::kNegF32S : return "s_neg_f32"; + case UniOpVV::kNegF64S : return "s_neg_f64"; + case UniOpVV::kNegF32 : return "v_neg_f32"; + case UniOpVV::kNegF64 : return "v_neg_f64"; case UniOpVV::kNotF32 : return "v_not_f32"; case UniOpVV::kNotF64 : return "v_not_f64"; case UniOpVV::kTruncF32S : return "v_trunc_f32s"; @@ -1863,10 +1879,18 @@ static const char* vec_op_name_vv(UniOpVV op) noexcept { case UniOpVV::kCeilF64S : return "v_ceil_f64s"; case UniOpVV::kCeilF32 : return "v_ceil_f32"; case UniOpVV::kCeilF64 : return "v_ceil_f64"; - case UniOpVV::kRoundF32S : return "v_round_f32s"; - case UniOpVV::kRoundF64S : return "v_round_f64s"; - case UniOpVV::kRoundF32 : return "v_round_f32"; - case UniOpVV::kRoundF64 : return "v_round_f64"; + case UniOpVV::kRoundEvenF32S : return "v_round_even_f32s"; + case UniOpVV::kRoundEvenF64S : return "v_round_even_f64s"; + case UniOpVV::kRoundEvenF32 : return "v_round_even_f32"; + case UniOpVV::kRoundEvenF64 : return "v_round_even_f64"; + case UniOpVV::kRoundHalfAwayF32S : return "v_round_half_away_f32s"; + case UniOpVV::kRoundHalfAwayF64S : return "v_round_half_away_f64s"; + case UniOpVV::kRoundHalfAwayF32 : return "v_round_half_away_f32"; + case UniOpVV::kRoundHalfAwayF64 : return "v_round_half_away_f64"; + case UniOpVV::kRoundHalfUpF32S : return "v_round_half_up_f32s"; + case UniOpVV::kRoundHalfUpF64S : return "v_round_half_up_f64s"; + case UniOpVV::kRoundHalfUpF32 : return "v_round_half_up_f32"; + case UniOpVV::kRoundHalfUpF64 : return "v_round_half_up_f64"; case UniOpVV::kRcpF32 : return "v_rcp_f32"; case UniOpVV::kRcpF64 : return "v_rcp_f64"; case UniOpVV::kSqrtF32S : return "v_sqrt_f32s"; @@ -1888,10 +1912,9 @@ static const char* vec_op_name_vv(UniOpVV op) noexcept { case UniOpVV::kCvtRoundF32ToI32 : return "v_cvt_round_f32_to_i32"; case UniOpVV::kCvtRoundF64ToI32Lo: return "v_cvt_round_f64_to_i32_lo"; case UniOpVV::kCvtRoundF64ToI32Hi: return "v_cvt_round_f64_to_i32_hi"; - - default: - ASMJIT_NOT_REACHED(); } + + ASMJIT_NOT_REACHED(); } static VecOpInfo vec_op_info_vv(UniOpVV op) noexcept { @@ -1936,8 +1959,14 @@ static VecOpInfo vec_op_info_vv(UniOpVV op) noexcept { case UniOpVV::kCvtI32HiToI64 : return VecOpInfo::make(VE::kInt64, VE::kInt32); case UniOpVV::kCvtU32LoToU64 : return VecOpInfo::make(VE::kUInt64, VE::kUInt32); case UniOpVV::kCvtU32HiToU64 : return VecOpInfo::make(VE::kUInt64, VE::kUInt32); + case UniOpVV::kAbsF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); + case UniOpVV::kAbsF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); case UniOpVV::kAbsF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); case UniOpVV::kAbsF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); + case UniOpVV::kNegF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); + case UniOpVV::kNegF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); + case UniOpVV::kNegF32 : return VecOpInfo::make(VE::kFloat32, 
VE::kFloat32); + case UniOpVV::kNegF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); case UniOpVV::kNotF32 : return VecOpInfo::make(VE::kUInt32, VE::kUInt32); case UniOpVV::kNotF64 : return VecOpInfo::make(VE::kUInt64, VE::kUInt64); case UniOpVV::kTruncF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); @@ -1952,10 +1981,18 @@ static VecOpInfo vec_op_info_vv(UniOpVV op) noexcept { case UniOpVV::kCeilF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); case UniOpVV::kCeilF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); case UniOpVV::kCeilF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); - case UniOpVV::kRoundF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); - case UniOpVV::kRoundF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); - case UniOpVV::kRoundF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); - case UniOpVV::kRoundF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); + case UniOpVV::kRoundEvenF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); + case UniOpVV::kRoundEvenF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); + case UniOpVV::kRoundEvenF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); + case UniOpVV::kRoundEvenF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); + case UniOpVV::kRoundHalfAwayF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); + case UniOpVV::kRoundHalfAwayF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); + case UniOpVV::kRoundHalfAwayF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); + case UniOpVV::kRoundHalfAwayF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); + case UniOpVV::kRoundHalfUpF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); + case UniOpVV::kRoundHalfUpF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); + case UniOpVV::kRoundHalfUpF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); + case UniOpVV::kRoundHalfUpF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); case UniOpVV::kRcpF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); case UniOpVV::kRcpF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); case UniOpVV::kSqrtF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); @@ -1977,10 +2014,9 @@ static VecOpInfo vec_op_info_vv(UniOpVV op) noexcept { case UniOpVV::kCvtRoundF32ToI32 : return VecOpInfo::make(VE::kInt32, VE::kFloat32); case UniOpVV::kCvtRoundF64ToI32Lo: return VecOpInfo::make(VE::kInt32, VE::kFloat64); case UniOpVV::kCvtRoundF64ToI32Hi: return VecOpInfo::make(VE::kInt32, VE::kFloat64); - - default: - ASMJIT_NOT_REACHED(); } + + ASMJIT_NOT_REACHED(); } static const char* vec_op_name_vvi(UniOpVVI op) noexcept { @@ -2030,43 +2066,48 @@ static const char* vec_op_name_vvi(UniOpVVI op) noexcept { case UniOpVVI::kSrlnHiU32 : return "v_srln_hi_u32"; case UniOpVVI::kSrlnLoU64 : return "v_srln_lo_u64"; case UniOpVVI::kSrlnHiU64 : return "v_srln_hi_u64"; + case UniOpVVI::kSrlnRndLoU16 : return "v_srln_rnd_lo_u16"; + case UniOpVVI::kSrlnRndHiU16 : return "v_srln_rnd_hi_u16"; + case UniOpVVI::kSrlnRndLoU32 : return "v_srln_rnd_lo_u32"; + case UniOpVVI::kSrlnRndHiU32 : return "v_srln_rnd_hi_u32"; + case UniOpVVI::kSrlnRndLoU64 : return "v_srln_rnd_lo_u64"; + case UniOpVVI::kSrlnRndHiU64 : return "v_srln_rnd_hi_u64"; #endif // ASMJIT_UJIT_AARCH64 - - default: - ASMJIT_NOT_REACHED(); } + + ASMJIT_NOT_REACHED(); } static VecOpInfo vec_op_info_vvi(UniOpVVI op) noexcept { using VE = VecElementType; switch (op) { - case UniOpVVI::kSllU16 : return VecOpInfo::make(VE::kUInt16, 
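A pattern change worth noting in these lookup tables: the "default: ASMJIT_NOT_REACHED();" arm is removed and the unreachable marker now sits after the switch. With no default and a case for every enumerator, the compiler can warn when a newly added UniOpVV/UniOpVVI value is missing a case, which is presumably the motivation. A minimal illustration of the shape (generic placeholder names, not the real enums):

// Illustration of the "no default, unreachable after the switch" shape used above.
// MyOp, op_name() and not_reached() are placeholders, not AsmJit symbols.
#include <cstdlib>

enum class MyOp { kAdd, kSub };

[[noreturn]] static void not_reached() { std::abort(); } // stand-in for ASMJIT_NOT_REACHED()

static const char* op_name(MyOp op) {
  switch (op) {
    case MyOp::kAdd: return "add";
    case MyOp::kSub: return "sub";
    // No default: -Wswitch (GCC/Clang) now flags any enumerator added later
    // that is not handled here.
  }
  not_reached(); // Only reachable if 'op' holds an out-of-range value.
}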
VE::kUInt16); - case UniOpVVI::kSllU32 : return VecOpInfo::make(VE::kUInt32, VE::kUInt32); - case UniOpVVI::kSllU64 : return VecOpInfo::make(VE::kUInt64, VE::kUInt64); - case UniOpVVI::kSrlU16 : return VecOpInfo::make(VE::kUInt16, VE::kUInt16); - case UniOpVVI::kSrlU32 : return VecOpInfo::make(VE::kUInt32, VE::kUInt32); - case UniOpVVI::kSrlU64 : return VecOpInfo::make(VE::kUInt64, VE::kUInt64); - case UniOpVVI::kSraI16 : return VecOpInfo::make(VE::kInt16, VE::kInt16); - case UniOpVVI::kSraI32 : return VecOpInfo::make(VE::kInt32, VE::kInt32); - case UniOpVVI::kSraI64 : return VecOpInfo::make(VE::kInt64, VE::kInt64); - case UniOpVVI::kSllbU128 : return VecOpInfo::make(VE::kUInt8, VE::kUInt8); - case UniOpVVI::kSrlbU128 : return VecOpInfo::make(VE::kUInt8, VE::kUInt8); - case UniOpVVI::kSwizzleU16x4 : return VecOpInfo::make(VE::kUInt16, VE::kUInt16); - case UniOpVVI::kSwizzleLoU16x4 : return VecOpInfo::make(VE::kUInt16, VE::kUInt16); - case UniOpVVI::kSwizzleHiU16x4 : return VecOpInfo::make(VE::kUInt16, VE::kUInt16); - case UniOpVVI::kSwizzleU32x4 : return VecOpInfo::make(VE::kUInt32, VE::kUInt32); - case UniOpVVI::kSwizzleU64x2 : return VecOpInfo::make(VE::kUInt64, VE::kUInt64); + case UniOpVVI::kSllU16 : return VecOpInfo::make(VE::kUInt16 , VE::kUInt16); + case UniOpVVI::kSllU32 : return VecOpInfo::make(VE::kUInt32 , VE::kUInt32); + case UniOpVVI::kSllU64 : return VecOpInfo::make(VE::kUInt64 , VE::kUInt64); + case UniOpVVI::kSrlU16 : return VecOpInfo::make(VE::kUInt16 , VE::kUInt16); + case UniOpVVI::kSrlU32 : return VecOpInfo::make(VE::kUInt32 , VE::kUInt32); + case UniOpVVI::kSrlU64 : return VecOpInfo::make(VE::kUInt64 , VE::kUInt64); + case UniOpVVI::kSraI16 : return VecOpInfo::make(VE::kInt16 , VE::kInt16); + case UniOpVVI::kSraI32 : return VecOpInfo::make(VE::kInt32 , VE::kInt32); + case UniOpVVI::kSraI64 : return VecOpInfo::make(VE::kInt64 , VE::kInt64); + case UniOpVVI::kSllbU128 : return VecOpInfo::make(VE::kUInt8 , VE::kUInt8); + case UniOpVVI::kSrlbU128 : return VecOpInfo::make(VE::kUInt8 , VE::kUInt8); + case UniOpVVI::kSwizzleU16x4 : return VecOpInfo::make(VE::kUInt16 , VE::kUInt16); + case UniOpVVI::kSwizzleLoU16x4 : return VecOpInfo::make(VE::kUInt16 , VE::kUInt16); + case UniOpVVI::kSwizzleHiU16x4 : return VecOpInfo::make(VE::kUInt16 , VE::kUInt16); + case UniOpVVI::kSwizzleU32x4 : return VecOpInfo::make(VE::kUInt32 , VE::kUInt32); + case UniOpVVI::kSwizzleU64x2 : return VecOpInfo::make(VE::kUInt64 , VE::kUInt64); case UniOpVVI::kSwizzleF32x4 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); case UniOpVVI::kSwizzleF64x2 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); - case UniOpVVI::kSwizzleU64x4 : return VecOpInfo::make(VE::kUInt64, VE::kUInt64); + case UniOpVVI::kSwizzleU64x4 : return VecOpInfo::make(VE::kUInt64 , VE::kUInt64); case UniOpVVI::kSwizzleF64x4 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); - case UniOpVVI::kExtractV128_I32: return VecOpInfo::make(VE::kInt32, VE::kInt32); - case UniOpVVI::kExtractV128_I64: return VecOpInfo::make(VE::kInt64, VE::kInt64); + case UniOpVVI::kExtractV128_I32: return VecOpInfo::make(VE::kInt32 , VE::kInt32); + case UniOpVVI::kExtractV128_I64: return VecOpInfo::make(VE::kInt64 , VE::kInt64); case UniOpVVI::kExtractV128_F32: return VecOpInfo::make(VE::kFloat32, VE::kFloat32); case UniOpVVI::kExtractV128_F64: return VecOpInfo::make(VE::kFloat64, VE::kFloat64); - case UniOpVVI::kExtractV256_I32: return VecOpInfo::make(VE::kUInt32, VE::kUInt32); - case UniOpVVI::kExtractV256_I64: return VecOpInfo::make(VE::kUInt64, 
VE::kUInt64); + case UniOpVVI::kExtractV256_I32: return VecOpInfo::make(VE::kUInt32 , VE::kUInt32); + case UniOpVVI::kExtractV256_I64: return VecOpInfo::make(VE::kUInt64 , VE::kUInt64); case UniOpVVI::kExtractV256_F32: return VecOpInfo::make(VE::kFloat32, VE::kFloat32); case UniOpVVI::kExtractV256_F64: return VecOpInfo::make(VE::kFloat64, VE::kFloat64); @@ -2086,11 +2127,16 @@ static VecOpInfo vec_op_info_vvi(UniOpVVI op) noexcept { case UniOpVVI::kSrlnHiU32 : return VecOpInfo::make(VE::kUInt16, VE::kUInt16); case UniOpVVI::kSrlnLoU64 : return VecOpInfo::make(VE::kUInt32, VE::kUInt32); case UniOpVVI::kSrlnHiU64 : return VecOpInfo::make(VE::kUInt64, VE::kUInt64); + case UniOpVVI::kSrlnRndLoU16 : return VecOpInfo::make(VE::kUInt8 , VE::kUInt16); + case UniOpVVI::kSrlnRndHiU16 : return VecOpInfo::make(VE::kUInt8 , VE::kUInt16); + case UniOpVVI::kSrlnRndLoU32 : return VecOpInfo::make(VE::kUInt16, VE::kUInt32); + case UniOpVVI::kSrlnRndHiU32 : return VecOpInfo::make(VE::kUInt16, VE::kUInt32); + case UniOpVVI::kSrlnRndLoU64 : return VecOpInfo::make(VE::kUInt32, VE::kUInt64); + case UniOpVVI::kSrlnRndHiU64 : return VecOpInfo::make(VE::kUInt32, VE::kUInt64); #endif // ASMJIT_UJIT_AARCH64 - - default: - ASMJIT_NOT_REACHED(); } + + ASMJIT_NOT_REACHED(); } static const char* vec_op_name_vvv(UniOpVVV op) noexcept { @@ -2208,6 +2254,10 @@ static const char* vec_op_name_vvv(UniOpVVV op) noexcept { case UniOpVVV::kDivF64S : return "v_div_f64s"; case UniOpVVV::kDivF32 : return "v_div_f32"; case UniOpVVV::kDivF64 : return "v_div_f64"; + case UniOpVVV::kModF32S : return "v_mod_f32s"; + case UniOpVVV::kModF64S : return "v_mod_f64s"; + case UniOpVVV::kModF32 : return "v_mod_f32"; + case UniOpVVV::kModF64 : return "v_mod_f64"; case UniOpVVV::kMinF32S : return "v_min_f32s"; case UniOpVVV::kMinF64S : return "v_min_f64s"; case UniOpVVV::kMinF32 : return "v_min_f32"; @@ -2298,9 +2348,15 @@ static const char* vec_op_name_vvv(UniOpVVV op) noexcept { case UniOpVVV::kMAddwHiU32 : return "v_maddw_hi_u32"; #endif // ASMJIT_UJIT_AARCH64 - default: - ASMJIT_NOT_REACHED(); +#if defined(ASMJIT_UJIT_X86) + case UniOpVVV::kPermuteU8 : return "v_permute_u8"; + case UniOpVVV::kPermuteU16 : return "v_permute_u16"; + case UniOpVVV::kPermuteU32 : return "v_permute_u32"; + case UniOpVVV::kPermuteU64 : return "v_permute_u64"; +#endif // ASMJIT_UJIT_X86 } + + ASMJIT_NOT_REACHED(); } static VecOpInfo vec_op_info_vvv(UniOpVVV op) noexcept { @@ -2420,6 +2476,10 @@ static VecOpInfo vec_op_info_vvv(UniOpVVV op) noexcept { case UniOpVVV::kDivF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64, VE::kFloat64); case UniOpVVV::kDivF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32, VE::kFloat32); case UniOpVVV::kDivF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64, VE::kFloat64); + case UniOpVVV::kModF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32, VE::kFloat32); + case UniOpVVV::kModF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64, VE::kFloat64); + case UniOpVVV::kModF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32, VE::kFloat32); + case UniOpVVV::kModF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64, VE::kFloat64); case UniOpVVV::kMinF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32, VE::kFloat32); case UniOpVVV::kMinF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64, VE::kFloat64); case UniOpVVV::kMinF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32, VE::kFloat32); @@ -2510,9 +2570,15 @@ static VecOpInfo vec_op_info_vvv(UniOpVVV op) noexcept { case UniOpVVV::kMAddwHiU32 : 
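The new kSrlnRnd* entries narrow the element type (for example u16 to u8), which suggests a rounding shift-right-narrow in the AArch64 RSHRN style: add half of the discarded range before shifting so the result rounds to nearest instead of truncating. The scalar sketch below is an assumption about the intended semantics, not code taken from the suite:

// Scalar reference of a rounding shift-right-narrow (assumed semantics of the
// new kSrlnRnd* ops, based on their element types). Requires 1 <= n <= 8 here.
#include <cstdint>
#include <cstdio>

static uint8_t srln_rnd_u16(uint16_t x, unsigned n) {
  // Adding 2^(n-1) rounds the shifted-out bits to nearest; then shift and narrow.
  uint32_t rounded = (uint32_t(x) + (1u << (n - 1))) >> n;
  return uint8_t(rounded);
}

int main() {
  std::printf("%u %u\n", srln_rnd_u16(0x00FF, 4), srln_rnd_u16(0x00F7, 4)); // prints: 16 15
  return 0;
}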
return VecOpInfo::make(VE::kUInt64, VE::kUInt32, VE::kUInt32); #endif // ASMJIT_UJIT_AARCH64 - default: - ASMJIT_NOT_REACHED(); +#if defined(ASMJIT_UJIT_X86) + case UniOpVVV::kPermuteU8 : return VecOpInfo::make(VE::kUInt8, VE::kUInt8, VE::kUInt8); + case UniOpVVV::kPermuteU16 : return VecOpInfo::make(VE::kUInt16, VE::kUInt16, VE::kUInt16); + case UniOpVVV::kPermuteU32 : return VecOpInfo::make(VE::kUInt32, VE::kUInt32, VE::kUInt32); + case UniOpVVV::kPermuteU64 : return VecOpInfo::make(VE::kUInt64, VE::kUInt64, VE::kUInt64); +#endif // ASMJIT_UJIT_X86 } + + ASMJIT_NOT_REACHED(); } static const char* vec_op_name_vvvi(UniOpVVVI op) noexcept { @@ -2530,10 +2596,9 @@ static const char* vec_op_name_vvvi(UniOpVVVI op) noexcept { case UniOpVVVI::kInsertV256_F32 : return "v_insert_v256_f32"; case UniOpVVVI::kInsertV256_U64 : return "v_insert_v256_u64"; case UniOpVVVI::kInsertV256_F64 : return "v_insert_v256_f64"; - - default: - ASMJIT_NOT_REACHED(); } + + ASMJIT_NOT_REACHED(); } static VecOpInfo vec_op_info_vvvi(UniOpVVVI op) noexcept { @@ -2580,10 +2645,9 @@ static const char* vec_op_name_vvvv(UniOpVVVV op) noexcept { case UniOpVVVV::kNMSubF64S: return "v_nmsub_f64s"; case UniOpVVVV::kNMSubF32 : return "v_nmsub_f32"; case UniOpVVVV::kNMSubF64 : return "v_nmsub_f64"; - - default: - ASMJIT_NOT_REACHED(); } + + ASMJIT_NOT_REACHED(); } static VecOpInfo vec_op_info_vvvv(UniOpVVVV op) noexcept { @@ -2609,49 +2673,55 @@ static VecOpInfo vec_op_info_vvvv(UniOpVVVV op) noexcept { case UniOpVVVV::kNMSubF64S: return VecOpInfo::make(VE::kFloat64, VE::kFloat64, VE::kFloat64, VE::kFloat64); case UniOpVVVV::kNMSubF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32, VE::kFloat32, VE::kFloat32); case UniOpVVVV::kNMSubF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64, VE::kFloat64, VE::kFloat64); - - default: - ASMJIT_NOT_REACHED(); } + + ASMJIT_NOT_REACHED(); } // ujit::UniCompiler - Tests - SIMD - Float To Int - Machine Behavior // ================================================================== -#if defined(ASMJIT_UJIT_X86) -static ASMJIT_INLINE_NODEBUG int32_t cvt_non_finite_f32_to_i32([[maybe_unused]] float x) noexcept { return INT32_MIN; } -static ASMJIT_INLINE_NODEBUG int32_t cvt_non_finite_f64_to_i32([[maybe_unused]] double x) noexcept { return INT32_MIN; } -#endif // ASMJIT_UJIT_X86 +template +static ASMJIT_INLINE_NODEBUG int32_t cvt_float_to_int_trunc(const FloatT& x) noexcept { + constexpr IntT min_value = std::numeric_limits::lowest(); + constexpr IntT max_value = std::numeric_limits::max(); + constexpr IntT zero = IntT(0); -#if defined(ASMJIT_UJIT_AARCH64) -static constexpr int32_t kPInfToInt32 = INT32_MAX; -static constexpr int32_t kNInfToInt32 = INT32_MIN; -static constexpr int32_t kNaNToInt32 = 0; + if (std::isnan(x)) { + return behavior == FloatToIntOutsideRangeBehavior::kSmallestValue ? min_value : zero; + } -static ASMJIT_INLINE_NODEBUG int32_t cvt_non_finite_f32_to_i32(float x) noexcept { - if (x == std::numeric_limits::infinity()) { - return kPInfToInt32; + if (x < FloatT(min_value)) { + return min_value; } - else if (x == -std::numeric_limits::infinity()) { - return kNInfToInt32; - } - else { - return kNaNToInt32; + + if (x > FloatT(max_value)) { + return behavior == FloatToIntOutsideRangeBehavior::kSmallestValue ? 
min_value : max_value; } + + return IntT(x); } -static ASMJIT_INLINE_NODEBUG int32_t cvt_non_finite_f64_to_i32(double x) noexcept { - if (x == std::numeric_limits::infinity()) { - return kPInfToInt32; +template +static ASMJIT_INLINE_NODEBUG int32_t cvt_float_to_int_round(const FloatT& x) noexcept { + constexpr IntT min_value = std::numeric_limits::lowest(); + constexpr IntT max_value = std::numeric_limits::max(); + constexpr IntT zero = IntT(0); + + if (std::isnan(x)) { + return behavior == FloatToIntOutsideRangeBehavior::kSmallestValue ? min_value : zero; } - else if (x == -std::numeric_limits::infinity()) { - return kNInfToInt32; + + if (x < FloatT(min_value)) { + return min_value; } - else { - return kNaNToInt32; + + if (x > FloatT(max_value)) { + return behavior == FloatToIntOutsideRangeBehavior::kSmallestValue ? min_value : max_value; } + + return IntT(std::nearbyint(x)); } -#endif // ASMJIT_UJIT_AARCH64 // ujit::UniCompiler - Tests - SIMD - Data Generators & Constraints // ================================================================ @@ -2737,6 +2807,7 @@ public: case 66: return 0.1f; case 69: return 0.2f; case 79: return 0.3f; + case 89: return -13005961.0f; case 99: return -std::numeric_limits::infinity(); case 100: case 102: @@ -2799,6 +2870,7 @@ public: case 66: return 0.1; case 69: return 0.2; case 79: return 0.3; + case 80: return 4503599627370495.5; case 99: return -std::numeric_limits::infinity(); case 100: case 102: @@ -2813,6 +2885,7 @@ public: case 122: return 10.3; case 123: return 20.3; case 124: return -100.3; + case 125: return 4503599627370496.0; case 127: return 1.3; case 130: return std::numeric_limits::quiet_NaN(); case 135: return -std::numeric_limits::infinity(); @@ -2824,6 +2897,7 @@ public: case 165: return -0.5; case 175: return -1.0; case 245: return 2.5; + case 248: return -4503599627370495.5; default: { double sign = rng.next_uint32() < 0x7FFFFFF ? 1.0 : -1.0; @@ -2833,6 +2907,12 @@ public: } }; +template +struct half_minus_1ulp_const; + +template<> struct half_minus_1ulp_const { static inline constexpr float value = 0.49999997f; }; +template<> struct half_minus_1ulp_const { static inline constexpr double value = 0.49999999999999994; }; + // Some SIMD operations are constrained, especially those higher level. So, to successfully test these we // have to model the constraints in a way that the SIMD instruction we test actually gets the correct input. // Note that a constraint doesn't have to be always range based, it could be anything. 
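The new cvt_float_to_int_trunc()/cvt_float_to_int_round() templates above replace the per-architecture cvt_non_finite_* helpers with a single reference parameterized on FloatToIntOutsideRangeBehavior: kSmallestValue matches the x86 CVTTSS2SI/CVTTSD2SI "integer indefinite" result (INT32_MIN for NaN and out-of-range inputs), while the saturating behavior matches AArch64-style conversion (saturate, NaN to 0). A standalone restatement specialized to float to int32_t; the enum and function names here are local stand-ins, only the enumerator names follow the diff:

// Sketch of the truncating reference conversion, safe against the 2^31 corner
// (float(INT32_MAX) rounds up to 2^31, hence the >= comparison below).
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

enum class OutsideRange { kSmallestValue, kSaturatedValue };

template<OutsideRange behavior>
static int32_t cvt_trunc_f32_to_i32(float x) {
  constexpr int32_t min_value = std::numeric_limits<int32_t>::lowest();
  constexpr int32_t max_value = std::numeric_limits<int32_t>::max();

  if (std::isnan(x))
    return behavior == OutsideRange::kSmallestValue ? min_value : 0;
  if (x >= float(max_value))
    return behavior == OutsideRange::kSmallestValue ? min_value : max_value;
  if (x < float(min_value))
    return min_value;
  return int32_t(x); // In-range: plain truncation.
}

int main() {
  float inf = std::numeric_limits<float>::infinity();
  std::printf("%d %d\n",
              cvt_trunc_f32_to_i32<OutsideRange::kSmallestValue>(inf),   // -2147483648
              cvt_trunc_f32_to_i32<OutsideRange::kSaturatedValue>(inf)); //  2147483647
  return 0;
}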
@@ -3326,16 +3406,24 @@ template struct vec_op_ceil : public op_each_vv> { static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::ceil(a); } }; -template struct vec_op_round : public op_each_vv> { +template struct vec_op_round_even : public op_each_vv> { static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::nearbyint(a); } }; +template struct vec_op_round_half_away : public op_each_vv> { + static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::trunc(fadd(a, fxor(half_minus_1ulp_const::value, fsign(a)))); } +}; + +template struct vec_op_round_half_up : public op_each_vv> { + static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::floor(fadd(a, half_minus_1ulp_const::value)); } +}; + template struct vec_op_sqrt : public op_each_vv> { - static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::sqrt(a); } + static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return fsqrt(a); } }; template struct vec_op_rcp : public op_each_vv> { - static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return T(1) / a; } + static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return fdiv(T(1), a); } }; struct vec_op_cvt_i32_to_f32 { @@ -3403,130 +3491,74 @@ struct vec_op_cvt_i32_to_f64_impl { struct vec_op_cvt_i32_lo_to_f64 : public vec_op_cvt_i32_to_f64_impl {}; struct vec_op_cvt_i32_hi_to_f64 : public vec_op_cvt_i32_to_f64_impl {}; +template struct vec_op_cvt_trunc_f32_to_i32 { - static ASMJIT_INLINE int32_t cvt(float val) noexcept { - if (!std::isfinite(val)) - return cvt_non_finite_f32_to_i32(val); - - if (val <= float(INT32_MIN)) { - return INT32_MIN; - } - else if (val >= float(INT32_MAX)) { - return INT32_MAX; - } - else { - return int32_t(val); - } - } - template static ASMJIT_INLINE VecOverlay apply(const VecOverlay& a) noexcept { VecOverlay out{}; for (uint32_t off = 0; off < kW; off += 16) { - out.data_i32[off / 4 + 0] = cvt(a.data_f32[off / 4 + 0]); - out.data_i32[off / 4 + 1] = cvt(a.data_f32[off / 4 + 1]); - out.data_i32[off / 4 + 2] = cvt(a.data_f32[off / 4 + 2]); - out.data_i32[off / 4 + 3] = cvt(a.data_f32[off / 4 + 3]); + out.data_i32[off / 4 + 0] = cvt_float_to_int_trunc(a.data_f32[off / 4 + 0]); + out.data_i32[off / 4 + 1] = cvt_float_to_int_trunc(a.data_f32[off / 4 + 1]); + out.data_i32[off / 4 + 2] = cvt_float_to_int_trunc(a.data_f32[off / 4 + 2]); + out.data_i32[off / 4 + 3] = cvt_float_to_int_trunc(a.data_f32[off / 4 + 3]); } return out; } }; -template +template struct vec_op_cvt_trunc_f64_to_i32_impl { - static ASMJIT_INLINE int32_t cvt(double val) noexcept { - if (!std::isfinite(val)) { - return cvt_non_finite_f64_to_i32(val); - } - - if (val <= double(INT32_MIN)) { - return INT32_MIN; - } - else if (val >= double(INT32_MAX)) { - return INT32_MAX; - } - else { - return int32_t(val); - } - } - template static ASMJIT_INLINE VecOverlay apply(const VecOverlay& a) noexcept { VecOverlay out{}; uint32_t adj = kHi ? 
kW / 8 : 0u; for (uint32_t off = 0; off < kW; off += 16) { - out.data_i32[off / 8 + adj + 0] = cvt(a.data_f64[off / 8 + 0]); - out.data_i32[off / 8 + adj + 1] = cvt(a.data_f64[off / 8 + 1]); + out.data_i32[off / 8 + adj + 0] = cvt_float_to_int_trunc(a.data_f64[off / 8 + 0]); + out.data_i32[off / 8 + adj + 1] = cvt_float_to_int_trunc(a.data_f64[off / 8 + 1]); } return out; } }; -struct vec_op_cvt_trunc_f64_to_i32_lo : vec_op_cvt_trunc_f64_to_i32_impl {}; -struct vec_op_cvt_trunc_f64_to_i32_hi : vec_op_cvt_trunc_f64_to_i32_impl {}; +template +struct vec_op_cvt_trunc_f64_to_i32_lo : vec_op_cvt_trunc_f64_to_i32_impl {}; +template +struct vec_op_cvt_trunc_f64_to_i32_hi : vec_op_cvt_trunc_f64_to_i32_impl {}; + +template struct vec_op_cvt_round_f32_to_i32 { - static ASMJIT_INLINE int32_t cvt(float val) noexcept { - if (!std::isfinite(val)) { - return cvt_non_finite_f32_to_i32(val); - } - - if (val <= float(INT32_MIN)) { - return INT32_MIN; - } - else if (val >= float(INT32_MAX)) { - return INT32_MAX; - } - else { - return static_cast(std::nearbyint(val)); - } - } - template static ASMJIT_INLINE VecOverlay apply(const VecOverlay& a) noexcept { VecOverlay out{}; for (uint32_t off = 0; off < kW; off += 16) { - out.data_i32[off / 4 + 0] = cvt(a.data_f32[off / 4 + 0]); - out.data_i32[off / 4 + 1] = cvt(a.data_f32[off / 4 + 1]); - out.data_i32[off / 4 + 2] = cvt(a.data_f32[off / 4 + 2]); - out.data_i32[off / 4 + 3] = cvt(a.data_f32[off / 4 + 3]); + out.data_i32[off / 4 + 0] = cvt_float_to_int_round(a.data_f32[off / 4 + 0]); + out.data_i32[off / 4 + 1] = cvt_float_to_int_round(a.data_f32[off / 4 + 1]); + out.data_i32[off / 4 + 2] = cvt_float_to_int_round(a.data_f32[off / 4 + 2]); + out.data_i32[off / 4 + 3] = cvt_float_to_int_round(a.data_f32[off / 4 + 3]); } return out; } }; -template +template struct vec_op_cvt_round_f64_to_i32_impl { - static ASMJIT_INLINE int32_t cvt(double val) noexcept { - if (!std::isfinite(val)) { - return cvt_non_finite_f64_to_i32(val); - } - - if (val <= double(INT32_MIN)) { - return INT32_MIN; - } - else if (val >= double(INT32_MAX)) { - return INT32_MAX; - } - else { - return static_cast(std::nearbyint(val)); - } - } - template static ASMJIT_INLINE VecOverlay apply(const VecOverlay& a) noexcept { VecOverlay out{}; uint32_t adj = kHi ? 
kW / 8 : 0u; for (uint32_t off = 0; off < kW; off += 16) { - out.data_i32[off / 8 + adj + 0] = cvt(a.data_f64[off / 8 + 0]); - out.data_i32[off / 8 + adj + 1] = cvt(a.data_f64[off / 8 + 1]); + out.data_i32[off / 8 + adj + 0] = cvt_float_to_int_round(a.data_f64[off / 8 + 0]); + out.data_i32[off / 8 + adj + 1] = cvt_float_to_int_round(a.data_f64[off / 8 + 1]); } return out; } }; -struct vec_op_cvt_round_f64_to_i32_lo : vec_op_cvt_round_f64_to_i32_impl {}; -struct vec_op_cvt_round_f64_to_i32_hi : vec_op_cvt_round_f64_to_i32_impl {}; +template +struct vec_op_cvt_round_f64_to_i32_lo : vec_op_cvt_round_f64_to_i32_impl {}; +template +struct vec_op_cvt_round_f64_to_i32_hi : vec_op_cvt_round_f64_to_i32_impl {}; struct scalar_op_cvt_f32_to_f64 { template @@ -3558,12 +3590,20 @@ template struct scalar_op_ceil : public op_scal static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::ceil(a); } }; -template struct scalar_op_round : public op_scalar_vv> { +template struct scalar_op_round_even : public op_scalar_vv> { static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::nearbyint(a); } }; +template struct scalar_op_round_half_away : public op_scalar_vv> { + static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::trunc(fadd(a, fxor(half_minus_1ulp_const::value, fsign(a)))); } +}; + +template struct scalar_op_round_half_up : public op_scalar_vv> { + static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::floor(fadd(a, half_minus_1ulp_const::value)); } +}; + template struct scalar_op_sqrt : public op_scalar_vv> { - static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::sqrt(a); } + static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return fsqrt(a); } }; // ujit::UniCompiler - Tests - Generic Operations - VVI @@ -4715,15 +4755,17 @@ template static ASMJIT_NOINLINE void test_simd_ops(JitContext& ctx) noexcept { // We need to know some behaviors in advance so we can select the right test function, // so create a dummy compiler and extract the necessary information from it. 
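The round_even / round_half_away / round_half_up references above all build on the half_minus_1ulp_const constants (0.49999997f and 0.49999999999999994). Using "0.5 minus one ulp" rather than 0.5 presumably keeps the addition itself from rounding inputs just below 0.5 up across the integer boundary. A worked restatement using std::copysign instead of the fsign/fxor bit tricks (a simplification, not the suite's exact code):

// Worked example of the three rounding references; assumes the default
// FE_TONEAREST rounding mode for std::nearbyint.
#include <cmath>
#include <cstdio>

static constexpr float kHalfMinus1Ulp = 0.49999997f; // 0.5f minus one ulp, as in the diff

static float round_even(float a)      { return std::nearbyint(a); }
static float round_half_away(float a) { return std::trunc(a + std::copysign(kHalfMinus1Ulp, a)); }
static float round_half_up(float a)   { return std::floor(a + kHalfMinus1Ulp); }

int main() {
  const float values[] = { 0.5f, 2.5f, -1.5f };
  for (float v : values) {
    std::printf("%5.1f -> even=%.0f half_away=%.0f half_up=%.0f\n",
                v, round_even(v), round_half_away(v), round_half_up(v));
  }
  // Prints:
  //   0.5 -> even=0 half_away=1 half_up=1
  //   2.5 -> even=2 half_away=3 half_up=3
  //  -1.5 -> even=-2 half_away=-2 half_up=-1
  return 0;
}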
- ScalarOpBehavior scalar_op_behavior; - FMAddOpBehavior fmadd_op_behavior; + ScalarOpBehavior scalar_op_behavior {}; + FMAddOpBehavior fmadd_op_behavior {}; + FloatToIntOutsideRangeBehavior float_to_int_behavior {}; { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); scalar_op_behavior = pc.scalar_op_behavior(); fmadd_op_behavior = pc.fmadd_op_behavior(); + float_to_int_behavior = pc.float_to_int_outside_range_behavior(); } bool valgrind_fma_bug = false; @@ -4843,8 +4885,12 @@ static ASMJIT_NOINLINE void test_simd_ops(JitContext& ctx) noexcept { test_vecop_vv>(ctx, Variation{v}); test_vecop_vv>(ctx, Variation{v}); test_vecop_vv>(ctx, Variation{v}); - test_vecop_vv>(ctx, Variation{v}); - test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); } else { test_vecop_vv>(ctx, Variation{v}); @@ -4853,8 +4899,12 @@ static ASMJIT_NOINLINE void test_simd_ops(JitContext& ctx) noexcept { test_vecop_vv>(ctx, Variation{v}); test_vecop_vv>(ctx, Variation{v}); test_vecop_vv>(ctx, Variation{v}); - test_vecop_vv>(ctx, Variation{v}); - test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); } test_vecop_vv>(ctx, Variation{v}); @@ -4863,8 +4913,12 @@ static ASMJIT_NOINLINE void test_simd_ops(JitContext& ctx) noexcept { test_vecop_vv>(ctx, Variation{v}); test_vecop_vv>(ctx, Variation{v}); test_vecop_vv>(ctx, Variation{v}); - test_vecop_vv>(ctx, Variation{v}); - test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); } } @@ -4908,12 +4962,24 @@ static ASMJIT_NOINLINE void test_simd_ops(JitContext& ctx) noexcept { test_vecop_vv(ctx, Variation{v}); test_vecop_vv(ctx, Variation{v}); - test_vecop_vv(ctx, Variation{v}); - test_vecop_vv(ctx, Variation{0}); - test_vecop_vv(ctx, Variation{0}); - test_vecop_vv(ctx, Variation{v}); - test_vecop_vv(ctx, Variation{0}); - test_vecop_vv(ctx, Variation{0}); + if (float_to_int_behavior == FloatToIntOutsideRangeBehavior::kSmallestValue) { + constexpr FloatToIntOutsideRangeBehavior behavior = FloatToIntOutsideRangeBehavior::kSmallestValue; + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{0}); + test_vecop_vv>(ctx, Variation{0}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{0}); + test_vecop_vv>(ctx, Variation{0}); + } + else { + constexpr FloatToIntOutsideRangeBehavior behavior = FloatToIntOutsideRangeBehavior::kSaturatedValue; + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{0}); + test_vecop_vv>(ctx, Variation{0}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{0}); + test_vecop_vv>(ctx, Variation{0}); + } } } @@ -5410,30 +5476,29 @@ static void test_gp_ops(JitContext& ctx) noexcept { } #if defined(ASMJIT_UJIT_X86) -static void dump_feature_list(asmjit::String& out, const asmjit::CpuFeatures& features) noexcept { +static void dump_feature_list(String& out, const CpuFeatures& features) noexcept { #if 
!defined(ASMJIT_NO_LOGGING) - asmjit::CpuFeatures::Iterator it = features.iterator(); + CpuFeatures::Iterator it = features.iterator(); bool first = true; while (it.has_next()) { size_t feature_id = it.next(); if (!first) { out.append(' '); } - asmjit::Formatter::format_feature(out, asmjit::Arch::kHost, uint32_t(feature_id)); + Formatter::format_feature(out, Arch::kHost, uint32_t(feature_id)); first = false; } #else - asmjit::Support::maybe_unused(features); + Support::maybe_unused(features); out.append(""); #endif } -static void test_x86_ops(JitContext& ctx, const asmjit::CpuFeatures& host_features) noexcept { - using Ext = asmjit::CpuFeatures::X86; - using CpuFeatures = asmjit::CpuFeatures; +static void test_x86_ops(JitContext& ctx, const CpuFeatures& host_features) noexcept { + using Ext = CpuFeatures::X86; { - asmjit::String s; + String s; dump_feature_list(s, host_features); INFO("Available CPU features: %s", s.data()); } @@ -5474,7 +5539,7 @@ static void test_x86_ops(JitContext& ctx, const asmjit::CpuFeatures& host_featur continue; } - asmjit::String s; + String s; if (filtered == host_features) { s.assign("[ALL]"); } @@ -5554,7 +5619,7 @@ static void test_x86_ops(JitContext& ctx, const asmjit::CpuFeatures& host_featur continue; } - asmjit::String s; + String s; if (filtered == host_features) { s.assign("[ALL]"); } @@ -5585,7 +5650,7 @@ static void test_x86_ops(JitContext& ctx, const asmjit::CpuFeatures& host_featur #endif // ASMJIT_UJIT_X86 #if defined(ASMJIT_UJIT_AARCH64) -static void test_a64_ops(JitContext& ctx, const asmjit::CpuFeatures& host_features) noexcept { +static void test_a64_ops(JitContext& ctx, const CpuFeatures& host_features) noexcept { ctx.features = host_features; test_gp_ops(ctx); diff --git a/test/asmjit_test_unicompiler_avx2fma.cpp b/testing/tests/asmjit_test_unicompiler_avx2fma.cpp similarity index 100% rename from test/asmjit_test_unicompiler_avx2fma.cpp rename to testing/tests/asmjit_test_unicompiler_avx2fma.cpp diff --git a/test/asmjit_test_unicompiler_sse2.cpp b/testing/tests/asmjit_test_unicompiler_sse2.cpp similarity index 90% rename from test/asmjit_test_unicompiler_sse2.cpp rename to testing/tests/asmjit_test_unicompiler_sse2.cpp index 7803713..fa49e60 100644 --- a/test/asmjit_test_unicompiler_sse2.cpp +++ b/testing/tests/asmjit_test_unicompiler_sse2.cpp @@ -35,6 +35,10 @@ float fdiv(float a, float b) noexcept { return _mm_cvtss_f32(_mm_div_ss(_mm_set1_ps(a), _mm_set1_ps(b))); } +float fsqrt(float a) noexcept { + return _mm_cvtss_f32(_mm_sqrt_ss(_mm_set1_ps(a))); +} + float fmadd_nofma_ref(float a, float b, float c) noexcept { return _mm_cvtss_f32(_mm_add_ss(_mm_mul_ss(_mm_set1_ps(a), _mm_set1_ps(b)), _mm_set1_ps(c))); } @@ -55,6 +59,10 @@ double fdiv(double a, double b) noexcept { return _mm_cvtsd_f64(_mm_div_sd(_mm_set1_pd(a), _mm_set1_pd(b))); } +double fsqrt(double a) noexcept { + return _mm_cvtsd_f64(_mm_sqrt_sd(_mm_setzero_pd(), _mm_set1_pd(a))); +} + double fmadd_nofma_ref(double a, double b, double c) noexcept { return _mm_cvtsd_f64(_mm_add_sd(_mm_mul_sd(_mm_set1_pd(a), _mm_set1_pd(b)), _mm_set1_pd(c))); } diff --git a/test/asmjit_test_x86_sections.cpp b/testing/tests/asmjit_test_x86_sections.cpp similarity index 100% rename from test/asmjit_test_x86_sections.cpp rename to testing/tests/asmjit_test_x86_sections.cpp diff --git a/test/broken.cpp b/testing/tests/broken.cpp similarity index 100% rename from test/broken.cpp rename to testing/tests/broken.cpp diff --git a/test/broken.h b/testing/tests/broken.h similarity index 100% rename from 
test/broken.h
rename to testing/tests/broken.h
diff --git a/tools/configure-makefiles.sh
deleted file mode 100755
index 69503dc..0000000
--- a/tools/configure-makefiles.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh
-
-CURRENT_DIR="`pwd`"
-BUILD_DIR="${CURRENT_DIR}/../build"
-BUILD_OPTIONS="-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DASMJIT_TEST=1"
-
-echo "== [Configuring Build - Debug] =="
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}/Debug" -DCMAKE_BUILD_TYPE=Debug ${BUILD_OPTIONS}
-echo ""
-
-echo "== [Configuring Build - Release] =="
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}/Release" -DCMAKE_BUILD_TYPE=Release ${BUILD_OPTIONS}
-echo ""
diff --git a/tools/configure-ninja.sh
deleted file mode 100755
index 84808d2..0000000
--- a/tools/configure-ninja.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh
-
-CURRENT_DIR="`pwd`"
-BUILD_DIR="${CURRENT_DIR}/../build"
-BUILD_OPTIONS="-G Ninja -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DASMJIT_TEST=1"
-
-echo "== [Configuring Build - Debug] =="
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}/Debug" -DCMAKE_BUILD_TYPE=Debug ${BUILD_OPTIONS}
-echo ""
-
-echo "== [Configuring Build - Release] =="
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}/Release" -DCMAKE_BUILD_TYPE=Release ${BUILD_OPTIONS}
-echo ""
diff --git a/tools/configure-sanitizers.sh
deleted file mode 100755
index eee697a..0000000
--- a/tools/configure-sanitizers.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/sh
-
-CURRENT_DIR="`pwd`"
-BUILD_DIR="${CURRENT_DIR}/../build"
-BUILD_OPTIONS="-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DASMJIT_TEST=1"
-
-echo "== [Configuring Build - Release_ASAN] =="
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}/Release_ASAN" ${BUILD_OPTIONS} -DCMAKE_BUILD_TYPE=Release -DASMJIT_SANITIZE=address
-echo ""
-
-echo "== [Configuring Build - Release_UBSAN] =="
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}/Release_UBSAN" ${BUILD_OPTIONS} -DCMAKE_BUILD_TYPE=Release -DASMJIT_SANITIZE=undefined
-echo ""
-
-echo "== [Configuring Build - Release_MSAN] =="
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}/Release_MSAN" ${BUILD_OPTIONS} -DCMAKE_BUILD_TYPE=Release -DASMJIT_SANITIZE=memory
-echo ""
-
-echo "== [Configuring Build - Debug_UBSAN] =="
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}/Debug_UBSAN" ${BUILD_OPTIONS} -DCMAKE_BUILD_TYPE=Debug -DASMJIT_SANITIZE=undefined
-echo ""
diff --git a/tools/configure-vs2019-x64.bat
deleted file mode 100644
index 05bc31e..0000000
--- a/tools/configure-vs2019-x64.bat
+++ /dev/null
@@ -1,2 +0,0 @@
-@echo off
-cmake .. -B "..\build_vs2019_x64" -G"Visual Studio 16" -A x64 -DASMJIT_TEST=1
diff --git a/tools/configure-vs2019-x86.bat
deleted file mode 100644
index a0e2663..0000000
--- a/tools/configure-vs2019-x86.bat
+++ /dev/null
@@ -1,2 +0,0 @@
-@echo off
-cmake .. -B "..\build_vs2019_x86" -G"Visual Studio 16" -A Win32 -DASMJIT_TEST=1
diff --git a/tools/configure-vs2022-x64.bat
deleted file mode 100644
index b33f541..0000000
--- a/tools/configure-vs2022-x64.bat
+++ /dev/null
@@ -1,2 +0,0 @@
-@echo off
-cmake .. -B "..\build_vs2022_x64" -G"Visual Studio 17" -A x64 -DASMJIT_TEST=1
diff --git a/tools/configure-vs2022-x86.bat
deleted file mode 100644
index 0ba3505..0000000
--- a/tools/configure-vs2022-x86.bat
+++ /dev/null
@@ -1,2 +0,0 @@
-@echo off
-cmake .. -B "..\build_vs2022_x86" -G"Visual Studio 17" -A Win32 -DASMJIT_TEST=1
diff --git a/tools/configure-xcode.sh
deleted file mode 100755
index d9c7d98..0000000
--- a/tools/configure-xcode.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/sh
-
-CURRENT_DIR="`pwd`"
-BUILD_DIR="${CURRENT_DIR}/../build"
-BUILD_OPTIONS="-G Xcode -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DASMJIT_TEST=1"
-
-mkdir -p "${BUILD_DIR}"
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}" ${BUILD_OPTIONS}
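With the tools/configure-* helpers removed, the same configurations can still be produced by invoking CMake directly with the options the deleted scripts used. A minimal shell sketch (paths and build-directory names are illustrative; the option set is taken verbatim from the removed scripts):

#!/bin/sh
# Hypothetical replacement for the removed configure-*.sh helpers; run from the
# repository root and adjust paths/generators as needed.
BUILD_OPTIONS="-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DASMJIT_TEST=1"

cmake . -B build/Debug   -DCMAKE_BUILD_TYPE=Debug   ${BUILD_OPTIONS}
cmake . -B build/Release -DCMAKE_BUILD_TYPE=Release ${BUILD_OPTIONS}

# Sanitizer builds follow the same pattern as the removed configure-sanitizers.sh:
cmake . -B build/Release_ASAN -DCMAKE_BUILD_TYPE=Release -DASMJIT_SANITIZE=address ${BUILD_OPTIONS}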