diff --git a/.github/workflows/build-config.json b/.github/workflows/build-config.json index 23a9686..473b1aa 100644 --- a/.github/workflows/build-config.json +++ b/.github/workflows/build-config.json @@ -12,7 +12,7 @@ ], "tests": [ - { "optional": true, "cmd": ["asmjit_test_unit", "--quick"] }, + { "optional": true, "cmd": ["asmjit_test_runner", "--quick"] }, { "optional": true, "cmd": ["asmjit_test_environment"] }, { "optional": true, "cmd": ["asmjit_test_assembler"] }, { "optional": true, "cmd": ["asmjit_test_assembler", "--validate"] }, diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1903172..9c41ef7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -18,10 +18,10 @@ jobs: steps: - name: "Checkout" - uses: actions/checkout@v4 + uses: actions/checkout@v5 - name: "Setup node.js" - uses: actions/setup-node@v4 + uses: actions/setup-node@v5 with: node-version: "*" @@ -104,22 +104,25 @@ jobs: - { title: "linux" , host: "ubuntu-24.04" , arch: "x64" , cc: "clang-19", conf: "Release", defs: "ASMJIT_TEST=1" } - { title: "linux" , host: "ubuntu-24.04-arm", arch: "arm64" , cc: "clang-19", conf: "Debug" , defs: "ASMJIT_TEST=1" } - { title: "linux" , host: "ubuntu-24.04-arm", arch: "arm64" , cc: "clang-19", conf: "Release", defs: "ASMJIT_TEST=1" } - - { title: "macos" , host: "macos-13" , arch: "x64" , cc: "gcc-14" , conf: "Debug" , defs: "ASMJIT_TEST=1" } - - { title: "macos" , host: "macos-13" , arch: "x64" , cc: "gcc-14" , conf: "Release", defs: "ASMJIT_TEST=1" } - - { title: "macos" , host: "macos-13" , arch: "x64" , cc: "clang" , conf: "Debug" , defs: "ASMJIT_TEST=1" } - - { title: "macos" , host: "macos-13" , arch: "x64" , cc: "clang" , conf: "Release", defs: "ASMJIT_TEST=1" } - - { title: "macos" , host: "macos-14" , arch: "arm64" , cc: "clang" , conf: "Debug" , defs: "ASMJIT_TEST=1" } - - { title: "macos" , host: "macos-14" , arch: "arm64" , cc: "clang" , conf: "Release", defs: "ASMJIT_TEST=1" } + - { title: "linux" , host: "ubuntu-24.04" , arch: "x86" , cc: "clang-20", conf: "Debug" , defs: "ASMJIT_TEST=1" } + - { title: "linux" , host: "ubuntu-24.04" , arch: "x86" , cc: "clang-20", conf: "Release", defs: "ASMJIT_TEST=1" } + - { title: "linux" , host: "ubuntu-24.04" , arch: "x64" , cc: "clang-20", conf: "Debug" , defs: "ASMJIT_TEST=1" } + - { title: "linux" , host: "ubuntu-24.04" , arch: "x64" , cc: "clang-20", conf: "Release", defs: "ASMJIT_TEST=1" } + - { title: "linux" , host: "ubuntu-24.04-arm", arch: "arm64" , cc: "clang-20", conf: "Debug" , defs: "ASMJIT_TEST=1" } + - { title: "linux" , host: "ubuntu-24.04-arm", arch: "arm64" , cc: "clang-20", conf: "Release", defs: "ASMJIT_TEST=1" } + - { title: "macos" , host: "macos-15-intel" , arch: "x64" , cc: "gcc-14" , conf: "Debug" , defs: "ASMJIT_TEST=1" } + - { title: "macos" , host: "macos-15-intel" , arch: "x64" , cc: "gcc-14" , conf: "Release", defs: "ASMJIT_TEST=1" } + - { title: "macos" , host: "macos-15-intel" , arch: "x64" , cc: "clang" , conf: "Debug" , defs: "ASMJIT_TEST=1" } + - { title: "macos" , host: "macos-15-intel" , arch: "x64" , cc: "clang" , conf: "Release", defs: "ASMJIT_TEST=1" } + - { title: "macos" , host: "macos-15" , arch: "arm64" , cc: "clang" , conf: "Debug" , defs: "ASMJIT_TEST=1" } + - { title: "macos" , host: "macos-15" , arch: "arm64" , cc: "clang" , conf: "Release", defs: "ASMJIT_TEST=1" } - { title: "windows" , host: "windows-2022" , arch: "x86" , cc: "vs2022" , conf: "Debug" , defs: "ASMJIT_TEST=1" } - { title: "windows" , host: "windows-2022" , arch: 
"x86" , cc: "vs2022" , conf: "Release", defs: "ASMJIT_TEST=1" } - { title: "windows" , host: "windows-2022" , arch: "x64" , cc: "vs2022" , conf: "Debug" , defs: "ASMJIT_TEST=1" } - { title: "windows" , host: "windows-2022" , arch: "x64" , cc: "vs2022" , conf: "Release", defs: "ASMJIT_TEST=1" } - { title: "windows" , host: "windows-11-arm" , arch: "arm64" , cc: "vs2022" , conf: "Debug" , defs: "ASMJIT_TEST=1" } - { title: "windows" , host: "windows-11-arm" , arch: "arm64" , cc: "vs2022" , conf: "Release", defs: "ASMJIT_TEST=1" } - - # Cross compiled, cannot run tests (Windows/UWP). - { title: "windows/uwp" , host: "windows-2022" , arch: "x64" , cc: "vs2022" , conf: "Release", defs: "ASMJIT_TEST=0,CMAKE_SYSTEM_NAME=WindowsStore,CMAKE_SYSTEM_VERSION=10.0,CMAKE_CXX_FLAGS=-D_WIN32_WINNT=0x0A00" } - - { title: "freebsd" , host: "ubuntu-latest" , arch: "x64" , cc: "clang" , conf: "Release", vm: "freebsd", vm_ver: "14.2", defs: "ASMJIT_TEST=1" } - { title: "freebsd" , host: "ubuntu-latest" , arch: "arm64" , cc: "clang" , conf: "Release", vm: "freebsd", vm_ver: "14.2", defs: "ASMJIT_TEST=1" } - { title: "netbsd" , host: "ubuntu-latest" , arch: "x64" , cc: "clang" , conf: "Release", vm: "netbsd" , vm_ver: "10.1", defs: "ASMJIT_TEST=1" } @@ -135,18 +138,18 @@ jobs: steps: - name: "Checkout" - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: path: "source" - name: "Checkout Build Actions" - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: repository: build-actions/build-actions path: "build-actions" - name: "Python" - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: "3.x" diff --git a/CMakeLists.txt b/CMakeLists.txt index d948218..064b255 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,10 +1,18 @@ -cmake_minimum_required(VERSION 3.19 FATAL_ERROR) +# AsmJit +# ====== + +# To consume asmjit as a dependency, use asmjit::asmjit alias. + +cmake_minimum_required(VERSION 3.24 FATAL_ERROR) # Don't create a project if it was already created by another CMakeLists.txt. This makes # it possible to support both add_subdirectory() and include() ways of using AsmJit as a # dependency. if (NOT CMAKE_PROJECT_NAME OR "${CMAKE_PROJECT_NAME}" STREQUAL "asmjit") - project(asmjit CXX) + project(asmjit + LANGUAGES CXX + DESCRIPTION "Low-latency machine code generation" + HOMEPAGE_URL "https://asmjit.com") endif() include(CheckCXXCompilerFlag) @@ -194,7 +202,7 @@ function(asmjit_detect_sanitizers out) set(${out} "${_out_array}" PARENT_SCOPE) endfunction() -function(asmjit_add_target target target_type) +function(asmjit_addapp target target_type) set(single_val "") set(multi_val SOURCES LIBRARIES CFLAGS CFLAGS_DBG CFLAGS_REL) cmake_parse_arguments("X" "" "${single_val}" "${multi_val}" ${ARGN}) @@ -227,47 +235,41 @@ set(ASMJIT_INCLUDE_DIR "${ASMJIT_INCLUDE_DIRS}") if (NOT ASMJIT_NO_CUSTOM_FLAGS) if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC" OR "x${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "xMSVC") - list(APPEND ASMJIT_PRIVATE_CFLAGS - -MP # [+] Multi-Process Compilation. - -GF # [+] Eliminate duplicate strings. - -Zc:__cplusplus # [+] Conforming __cplusplus definition. - -Zc:inline # [+] Remove unreferenced COMDAT. - -Zc:strictStrings # [+] Strict const qualification of string literals. - -Zc:threadSafeInit- # [-] Thread-safe statics. - -W4) # [+] Warning level 4. + list(APPEND ASMJIT_PRIVATE_CFLAGS -W4) # [+] Warning level 4. - list(APPEND ASMJIT_PRIVATE_CFLAGS_DBG - -GS) # [+] Buffer security-check. 
+ list(APPEND ASMJIT_PRIVATE_CFLAGS -MP) # [+] Multi-Process Compilation. + list(APPEND ASMJIT_PRIVATE_CFLAGS -GF) # [+] Eliminate duplicate strings. + list(APPEND ASMJIT_PRIVATE_CFLAGS -Zc:__cplusplus) # [+] Conforming __cplusplus definition. + list(APPEND ASMJIT_PRIVATE_CFLAGS -Zc:inline) # [+] Remove unreferenced COMDAT. + list(APPEND ASMJIT_PRIVATE_CFLAGS -Zc:strictStrings) # [+] Strict const qualification of string literals. + list(APPEND ASMJIT_PRIVATE_CFLAGS -Zc:threadSafeInit-) # [-] Thread-safe statics. - list(APPEND ASMJIT_PRIVATE_CFLAGS_REL - -GS- # [-] Buffer security-check. - -O2 # [+] Favor speed over size. - -Oi) # [+] Generate intrinsic functions. - elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "^(GNU|Clang|AppleClang)$") - list(APPEND ASMJIT_PRIVATE_CFLAGS -Wall -Wextra -Wconversion) - list(APPEND ASMJIT_PRIVATE_CFLAGS -fno-math-errno) - list(APPEND ASMJIT_PRIVATE_CFLAGS_REL -O2) + list(APPEND ASMJIT_PRIVATE_CFLAGS_DBG -GS) # [+] Buffer security-check. + list(APPEND ASMJIT_PRIVATE_CFLAGS_REL -GS-) # [-] Buffer security-check. + list(APPEND ASMJIT_PRIVATE_CFLAGS_REL -O2) # [+] Favor speed over size. + list(APPEND ASMJIT_PRIVATE_CFLAGS_REL -Oi) # [+] Generate intrinsic functions. + elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU|Clang") + list(APPEND ASMJIT_PRIVATE_CFLAGS -Wall -Wextra -Wconversion) # [+] Add baseline warnings that can be used safely even with system headers. + asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -Wdouble-promotion) # [+] Warn about double promotions. + asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -Wduplicated-cond) # [+] Warn about duplicate conditions. + asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -Wduplicated-branches) # [+] Warn about duplicate branches. + asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -Wlogical-op) # [+] Warn about suspicious uses of logical operators in expressions. + asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -Wlogical-not-parentheses) # [+] Warn about logical not used on the left hand side operand of a comparison. + asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -Wrestrict) - # We would like also '-Wzero-as-null-pointer-constant' but it would warn when it comes to system headers. - asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS - -Wdouble-promotion - -Wduplicated-cond - -Wduplicated-branches - -Wlogical-op - -Wrestrict - ) + list(APPEND ASMJIT_PRIVATE_CFLAGS -fno-math-errno) # [-] Disable math functions setting errno (performance reasons). + list(APPEND ASMJIT_PRIVATE_CFLAGS -fno-threadsafe-statics) # [-] Don't add guards when initializing statics (we don't need it). + list(APPEND ASMJIT_PRIVATE_CFLAGS_REL -O2) # [+] Compiling with -O2 in release mode is what we generally want. + list(APPEND ASMJIT_PRIVATE_CFLAGS_REL -fmerge-all-constants) # [+] We don't need unique address per constant (merging improves library size). # -fno-semantic-interposition is not available on apple - the compiler issues a warning, which is not detected. - if (APPLE) - asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -fno-threadsafe-statics) - else() - asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -fno-threadsafe-statics -fno-semantic-interposition) + if (NOT APPLE) + asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS -fno-semantic-interposition) endif() - # The following flags can save few bytes in the resulting binary. - asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS_REL - -fmerge-all-constants # Merge all constants even if it violates ISO C++. - -fno-enforce-eh-specs) # Don't enforce termination if noexcept function throws. 
+ if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "iOS") + asmjit_detect_cflags(ASMJIT_PRIVATE_CFLAGS_REL -fno-enforce-eh-specs) # [-] Don't enforce termination if noexcept function throws. + endif() endif() endif() @@ -547,9 +549,10 @@ set(ASMJIT_SRC_LIST asmjit/x86/x86rapass_p.h asmjit/ujit/ujitbase.h + asmjit/ujit/unicompiler.h asmjit/ujit/unicompiler_a64.cpp asmjit/ujit/unicompiler_x86.cpp - asmjit/ujit/unicompiler.h + asmjit/ujit/unicompiler_utils_p.h asmjit/ujit/uniop.h asmjit/ujit/vecconsttable.cpp asmjit/ujit/vecconsttable.h @@ -592,12 +595,12 @@ message(" ASMJIT_PRIVATE_CFLAGS_REL=${ASMJIT_PRIVATE_CFLAGS_REL}") if (NOT ASMJIT_EMBED) # Add AsmJit target. - asmjit_add_target(asmjit "${ASMJIT_TARGET_TYPE}" - SOURCES ${ASMJIT_SRC} - LIBRARIES ${ASMJIT_DEPS} - CFLAGS ${ASMJIT_PRIVATE_CFLAGS} - CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} - CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + asmjit_addapp(asmjit "${ASMJIT_TARGET_TYPE}" + SOURCES ${ASMJIT_SRC} + LIBRARIES ${ASMJIT_DEPS} + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) target_compile_options(asmjit INTERFACE ${ASMJIT_CFLAGS}) target_include_directories(asmjit BEFORE INTERFACE @@ -632,60 +635,46 @@ if (NOT ASMJIT_EMBED) enable_testing() # Special target that always uses embedded AsmJit. - asmjit_add_target(asmjit_test_unit TEST - SOURCES ${ASMJIT_SRC} - test/asmjit_test_unit.cpp - test/broken.cpp - test/broken.h - LIBRARIES ${ASMJIT_DEPS} - CFLAGS ${ASMJIT_PRIVATE_CFLAGS} - -DASMJIT_TEST - -DASMJIT_STATIC - CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} - CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) - target_include_directories(asmjit_test_unit BEFORE PRIVATE ${ASMJIT_INCLUDE_DIRS}) + asmjit_addapp(asmjit_test_runner TEST + SOURCES ${ASMJIT_SRC} + testing/tests/asmjit_test_runner.cpp + testing/tests/broken.cpp + testing/tests/broken.h + LIBRARIES ${ASMJIT_DEPS} + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} + -DASMJIT_TEST + -DASMJIT_STATIC + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + target_include_directories(asmjit_test_runner BEFORE PRIVATE ${ASMJIT_INCLUDE_DIRS}) - asmjit_add_target(asmjit_test_assembler TEST - SOURCES test/asmjit_test_assembler.cpp - test/asmjit_test_assembler.h - test/asmjit_test_assembler_a64.cpp - test/asmjit_test_assembler_x64.cpp - test/asmjit_test_assembler_x86.cpp - LIBRARIES asmjit::asmjit - CFLAGS ${ASMJIT_PRIVATE_CFLAGS} - CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} - CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + asmjit_addapp(asmjit_test_assembler TEST + SOURCES testing/tests/asmjit_test_assembler.cpp + testing/tests/asmjit_test_assembler.h + testing/tests/asmjit_test_assembler_a64.cpp + testing/tests/asmjit_test_assembler_x64.cpp + testing/tests/asmjit_test_assembler_x86.cpp + LIBRARIES asmjit::asmjit + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) - asmjit_add_target(asmjit_bench_codegen EXECUTABLE - SOURCES test/asmjit_bench_codegen.cpp - test/asmjit_bench_codegen_a64.cpp - test/asmjit_bench_codegen_x86.cpp - SOURCES test/asmjit_bench_codegen.h - LIBRARIES asmjit::asmjit - CFLAGS ${ASMJIT_PRIVATE_CFLAGS} - CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} - CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) - - foreach(_target asmjit_bench_overhead - asmjit_bench_regalloc - asmjit_test_environment - asmjit_test_emitters - asmjit_test_x86_sections) - asmjit_add_target(${_target} TEST - SOURCES test/${_target}.cpp - LIBRARIES asmjit::asmjit - CFLAGS ${ASMJIT_PRIVATE_CFLAGS} - 
CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} - CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + foreach(app asmjit_test_environment asmjit_test_emitters asmjit_test_x86_sections) + asmjit_addapp(${app} TEST + SOURCES testing/tests/${app}.cpp + LIBRARIES asmjit::asmjit + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) endforeach() if (NOT ASMJIT_NO_INTROSPECTION) - asmjit_add_target(asmjit_test_instinfo TEST - SOURCES test/asmjit_test_instinfo.cpp - LIBRARIES asmjit::asmjit - CFLAGS ${ASMJIT_PRIVATE_CFLAGS} - CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} - CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + asmjit_addapp(asmjit_test_instinfo TEST + SOURCES testing/tests/asmjit_test_instinfo.cpp + LIBRARIES asmjit::asmjit + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) endif() if (NOT (ASMJIT_NO_BUILDER OR ASMJIT_NO_COMPILER)) @@ -727,28 +716,48 @@ if (NOT ASMJIT_EMBED) endif() endif() - set_property(SOURCE test/asmjit_test_unicompiler_avx2fma.cpp APPEND PROPERTY COMPILE_OPTIONS ${ASMJIT_AVX2FMA_CFLAGS}) - - asmjit_add_target(asmjit_test_compiler TEST - SOURCES test/asmjit_test_compiler.cpp - test/asmjit_test_compiler.h - test/asmjit_test_compiler_a64.cpp - test/asmjit_test_compiler_x86.cpp - LIBRARIES asmjit::asmjit - CFLAGS ${ASMJIT_PRIVATE_CFLAGS} ${ASMJIT_SSE2_CFLAGS} - CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} - CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) - - asmjit_add_target(asmjit_test_unicompiler TEST - SOURCES test/asmjit_test_unicompiler.cpp - test/asmjit_test_unicompiler_sse2.cpp - test/asmjit_test_unicompiler_avx2fma.cpp - test/broken.cpp - LIBRARIES asmjit::asmjit - CFLAGS ${ASMJIT_PRIVATE_CFLAGS} ${ASMJIT_SSE2_CFLAGS} - CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} - CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + asmjit_addapp(asmjit_test_compiler TEST + SOURCES testing/tests/asmjit_test_compiler.cpp + testing/tests/asmjit_test_compiler.h + testing/tests/asmjit_test_compiler_a64.cpp + testing/tests/asmjit_test_compiler_x86.cpp + LIBRARIES asmjit::asmjit + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} ${ASMJIT_SSE2_CFLAGS} + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) endif() + if (NOT ASMJIT_NO_UJIT) + asmjit_addapp(asmjit_test_unicompiler TEST + SOURCES testing/tests/asmjit_test_unicompiler.cpp + testing/tests/asmjit_test_unicompiler_sse2.cpp + testing/tests/asmjit_test_unicompiler_avx2fma.cpp + testing/tests/broken.cpp + LIBRARIES asmjit::asmjit + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} ${ASMJIT_SSE2_CFLAGS} + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + set_property(SOURCE testing/tests/asmjit_test_unicompiler_avx2fma.cpp APPEND PROPERTY COMPILE_OPTIONS ${ASMJIT_AVX2FMA_CFLAGS}) + endif() + + asmjit_addapp(asmjit_bench_codegen EXECUTABLE + SOURCES testing/bench/asmjit_bench_codegen.cpp + testing/bench/asmjit_bench_codegen_a64.cpp + testing/bench/asmjit_bench_codegen_x86.cpp + SOURCES testing/bench/asmjit_bench_codegen.h + LIBRARIES asmjit::asmjit + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + + foreach(app asmjit_bench_overhead asmjit_bench_regalloc) + asmjit_addapp(${app} TEST + SOURCES testing/bench/${app}.cpp + LIBRARIES asmjit::asmjit + CFLAGS ${ASMJIT_PRIVATE_CFLAGS} + CFLAGS_DBG ${ASMJIT_PRIVATE_CFLAGS_DBG} + CFLAGS_REL ${ASMJIT_PRIVATE_CFLAGS_REL}) + endforeach() + endif() endif() diff --git a/LICENSE.md b/LICENSE.md index e01395c..7818b21 
100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,4 +1,4 @@ -Copyright (c) 2008-2025 The AsmJit Authors +Copyright (c) 2008-2025 Petr Kobalicek This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages diff --git a/README.md b/README.md index 805930d..4f416b3 100644 --- a/README.md +++ b/README.md @@ -17,10 +17,12 @@ Project Organization * **src** - Source code * **asmjit** - Source code and headers (always point include path in here) * **core** - Core API, backend independent except relocations - * **arm** - ARM specific API, used only by ARM and AArch64 backends + * **arm** - ARM specific API, designed to be common for both AArch32 and AArch64 + * **a64** - AArch64 specific API, used only by AArch64 backends * **x86** - X86 specific API, used only by X86 and X64 backends + * **ujit** - Universal JIT API * **test** - Unit and integration tests (don't embed in your project) - * **tools** - Tools used for configuring, documenting, and generating files + * **tools** - Tools used to regenerate generated files (instruction DB, enum strings) Roadmap ------- @@ -38,6 +40,11 @@ Documentation * [Documentation Index](https://asmjit.com/doc/index.html) * [Build Instructions](https://asmjit.com/doc/group__asmjit__build.html) (includes [CMake Integration](https://asmjit.com/doc/group__asmjit__build.html#cmake_integration)) +Development & Testing +--------------------- + + * Basic configure scripts that invoke cmake are provided in the project root. + Breaking Changes ---------------- diff --git a/configure.sh b/configure.sh new file mode 100755 index 0000000..659d6dc --- /dev/null +++ b/configure.sh @@ -0,0 +1,11 @@ +#!/bin/sh + +BUILD_OPTIONS="-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DASMJIT_TEST=1" + +echo "== [Configuring Build - Debug] ==" +eval cmake . -B build/Debug -DCMAKE_BUILD_TYPE=Debug ${BUILD_OPTIONS} "$@" +echo "" + +echo "== [Configuring Build - Release] ==" +eval cmake . -B build/Release -DCMAKE_BUILD_TYPE=Release ${BUILD_OPTIONS} "$@" +echo "" diff --git a/configure_sanitizers.sh b/configure_sanitizers.sh new file mode 100755 index 0000000..2634bcd --- /dev/null +++ b/configure_sanitizers.sh @@ -0,0 +1,15 @@ +#!/bin/sh + +BUILD_OPTIONS="-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DASMJIT_TEST=1" + +echo "== [Configuring Build - Release_ASAN] ==" +eval cmake . -B build/Release_ASAN ${BUILD_OPTIONS} -DCMAKE_BUILD_TYPE=Release -DASMJIT_SANITIZE=address "$@" +echo "" + +echo "== [Configuring Build - Release_MSAN] ==" +eval cmake . -B build/Release_MSAN ${BUILD_OPTIONS} -DCMAKE_BUILD_TYPE=Release -DASMJIT_SANITIZE=memory "$@" +echo "" + +echo "== [Configuring Build - Release_UBSAN] ==" +eval cmake . -B build/Release_UBSAN ${BUILD_OPTIONS} -DCMAKE_BUILD_TYPE=Release -DASMJIT_SANITIZE=undefined "$@" +echo "" diff --git a/configure_vs2022_x64.bat b/configure_vs2022_x64.bat new file mode 100644 index 0000000..9bbadd7 --- /dev/null +++ b/configure_vs2022_x64.bat @@ -0,0 +1,2 @@ +@echo off +cmake . -B build_x64 -G"Visual Studio 17" -A x64 -DASMJIT_TEST=ON diff --git a/configure_vs2022_x86.bat b/configure_vs2022_x86.bat new file mode 100644 index 0000000..8c123a0 --- /dev/null +++ b/configure_vs2022_x86.bat @@ -0,0 +1,2 @@ +@echo off +cmake .
-B build_x86 -G"Visual Studio 17" -A Win32 -DASMJIT_TEST=ON diff --git a/db/isa_x86.json b/db/isa_x86.json index 657ec7c..b4bdaa8 100644 --- a/db/isa_x86.json +++ b/db/isa_x86.json @@ -3633,22 +3633,22 @@ {"apx": "and{nf} W:r8, R:r8/m8, imm8" , "op": "[VM ] EVEX.ND=1.LLZ.NP.MAP4.WIG 80 /4 ib" , "io": "OF=0 SF=W ZF=W AF=U PF=W CF=0"}, {"apx": "and{nf} W:rv, R:rv/mv, imms8" , "op": "[VM ] EVEX.ND=1.LLZ.Pv.MAP4.Wv 83 /4 ib" , "io": "OF=0 SF=W ZF=W AF=U PF=W CF=0"}, {"apx": "and{nf} W:rv, R:rv/mv, immv" , "op": "[VM ] EVEX.ND=1.LLZ.Pv.MAP4.Wv 81 /4 iv" , "io": "OF=0 SF=W ZF=W AF=U PF=W CF=0"}, - {"apx": "cmovb W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, - {"apx": "cmovbe W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, - {"apx": "cmovl W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, - {"apx": "cmovle W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, - {"apx": "cmovnb W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, - {"apx": "cmovnbe W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, - {"apx": "cmovnl W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, - {"apx": "cmovnle W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, - {"apx": "cmovno W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, - {"apx": "cmovnp W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, - {"apx": "cmovns W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, - {"apx": "cmovnz W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 45 /r" , "io": "ZF=R"}, - {"apx": "cmovo W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 40 /r" , "io": "OF=R"}, - {"apx": "cmovp W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, - {"apx": "cmovs W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, - {"apx": "cmovz W:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, + {"apx": "cmovb X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, + {"apx": "cmovbe X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, + {"apx": "cmovl X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, + {"apx": "cmovle X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, + {"apx": "cmovnb X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, + {"apx": "cmovnbe X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, + {"apx": "cmovnl X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, + {"apx": "cmovnle X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, + {"apx": "cmovno X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, + {"apx": "cmovnp X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, + {"apx": "cmovns X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, + {"apx": "cmovnz X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 45 /r" , "io": 
"ZF=R"}, + {"apx": "cmovo X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 40 /r" , "io": "OF=R"}, + {"apx": "cmovp X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, + {"apx": "cmovs X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, + {"apx": "cmovz X:rv, R:rv, R:rv/mv" , "op": "[VRM] EVEX.ND=1.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, {"any": "crc32 X:r32, R:r8/m8" , "op": "[RM ] EVEX.ND=0.LLZ.NP.MAP4.W0 F0 /r"}, {"any": "crc32 X:r64, R:r8/m8" , "op": "[RM ] EVEX.ND=0.LLZ.NP.MAP4.W1 F0 /r"}, {"any": "crc32 X:r32, R:r16/m16" , "op": "[RM ] EVEX.ND=0.LLZ.66.MAP4.W0 F1 /r"}, @@ -3970,69 +3970,69 @@ {"x64": "ccmpz dfv, R:r8/m8, imm8" , "op": "[M ] EVEX.ND=0.SCC=4.LLZ.NP.MAP4.WIG 80 /7 ib" }, {"x64": "ccmpz dfv, R:rv/mv, imms8" , "op": "[M ] EVEX.ND=0.SCC=4.LLZ.Pv.MAP4.Wv 83 /7 ib" }, {"x64": "ccmpz dfv, R:rv/mv, immv" , "op": "[M ] EVEX.ND=0.SCC=4.LLZ.Pv.MAP4.Wv 81 /7 iv" }, - {"x64": "cfcmovb W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, - {"x64": "cfcmovb W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, - {"x64": "cfcmovb W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, + {"x64": "cfcmovb X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, + {"x64": "cfcmovb X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, + {"x64": "cfcmovb X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, {"x64": "cfcmovb W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 42 /r" , "io": "CF=R"}, - {"x64": "cfcmovbe W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, - {"x64": "cfcmovbe W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, - {"x64": "cfcmovbe W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, + {"x64": "cfcmovbe X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, + {"x64": "cfcmovbe X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, + {"x64": "cfcmovbe X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, {"x64": "cfcmovbe W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 46 /r" , "io": "CF=R ZF=R"}, - {"x64": "cfcmovl W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, - {"x64": "cfcmovl W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, - {"x64": "cfcmovl W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, + {"x64": "cfcmovl X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, + {"x64": "cfcmovl X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, + {"x64": "cfcmovl X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, {"x64": "cfcmovl W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 4C /r" , "io": "SF=R OF=R"}, - {"x64": "cfcmovle W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, - {"x64": "cfcmovle W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, - {"x64": "cfcmovle W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, + {"x64": "cfcmovle X:rv, 
R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, + {"x64": "cfcmovle X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, + {"x64": "cfcmovle X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, {"x64": "cfcmovle W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 4E /r" , "io": "ZF=R SF=R OF=R"}, - {"x64": "cfcmovnb W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, - {"x64": "cfcmovnb W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, - {"x64": "cfcmovnb W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, + {"x64": "cfcmovnb X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, + {"x64": "cfcmovnb X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, + {"x64": "cfcmovnb X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, {"x64": "cfcmovnb W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 43 /r" , "io": "CF=R"}, - {"x64": "cfcmovnbe W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, - {"x64": "cfcmovnbe W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, - {"x64": "cfcmovnbe W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, + {"x64": "cfcmovnbe X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, + {"x64": "cfcmovnbe X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, + {"x64": "cfcmovnbe X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, {"x64": "cfcmovnbe W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 47 /r" , "io": "CF=R ZF=R"}, - {"x64": "cfcmovnl W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, - {"x64": "cfcmovnl W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, - {"x64": "cfcmovnl W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, + {"x64": "cfcmovnl X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, + {"x64": "cfcmovnl X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, + {"x64": "cfcmovnl X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, {"x64": "cfcmovnl W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 4D /r" , "io": "SF=R OF=R"}, - {"x64": "cfcmovnle W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, - {"x64": "cfcmovnle W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, - {"x64": "cfcmovnle W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, + {"x64": "cfcmovnle X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, + {"x64": "cfcmovnle X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, + {"x64": "cfcmovnle X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, {"x64": "cfcmovnle W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 4F /r" , "io": "ZF=R SF=R OF=R"}, - {"x64": "cfcmovno W:rv, R:rv/mv" , "op": "[RM ] 
EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, - {"x64": "cfcmovno W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, - {"x64": "cfcmovno W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, + {"x64": "cfcmovno X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, + {"x64": "cfcmovno X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, + {"x64": "cfcmovno X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, {"x64": "cfcmovno W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 41 /r" , "io": "OF=R"}, - {"x64": "cfcmovnp W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, - {"x64": "cfcmovnp W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, - {"x64": "cfcmovnp W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, + {"x64": "cfcmovnp X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, + {"x64": "cfcmovnp X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, + {"x64": "cfcmovnp X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, {"x64": "cfcmovnp W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 4B /r" , "io": "PF=R"}, - {"x64": "cfcmovns W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, - {"x64": "cfcmovns W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, - {"x64": "cfcmovns W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, + {"x64": "cfcmovns X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, + {"x64": "cfcmovns X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, + {"x64": "cfcmovns X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, {"x64": "cfcmovns W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 49 /r" , "io": "SF=R"}, - {"x64": "cfcmovnz W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 45 /r" , "io": "ZF=R"}, - {"x64": "cfcmovnz W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 45 /r" , "io": "ZF=R"}, - {"x64": "cfcmovnz W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 45 /r" , "io": "ZF=R"}, + {"x64": "cfcmovnz X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 45 /r" , "io": "ZF=R"}, + {"x64": "cfcmovnz X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 45 /r" , "io": "ZF=R"}, + {"x64": "cfcmovnz X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 45 /r" , "io": "ZF=R"}, {"x64": "cfcmovnz W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 45 /r" , "io": "ZF=R"}, - {"x64": "cfcmovo W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 40 /r" , "io": "OF=R"}, - {"x64": "cfcmovo W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 40 /r" , "io": "OF=R"}, - {"x64": "cfcmovo W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 40 /r" , "io": "OF=R"}, + {"x64": "cfcmovo X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 40 /r" , "io": "OF=R"}, + {"x64": "cfcmovo X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 40 /r" , "io": "OF=R"}, + {"x64": "cfcmovo X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 40 /r" , "io": "OF=R"}, {"x64": "cfcmovo W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 40 /r" , "io": 
"OF=R"}, - {"x64": "cfcmovp W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, - {"x64": "cfcmovp W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, - {"x64": "cfcmovp W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, + {"x64": "cfcmovp X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, + {"x64": "cfcmovp X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, + {"x64": "cfcmovp X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, {"x64": "cfcmovp W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 4A /r" , "io": "PF=R"}, - {"x64": "cfcmovs W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, - {"x64": "cfcmovs W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, - {"x64": "cfcmovs W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, + {"x64": "cfcmovs X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, + {"x64": "cfcmovs X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, + {"x64": "cfcmovs X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, {"x64": "cfcmovs W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 48 /r" , "io": "SF=R"}, - {"x64": "cfcmovz W:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, - {"x64": "cfcmovz W:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, - {"x64": "cfcmovz W?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, + {"x64": "cfcmovz X:rv, R:rv/mv" , "op": "[RM ] EVEX.ND=0.NF=0.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, + {"x64": "cfcmovz X:rv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, + {"x64": "cfcmovz X?:mv, R:rv" , "op": "[MR ] EVEX.ND=0.NF=1.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, {"x64": "cfcmovz W:rv, R:rv, R?:rv/mv" , "op": "[VRM] EVEX.ND=1.NF=1.LLZ.Pv.MAP4.Wv 44 /r" , "io": "ZF=R"}, {"x64": "ctestb dfv, R:r8/m8, R:r8" , "op": "[MR ] EVEX.ND=0.SCC=2.LLZ.NP.MAP4.WIG 84 /r" }, {"x64": "ctestb dfv, R:rv/mv, R:rv" , "op": "[MR ] EVEX.ND=0.SCC=2.LLZ.Pv.MAP4.Wv 85 /r" }, diff --git a/src/asmjit/arm/a64assembler.cpp b/src/asmjit/arm/a64assembler.cpp index 990a395..82af1f7 100644 --- a/src/asmjit/arm/a64assembler.cpp +++ b/src/asmjit/arm/a64assembler.cpp @@ -4491,14 +4491,14 @@ Case_BaseLdurStur: goto InvalidInstruction; uint32_t x = o0.as().is_gp64(); - uint32_t gpMustBeX = uint32_t(size_op.size() >= 3u - op_data.is_signed); + uint32_t gp_must_be_x = uint32_t(size_op.size() >= 3u - op_data.is_signed); if (op_data.is_signed) { - if (gpMustBeX && !x) + if (gp_must_be_x && !x) goto InvalidInstruction; } else { - if (x != gpMustBeX) + if (x != gp_must_be_x) goto InvalidInstruction; } diff --git a/src/asmjit/arm/a64compiler.h b/src/asmjit/arm/a64compiler.h index 17f82b4..2d09c5b 100644 --- a/src/asmjit/arm/a64compiler.h +++ b/src/asmjit/arm/a64compiler.h @@ -37,75 +37,87 @@ public: //! \name Virtual Registers //! \{ - //! \cond INTERNAL - template - ASMJIT_INLINE_NODEBUG RegT _new_reg_internal(const Type& type) { - RegT reg(Globals::NoInit); - _new_reg(Out{reg}, type, nullptr); - return reg; - } + //! Creates a new general-purpose register with `type_id` type and optional name passed via `args`. + //! + //! \note Using \ref TypeId is too generic. 
In general it's recommended to use \ref new_gp32(), + //! \ref new_gp64(), and \ref new_gpz() or \ref new_gp_ptr(). + template + ASMJIT_INLINE_NODEBUG Gp new_gp(TypeId type_id, Args&&... args) { return new_reg(type_id, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG RegT _new_reg_internal(const Type& type, const char* s) { -#ifndef ASMJIT_NO_LOGGING - RegT reg(Globals::NoInit); - _new_reg(Out{reg}, type, s); - return reg; -#else - Support::maybe_unused(s); - return _new_reg_internal(type); -#endif - } + //! Creates a new vector register with `type_id` type and optional name passed via `args`. + //! + //! \note Using \ref TypeId is too generic. In general it's recommended to use \ref new_vec128(), + //! \ref new_vec_s(), \ref new_vec_d(), \ref new_vec_q(), ... + template + ASMJIT_INLINE_NODEBUG Vec new_vec(TypeId type_id, Args&&... args) { return new_reg(type_id, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG RegT _new_reg_internal(const Type& type, const char* s, Args&&... args) { -#ifndef ASMJIT_NO_LOGGING - RegT reg(Globals::NoInit); - _new_reg_fmt(Out{reg}, type, s, std::forward(args)...); - return reg; -#else - Support::maybe_unused(s, std::forward(args)...); - return _new_reg_internal(type); -#endif - } - //! \endcond + //! Creates a new 32-bit general purpose register mapped to low 32 bits of a full register (on 64-bit targets). + template + ASMJIT_INLINE_NODEBUG Gp new_gp32(Args&&... args) { return new_reg(TypeId::kUInt32, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG RegT new_similar_reg(const RegT& ref, Args&&... args) { - return _new_reg_internal(ref, std::forward(args)...); - } + //! Creates a new 64-bit general purpose register. + template + ASMJIT_INLINE_NODEBUG Gp new_gp64(Args&&... args) { return new_reg(TypeId::kUInt64, std::forward(args)...); } + //! Creates a new 32-bit general purpose register. + //! + //! \note This is a convenience function alias of \ref new_gp32(). template - ASMJIT_INLINE_NODEBUG Reg new_reg(TypeId type_id, Args&&... args) { return _new_reg_internal(type_id, std::forward(args)...); } + ASMJIT_INLINE_NODEBUG Gp new_gpw(Args&&... args) { return new_reg(TypeId::kUIntPtr, std::forward(args)...); } + //! Creates a new 64-bit general purpose register. + //! + //! \note This is a convenience function alias of \ref new_gp64(). template - ASMJIT_INLINE_NODEBUG Gp new_gp(TypeId type_id, Args&&... args) { return _new_reg_internal(type_id, std::forward(args)...); } + ASMJIT_INLINE_NODEBUG Gp new_gpx(Args&&... args) { return new_reg(TypeId::kUIntPtr, std::forward(args)...); } + //! Creates a new 32-bit or 64-bit general purpose register depending on the target register width. + //! + //! \note This is a convenience function, on aarch64 target it always creates a 64-bit general-purpose register. template - ASMJIT_INLINE_NODEBUG Gp new_gp32(Args&&... args) { return _new_reg_internal(TypeId::kUInt32, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG Gp new_gp64(Args&&... args) { return _new_reg_internal(TypeId::kUInt64, std::forward(args)...); } + ASMJIT_INLINE_NODEBUG Gp new_gpz(Args&&... args) { return new_reg(TypeId::kUIntPtr, std::forward(args)...); } + //! Creates a new 32-bit or 64-bit general purpose register depending on the target register width. + //! + //! \note This is a convenience function, on aarch64 target it always creates a 64-bit general-purpose register. template - ASMJIT_INLINE_NODEBUG Gp new_gpw(Args&&... 
args) { return _new_reg_internal(TypeId::kUInt32, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG Gp new_gpx(Args&&... args) { return _new_reg_internal(TypeId::kUInt64, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG Gp new_gpz(Args&&... args) { return _new_reg_internal(TypeId::kUIntPtr, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG Gp new_gp_ptr(Args&&... args) { return _new_reg_internal(TypeId::kUIntPtr, std::forward(args)...); } + ASMJIT_INLINE_NODEBUG Gp new_gp_ptr(Args&&... args) { return new_reg(TypeId::kUIntPtr, std::forward(args)...); } + //! Creates a new 128-bit vector register. template - ASMJIT_INLINE_NODEBUG Vec new_vec(TypeId type_id, Args&&... args) { return _new_reg_internal(type_id, std::forward(args)...); } + ASMJIT_INLINE_NODEBUG Vec new_vec128(Args&&... args) { return new_reg(TypeId::kInt32x4, std::forward(args)...); } + //! Creates a new 128-bit vector register that will be used for scalar 32-bit floating point operation. template - ASMJIT_INLINE_NODEBUG Vec new_vec_s(Args&&... args) { return _new_reg_internal(TypeId::kFloat32, std::forward(args)...); } + ASMJIT_INLINE_NODEBUG Vec new_vec128_f32x1(Args&&... args) { return new_reg(TypeId::kFloat32x1, std::forward(args)...); } + //! Creates a new 128-bit vector register that will be used for scalar 64-bit floating point operation. template - ASMJIT_INLINE_NODEBUG Vec new_vec_d(Args&&... args) { return _new_reg_internal(TypeId::kFloat64, std::forward(args)...); } + ASMJIT_INLINE_NODEBUG Vec new_vec128_f64x1(Args&&... args) { return new_reg(TypeId::kFloat64x1, std::forward(args)...); } + //! Creates a new 128-bit vector register that will be used for packed 32-bit floating point operation. template - ASMJIT_INLINE_NODEBUG Vec new_vec_q(Args&&... args) { return _new_reg_internal(TypeId::kUInt8x16, std::forward(args)...); } + ASMJIT_INLINE_NODEBUG Vec new_vec128_f32x4(Args&&... args) { return new_reg(TypeId::kFloat32x4, std::forward(args)...); } + + //! Creates a new 128-bit vector register that will be used for packed 64-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec128_f64x2(Args&&... args) { return new_reg(TypeId::kFloat64x2, std::forward(args)...); } + + //! Creates a new 32-bit vector register (S). + //! + //! \note This may look like an alias of \ref new_vec128_f32x1(), but it's not. This really creates a 32-bit + //! register, which has a type \ref RegType::kVec32, whereas \ref new_vec128_f32x1() creates a register, + //! which has a type \ref RegType::kVec64 + template + ASMJIT_INLINE_NODEBUG Vec new_vec_s(Args&&... args) { return new_reg(TypeId::kFloat32, std::forward(args)...); } + + //! Alias of \ref new_vec128_f64x1() that matches aarch64 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_vec_d(Args&&... args) { return new_reg(TypeId::kFloat64, std::forward(args)...); } + + //! Alias of \ref new_vec128() that matches aarch64 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_vec_q(Args&&... args) { return new_reg(TypeId::kInt32x4, std::forward(args)...); } //! \} diff --git a/src/asmjit/core.h b/src/asmjit/core.h index 58c1751..7de2aa6 100644 --- a/src/asmjit/core.h +++ b/src/asmjit/core.h @@ -188,19 +188,16 @@ namespace asmjit { //! you can just use the following CMake snippet that integrates AsmJit with your own CMake project: //! //! ```cmake -//! cmake_minimum_required(VERSION 3.30) +//! cmake_minimum_required(VERSION 3.30 FATAL_ERROR) +//! project(app C CXX) //! -//! 
project(asmjit_consumer C CXX) # Both C and CXX are required. -//! set(CMAKE_CXX_STANDARD 17) # C++17 and never is supported. +//! set(ASMJIT_DIR "3rdparty/asmjit") # Location of AsmJit. +//! set(ASMJIT_STATIC TRUE) # Force static build. +//! add_subdirectory("${ASMJIT_DIR}") # Adds AsmJit sub-project to your project. //! -//! set(ASMJIT_DIR "3rdparty/asmjit") # Location of AsmJit. -//! set(ASMJIT_STATIC TRUE) # Force static build. -//! -//! add_subdirectory("${ASMJIT_DIR}") # This adds AsmJit as a part of your project. -//! -//! add_executable(asmjit_consumer asmjit_consumer.cpp) -//! target_link_libraries( -//! asmjit_consumer asmjit::asmjit) # This adds AsmJit as a dependency to your target. +//! add_executable(app asmjit_consumer.cpp) # Adds executable that uses AsmJit. +//! target_link_libraries(app asmjit::asmjit) # Adds AsmJit as a dependency to app. +//! target_compile_features(app PUBLIC cxx_std_17) # Makes C++17 as a requirement. //! ``` //! //! \section build_type Build Type Configuration @@ -2111,8 +2108,53 @@ namespace asmjit { //! \defgroup asmjit_a64 AArch64 Backend //! \brief AArch64 backend. -//! \defgroup asmjit_ujit UJIT -//! \brief Universal JIT - abstracts X86|X86_64 and AArch64 code generation. +//! \defgroup asmjit_ujit Universal JIT +//! \brief Universal JIT abstracts X86, X86_64, and AArch64 code generation. +//! +//! ### Overview +//! +//! Universal JIT (UJIT) is an abstraction that uses AsmJit's Compiler, but provides target independent API that +//! users can use to target multiple target architectures at a time. The goal of Universal JIT is not to provide +//! its own IR. Instead, it translates user calls into target-dependent instructions (or instruction sequences) +//! and allows users to switch to target-specific assembly only where required for extra performance. +//! +//! \warning UJIT is still in an experimental phase, expect minor API breaks in the future. +//! +//! API Overview +//! +//! Compiler: +//! +//! - \ref ujit::UniCompiler - UniCompiler that wraps an existing \ref ujit::BackendCompiler. +//! - \ref ujit::BackendCompiler - alias of a platform-dependent Compiler (\ref x86::Compiler or \ref a64::Compiler). +//! +//! Operands: +//! +//! - \ref ujit::Gp - alias of a platform-dependent general-purpose register (\ref x86::Gp, \ref a64::Gp). +//! - \ref ujit::Vec - alias of a platform-dependent vector register (\ref x86::Vec, \ref a64::Vec). +//! - \ref ujit::Mem - alias of a platform-dependent memory operand (\ref x86::Mem, \ref a64::Mem). +//! +//! Conditions: +//! +//! - \ref ujit::CondCode - alias of a platform-dependent condition code (\ref x86::CondCode, a64::CondCode). +//! - \ref ujit::UniCondition - platform-independent condition representation that can be used with some ujit +//! instructions. +//! +//! Instructions: +//! +//! - \ref ujit::UniOpCond - instruction that can be used by \ref ujit::UniCondition. +//! - \ref ujit::UniOpM - instruction with a single `[mem]` operand. +//! - \ref ujit::UniOpRM - instruction with `[reg, mem]` operands. +//! - \ref ujit::UniOpMR - instruction with `[mem, reg]` operands. +//! - \ref ujit::UniOpRR - instruction with `[reg, reg]` operands. +//! - \ref ujit::UniOpRRR - instruction with `[reg, reg, reg]` operands. +//! - \ref ujit::UniOpVR - instruction with `[vec, reg]` operands. +//! - \ref ujit::UniOpVM - instruction with `[vec, mem]` operands. +//! - \ref ujit::UniOpMV - instruction with `[mem, vec]` operands. +//! - \ref ujit::UniOpVV - instruction with `[vec, vec]` operands. +//! 
- \ref ujit::UniOpVVI - instruction with `[vec, vec, imm]` operands. +//! - \ref ujit::UniOpVVV - instruction with `[vec, vec, vec]` operands. +//! - \ref ujit::UniOpVVVI - instruction with `[vec, vec, vec, imm]` operands. +//! - \ref ujit::UniOpVVVV - instruction with `[vec, vec, vec, vec]` operands. //! \cond INTERNAL //! \defgroup asmjit_ra RA @@ -2122,7 +2164,16 @@ namespace asmjit { } // {asmjit} #include "asmjit-scope-begin.h" +#include "core/api-config.h" +#include "core/archcommons.h" #include "core/archtraits.h" +#include "core/arena.h" +#include "core/arenahash.h" +#include "core/arenalist.h" +#include "core/arenapool.h" +#include "core/arenastring.h" +#include "core/arenatree.h" +#include "core/arenavector.h" #include "core/assembler.h" #include "core/builder.h" #include "core/codebuffer.h" @@ -2149,13 +2200,6 @@ namespace asmjit { #include "core/target.h" #include "core/type.h" #include "core/virtmem.h" -#include "core/arena.h" -#include "core/arenahash.h" -#include "core/arenalist.h" -#include "core/arenapool.h" -#include "core/arenatree.h" -#include "core/arenastring.h" -#include "core/arenavector.h" #include "asmjit-scope-end.h" #endif // ASMJIT_CORE_H_INCLUDED diff --git a/src/asmjit/core/api-build_p.h b/src/asmjit/core/api-build_p.h index 03315fa..af56b64 100644 --- a/src/asmjit/core/api-build_p.h +++ b/src/asmjit/core/api-build_p.h @@ -68,7 +68,7 @@ // Include a unit testing package if this is a `asmjit_test_unit` build. #if defined(ASMJIT_TEST) - #include "../../../test/broken.h" + #include "../../../testing/tests/broken.h" #endif #endif // ASMJIT_CORE_API_BUILD_P_H_INCLUDED diff --git a/src/asmjit/core/api-config.h b/src/asmjit/core/api-config.h index 398fb7f..2d51778 100644 --- a/src/asmjit/core/api-config.h +++ b/src/asmjit/core/api-config.h @@ -16,7 +16,7 @@ #define ASMJIT_LIBRARY_MAKE_VERSION(major, minor, patch) ((major << 16) | (minor << 8) | (patch)) //! AsmJit library version, see \ref ASMJIT_LIBRARY_MAKE_VERSION for a version format reference. -#define ASMJIT_LIBRARY_VERSION ASMJIT_LIBRARY_MAKE_VERSION(1, 18, 1) +#define ASMJIT_LIBRARY_VERSION ASMJIT_LIBRARY_MAKE_VERSION(1, 19, 0) //! \def ASMJIT_ABI_NAMESPACE //! @@ -27,7 +27,7 @@ //! AsmJit default, which makes it possible to use multiple AsmJit libraries within a single project, totally //! controlled by users. This is useful especially in cases in which some of such library comes from third party. #if !defined(ASMJIT_ABI_NAMESPACE) - #define ASMJIT_ABI_NAMESPACE v1_18 + #define ASMJIT_ABI_NAMESPACE v1_19 #endif // !ASMJIT_ABI_NAMESPACE //! \} diff --git a/src/asmjit/core/builder.h b/src/asmjit/core/builder.h index 5e03c16..797d65c 100644 --- a/src/asmjit/core/builder.h +++ b/src/asmjit/core/builder.h @@ -21,8 +21,6 @@ #include "../core/support.h" #include "../core/type.h" -#define ASMJIT_NO_NODE_USERDATA - ASMJIT_BEGIN_NAMESPACE //! 
\addtogroup asmjit_builder diff --git a/src/asmjit/core/compiler.cpp b/src/asmjit/core/compiler.cpp index 11f0f6b..62f79b4 100644 --- a/src/asmjit/core/compiler.cpp +++ b/src/asmjit/core/compiler.cpp @@ -259,7 +259,7 @@ Error BaseCompiler::new_virt_reg(Out out, TypeId type_id, OperandSigna return Error::kOk; } -Error BaseCompiler::_new_reg(Out out, TypeId type_id, const char* name) { +Error BaseCompiler::_new_reg_with_name(Out out, TypeId type_id, const char* name) { OperandSignature reg_signature; out->reset(); @@ -276,7 +276,7 @@ Error BaseCompiler::_new_reg(Out out, TypeId type_id, const char* name) { return Error::kOk; } -Error BaseCompiler::_new_reg(Out out, const Reg& ref, const char* name) { +Error BaseCompiler::_new_reg_with_name(Out out, const Reg& ref, const char* name) { out->reset(); OperandSignature reg_signature; @@ -351,7 +351,7 @@ Error BaseCompiler::_new_reg(Out out, const Reg& ref, const char* name) { return Error::kOk; } -Error BaseCompiler::_new_reg_fmt(Out out, TypeId type_id, const char* fmt, ...) { +Error BaseCompiler::_new_reg_with_vfmt(Out out, TypeId type_id, const char* fmt, ...) { va_list ap; StringTmp<256> sb; @@ -362,7 +362,7 @@ Error BaseCompiler::_new_reg_fmt(Out out, TypeId type_id, const char* fmt, return _new_reg(out, type_id, sb.data()); } -Error BaseCompiler::_new_reg_fmt(Out out, const Reg& ref, const char* fmt, ...) { +Error BaseCompiler::_new_reg_with_vfmt(Out out, const Reg& ref, const char* fmt, ...) { va_list ap; StringTmp<256> sb; diff --git a/src/asmjit/core/compiler.h b/src/asmjit/core/compiler.h index e4cdc8b..bddf749 100644 --- a/src/asmjit/core/compiler.h +++ b/src/asmjit/core/compiler.h @@ -152,26 +152,86 @@ public: //! Creates a new virtual register representing the given `type_id` and `signature`. //! - //! \note This function is public, but it's not generally recommended to be used by AsmJit users, use architecture - //! specific `new_reg()` functionality instead or functions like \ref _new_reg() and \ref _new_reg_fmt(). + //! \note This function is public, but it's not generally recommended to be used by AsmJit users, use `new_reg()`, + //! `new_similar_reg()`, and architecture specific functions like \ref x86::Compiler::new_gp32(), etc... ASMJIT_API Error new_virt_reg(Out out, TypeId type_id, OperandSignature signature, const char* name); + //! \cond INTERNAL + //! Creates a new virtual register of the given `type_id` and stores it to `out` operand. - ASMJIT_API Error _new_reg(Out out, TypeId type_id, const char* name = nullptr); + ASMJIT_API Error _new_reg_with_name(Out out, TypeId type_id, const char* name); + //! Creates a new virtual register compatible with the provided reference register `ref`. - ASMJIT_API Error _new_reg(Out out, const Reg& ref, const char* name = nullptr); + ASMJIT_API Error _new_reg_with_name(Out out, const Reg& ref, const char* name); //! Creates a new virtual register of the given `type_id` and stores it to `out` operand. //! //! \note This version accepts a snprintf() format `fmt` followed by variadic arguments. - ASMJIT_API Error _new_reg_fmt(Out out, TypeId type_id, const char* fmt, ...); - //! \overload - ASMJIT_INLINE Error _new_reg_fmt(Out out, TypeId type_id) { return _new_reg(out, type_id); } + ASMJIT_API Error _new_reg_with_vfmt(Out out, TypeId type_id, const char* fmt, ...); //! Creates a new virtual register compatible with the provided reference register `ref`. //! //! \note This version accepts a snprintf() format `fmt` followed by variadic arguments. 
- ASMJIT_API Error _new_reg_fmt(Out out, const Reg& ref, const char* fmt, ...); + ASMJIT_API Error _new_reg_with_vfmt(Out out, const Reg& ref, const char* fmt, ...); + + template + ASMJIT_INLINE Error _new_reg(Out out, TypeId type_id) { + return _new_reg_with_name(Out(out.value()), type_id, nullptr); + } + + template + ASMJIT_INLINE Error _new_reg(Out out, TypeId type_id, const char* name_or_fmt, Args&&... args) { +#ifndef ASMJIT_NO_LOGGING + if constexpr (sizeof...(args) == 0u) { + return _new_reg_with_name(Out(out.value()), type_id, name_or_fmt); + } + else { + return _new_reg_with_vfmt(Out(out.value()), type_id, name_or_fmt, std::forward(args)...); + } +#else + Support::maybe_unused(name_or_fmt, std::forward(args)...); + return _new_reg_with_name(Out(out.value()), type_id, nullptr); +#endif + } + + template + ASMJIT_INLINE Error _new_reg(Out out, const Reg& ref) { + return _new_reg_with_name(Out(out.value()), ref, nullptr); + } + + template + ASMJIT_INLINE Error _new_reg(Out out, const Reg& ref, const char* name_or_fmt, Args&&... args) { +#ifndef ASMJIT_NO_LOGGING + if constexpr (sizeof...(args) == 0u) { + return _new_reg_with_name(Out(out.value()), ref, name_or_fmt); + } + else { + return _new_reg_with_vfmt(Out(out.value()), ref, name_or_fmt, std::forward(args)...); + } +#else + Support::maybe_unused(name_or_fmt, std::forward(args)...); + return _new_reg_with_name(Out(out.value()), ref, nullptr); +#endif + } + + //! \endcond + + template + ASMJIT_INLINE_NODEBUG RegT new_reg(TypeId type_id, Args&&... args) { + RegT reg(Globals::NoInit); + (void)_new_reg(Out(reg), type_id, std::forward(args)...); + return reg; + } + + //! Creates and returns a new register, which is similar to `ref` in terms of size and type. + //! + //! \note Optionally you can provide a name and format parameters via `args`. + template + ASMJIT_INLINE_NODEBUG RegT new_similar_reg(const RegT& ref, Args&&... args) { + RegT reg(Globals::NoInit); + (void)_new_reg(Out(reg), ref, std::forward(args)...); + return reg; + } //! Tests whether the given `virt_id` is a valid virtual register id. [[nodiscard]] diff --git a/src/asmjit/core/cpuinfo.cpp b/src/asmjit/core/cpuinfo.cpp index edd488d..2133a7d 100644 --- a/src/asmjit/core/cpuinfo.cpp +++ b/src/asmjit/core/cpuinfo.cpp @@ -593,6 +593,98 @@ static ASMJIT_FAVOR_SIZE void detect_x86_cpu(CpuInfo& cpu) noexcept { simplify_cpu_brand(cpu._brand.str); } +static ASMJIT_FAVOR_SIZE CpuHints recalculate_hints(const CpuInfo& cpu_info, const CpuFeatures::X86& features) noexcept { + CpuHints hints {}; + + // Vendor Independent CPU Hints + // ---------------------------- + + if (features.has_avx2()) { + hints |= CpuHints::kVecMaskedOps32 | CpuHints::kVecMaskedOps64; + } + + if (features.has_avx512_bw()) { + hints |= CpuHints::kVecMaskedOps8 | CpuHints::kVecMaskedOps16 | CpuHints::kVecMaskedOps32 | CpuHints::kVecMaskedOps64; + } + + // Select optimization flags based on CPU vendor and micro-architecture. + + // AMD Specific CPU Hints + // ---------------------- + + if (cpu_info.is_vendor("AMD")) { + // Zen 3+ has fast gathers, scalar loads and shuffles are faster on Zen 2 and older CPUs. + if (cpu_info.family_id() >= 0x19u) { + hints |= CpuHints::kVecFastGather; + } + + // Zen 1+ provides low-latency VPMULLD instruction. + if (features.has_avx2()) { + hints |= CpuHints::kVecFastIntMul32; + } + + // Zen 4+ provides low-latency VPMULLQ instruction. + if (features.has_avx512_dq()) { + hints |= CpuHints::kVecFastIntMul64; + } + + // Zen 4+ has fast mask operations (starts with AVX-512). 
+ if (features.has_avx512_f()) { + hints |= CpuHints::kVecMaskedStore; + } + } + + // Intel Specific CPU Hints + // ------------------------ + + if (cpu_info.is_vendor("INTEL")) { + if (features.has_avx2()) { + uint32_t family_id = cpu_info.family_id(); + uint32_t model_id = cpu_info.model_id(); + + // NOTE: We only want to hint fast gathers in cases the CPU is immune to DOWNFALL. The reason is that the + // DOWNFALL mitigation delivered via a micro-code update makes gathers almost useless in a way that scalar + // loads can beat it significantly (in Blend2D case scalar loads can offer up to 50% more performance). + // This table basically picks CPUs that are known to not be affected by DOWNFALL. + if (family_id == 0x06u) { + switch (model_id) { + case 0x8Fu: // Sapphire Rapids. + case 0x96u: // Elkhart Lake. + case 0x97u: // Alder Lake / Catlow. + case 0x9Au: // Alder Lake / Arizona Beach. + case 0x9Cu: // Jasper Lake. + case 0xAAu: // Meteor Lake. + case 0xACu: // Meteor Lake. + case 0xADu: // Granite Rapids. + case 0xAEu: // Granite Rapids. + case 0xAFu: // Sierra Forest. + case 0xBAu: // Raptor Lake. + case 0xB5u: // Arrow Lake. + case 0xB6u: // Grand Ridge. + case 0xB7u: // Raptor Lake / Catlow. + case 0xBDu: // Lunar Lake. + case 0xBEu: // Alder Lake (N). + case 0xBFu: // Raptor Lake. + case 0xC5u: // Arrow Lake. + case 0xC6u: // Arrow Lake. + case 0xCFu: // Emerald Rapids. + case 0xDDu: // Clearwater Forest. + hints |= CpuHints::kVecFastGather; + break; + + default: + break; + } + } + } + + // TODO: It seems masked stores are very expensive on consumer INTEL CPUs. + // hints |= CpuHints::kVecMaskedStore; + } + + return hints; +} + } // {x86} #endif // ASMJIT_ARCH_X86 @@ -2237,6 +2329,15 @@ static ASMJIT_FAVOR_SIZE void detect_arm_cpu(CpuInfo& cpu) noexcept { } #endif +static ASMJIT_FAVOR_SIZE CpuHints recalculate_hints(const CpuInfo& cpu_info, const CpuFeatures::ARM& features) noexcept { + Support::maybe_unused(cpu_info, features); + + // Assume ARM CPUs have fast 32-bit SIMD integer multiplication. 
+ CpuHints hints = CpuHints::kVecFastIntMul32; + + return hints; +} + } // {arm} #endif @@ -2261,8 +2362,9 @@ const CpuInfo& CpuInfo::host() noexcept { #elif ASMJIT_ARCH_ARM arm::detect_arm_cpu(cpu_info_local); #endif - cpu_info_local._hw_thread_count = detect_hw_thread_count(); + cpu_info_local.update_hints(); + cpu_info_global = cpu_info_local; cpu_info_initialized_flag.store(1, std::memory_order_seq_cst); } @@ -2270,4 +2372,15 @@ return cpu_info_global; } +CpuHints CpuInfo::recalculate_hints(const CpuInfo& info, const CpuFeatures& features) noexcept { +#if ASMJIT_ARCH_X86 + return x86::recalculate_hints(info, features.x86()); +#elif ASMJIT_ARCH_ARM + return arm::recalculate_hints(info, features.arm()); +#else + Support::maybe_unused(info, features); + return CpuHints::kNone; +#endif +} + ASMJIT_END_NAMESPACE diff --git a/src/asmjit/core/cpuinfo.h b/src/asmjit/core/cpuinfo.h index 12ea382..6225a8c 100644 --- a/src/asmjit/core/cpuinfo.h +++ b/src/asmjit/core/cpuinfo.h @@ -525,6 +525,59 @@ public: ASMJIT_X86_FEATURE(has_amx_transpose, kAMX_TRANSPOSE) #undef ASMJIT_X86_FEATURE + + ASMJIT_INLINE void remove_avx() noexcept { + remove(kAVX , + kAVX2 , + kAVX_IFMA , + kAVX_NE_CONVERT , + kAVX_VNNI , + kAVX_VNNI_INT16 , + kAVX_VNNI_INT8 , + kF16C , + kFMA , + kFMA4 , + kVAES , + kVPCLMULQDQ , + kXOP); + remove_avx512(); + } + + ASMJIT_INLINE void remove_avx512() noexcept { + remove(kAVX512_BF16 , + kAVX512_BITALG , + kAVX512_BW , + kAVX512_CD , + kAVX512_DQ , + kAVX512_F , + kAVX512_FP16 , + kAVX512_IFMA , + kAVX512_VBMI , + kAVX512_VBMI2 , + kAVX512_VL , + kAVX512_VNNI , + kAVX512_VP2INTERSECT , + kAVX512_VPOPCNTDQ , + kAMX_AVX512); + remove_avx10(); + } + + ASMJIT_INLINE void remove_avx10() noexcept { + remove(kAVX10_1 | kAVX10_2); + } + + ASMJIT_INLINE void remove_amx() noexcept { + remove(kAMX_AVX512 , + kAMX_BF16 , + kAMX_COMPLEX , + kAMX_FP16 , + kAMX_FP8 , + kAMX_INT8 , + kAMX_MOVRS , + kAMX_TF32 , + kAMX_TILE , + kAMX_TRANSPOSE); + } }; //! ARM specific features data. @@ -1104,6 +1157,39 @@ public: //! \} }; +//! Describes micro-architectural hints that can be used for optimization purposes and are not part of \ref CpuFeatures. +enum class CpuHints : uint32_t { + //! No hints. + kNone = 0x0u, + + //! CPU provides fast 8-bit masked loads and stores. + kVecMaskedOps8 = 0x00000001u, + + //! CPU provides fast 16-bit masked loads and stores. + kVecMaskedOps16 = 0x00000002u, + + //! CPU provides fast 32-bit masked loads and stores. + kVecMaskedOps32 = 0x00000004u, + + //! CPU provides fast 64-bit masked loads and stores. + kVecMaskedOps64 = 0x00000008u, + + //! CPU provides low-latency 32-bit multiplication (AMD CPUs). + kVecFastIntMul32 = 0x00000010u, + + //! CPU provides low-latency 64-bit multiplication (AMD CPUs). + kVecFastIntMul64 = 0x00000020u, + + //! CPU provides fast hardware gathers, which are faster than a sequence of loads and inserts. + kVecFastGather = 0x00000040u, + + //! CPU has fast stores with mask. + //! + //! \note This is a hint to the compiler to emit a masked store instead of a sequence having branches. + kVecMaskedStore = 0x00000080u +}; +ASMJIT_DEFINE_ENUM_FLAGS(CpuHints) + //! CPU information. class CpuInfo { public: @@ -1142,6 +1228,9 @@ public: //! CPU features. CpuFeatures _features {}; + //! CPU hints. + CpuHints _hints {}; + //! \} //! \name Construction & Destruction //! \{ @@ -1167,6 +1256,12 @@ [[nodiscard]] ASMJIT_API static const CpuInfo& host() noexcept; + //! Updates CPU hints based on the CPU data and features. 
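A rough sketch of how the hints added above are meant to be queried and kept in sync with features; the helper functions and the gather-vs-scalar decision are illustrative assumptions, only the `CpuInfo`/`CpuHints`/`CpuFeatures` calls come from the patch.

```cpp
// Sketch: querying micro-architectural hints and refreshing them after changing features.
#include <asmjit/core.h>

using namespace asmjit;

static bool prefer_hardware_gathers() {
  const CpuInfo& host = CpuInfo::host();

  // Hints are populated by the CPU detection logic (which calls update_hints()).
  CpuHints hints = host.hints();
  return (hints & CpuHints::kVecFastGather) != CpuHints::kNone;
}

static CpuInfo cpu_without_avx512f() {
  // When features are edited manually, hints must be recalculated to stay in sync.
  CpuInfo cpu = CpuInfo::host();
  cpu.remove_feature(CpuFeatures::X86::kAVX512_F);
  cpu.update_hints();
  return cpu;
}
```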
+ //! + //! \note This function is called automatically by the CPU detection logic. However, if you change the CPU features + //! in your own instance of \ref CpuInfo, CPU hints must be updated too, otherwise they would be out of sync. + ASMJIT_API static CpuHints recalculate_hints(const CpuInfo& info, const CpuFeatures& features) noexcept; + //! \} //! \name Overloaded Operators @@ -1298,6 +1393,16 @@ public: template ASMJIT_INLINE_NODEBUG void remove_feature(Args&&... args) noexcept { return _features.remove(std::forward(args)...); } + //! Returns CPU hints. + [[nodiscard]] + ASMJIT_INLINE_NODEBUG CpuHints hints() const noexcept { return _hints; } + + //! Updates CPU hints based on the CPU data and features. + //! + //! \note This function is called automatically by the CPU detection logic. However, if you change the CPU features + //! in your own instance of \ref CpuInfo, CPU hints must be updated too, otherwise they would be out of sync. + ASMJIT_INLINE void update_hints() noexcept { _hints = recalculate_hints(*this, _features); } + //! \} }; diff --git a/src/asmjit/core/emitter.h b/src/asmjit/core/emitter.h index d604c08..8da9248 100644 --- a/src/asmjit/core/emitter.h +++ b/src/asmjit/core/emitter.h @@ -668,9 +668,9 @@ public: //! \name Sections //! \{ - //! Switches the given `section`. + //! Switches to the given `section`. //! - //! Once switched, everything is added to the given `section`. + //! Once switched, everything is emitted to `section`. ASMJIT_API virtual Error section(Section* section); //! \} diff --git a/src/asmjit/core/jitallocator.cpp b/src/asmjit/core/jitallocator.cpp index e5ac4ce..c7f8e98 100644 --- a/src/asmjit/core/jitallocator.cpp +++ b/src/asmjit/core/jitallocator.cpp @@ -17,7 +17,7 @@ #include "../core/virtmem.h" #if defined(ASMJIT_TEST) -#include "../../../test/asmjit_test_random.h" +#include "../../../testing/commons/random.h" #endif // ASMJIT_TEST ASMJIT_BEGIN_NAMESPACE diff --git a/src/asmjit/core/jitruntime.cpp b/src/asmjit/core/jitruntime.cpp index a22c10e..b4ef077 100644 --- a/src/asmjit/core/jitruntime.cpp +++ b/src/asmjit/core/jitruntime.cpp @@ -15,7 +15,10 @@ JitRuntime::JitRuntime(const JitAllocator::CreateParams* params) noexcept : _allocator(params) { _environment = Environment::host(); _environment.set_object_format(ObjectFormat::kJIT); - _cpu_features = CpuInfo::host().features(); + + const CpuInfo& host_cpu = CpuInfo::host(); + _cpu_features = host_cpu.features(); + _cpu_hints = host_cpu.hints(); } JitRuntime::~JitRuntime() noexcept {} diff --git a/src/asmjit/core/support.h b/src/asmjit/core/support.h index ac28fae..3149ae1 100644 --- a/src/asmjit/core/support.h +++ b/src/asmjit/core/support.h @@ -11,9 +11,7 @@ #if defined(_MSC_VER) #include -#endif - -#if defined(__BMI2__) +#elif defined(__BMI2__) #include #endif diff --git a/src/asmjit/core/target.cpp b/src/asmjit/core/target.cpp index df8dc14..59c6fc7 100644 --- a/src/asmjit/core/target.cpp +++ b/src/asmjit/core/target.cpp @@ -10,7 +10,8 @@ ASMJIT_BEGIN_NAMESPACE Target::Target() noexcept : _environment{}, - _cpu_features{} {} + _cpu_features{}, + _cpu_hints{} {} Target::~Target() noexcept {} ASMJIT_END_NAMESPACE diff --git a/src/asmjit/core/target.h b/src/asmjit/core/target.h index 73228b6..36cbef1 100644 --- a/src/asmjit/core/target.h +++ b/src/asmjit/core/target.h @@ -25,6 +25,8 @@ public: Environment _environment; //! Target CPU features. CpuFeatures _cpu_features; + //! Target CPU hints. + CpuHints _cpu_hints; //! \name Construction & Destruction //! \{ @@ -55,6 +57,10 @@ public: //! 
Returns target CPU features. ASMJIT_INLINE_NODEBUG const CpuFeatures& cpu_features() const noexcept { return _cpu_features; } + [[nodiscard]] + //! Returns target CPU hints. + ASMJIT_INLINE_NODEBUG CpuHints cpu_hints() const noexcept { return _cpu_hints; } + //! \} }; diff --git a/src/asmjit/ujit.h b/src/asmjit/ujit.h index 3ad4bc1..78c1082 100644 --- a/src/asmjit/ujit.h +++ b/src/asmjit/ujit.h @@ -9,6 +9,7 @@ #include "asmjit-scope-begin.h" #include "ujit/ujitbase.h" #include "ujit/unicompiler.h" +#include "ujit/unicondition.h" #include "ujit/uniop.h" #include "ujit/vecconsttable.h" #include "asmjit-scope-end.h" diff --git a/src/asmjit/ujit/ujitbase.h b/src/asmjit/ujit/ujitbase.h index 3dcf3c4..ecc72cb 100644 --- a/src/asmjit/ujit/ujitbase.h +++ b/src/asmjit/ujit/ujitbase.h @@ -14,23 +14,35 @@ #if !defined(ASMJIT_NO_UJIT) +//! \namespace asmjit::ujit +//! \ingroup asmjit_ujit +//! +//! Namespace that provides all UJIT (Universal JIT) functionality. + ASMJIT_BEGIN_SUB_NAMESPACE(ujit) //! \addtogroup asmjit_ujit //! \{ +//! Backend compiler is simply an alias to a `host::Compiler`, which would be used by \ref UniCompiler. using BackendCompiler = host::Compiler; +//! Condition code is simply an alias to a `host::CondCode`. using CondCode = host::CondCode; +//! Target memory operand. using Mem = host::Mem; +//! Target general-purpose register. using Gp = host::Gp; +//! Target vector register. using Vec = host::Vec; #if defined(ASMJIT_UJIT_X86) +static ASMJIT_INLINE_NODEBUG Mem mem_ptr(const Label& label, int32_t disp = 0) noexcept { return x86::ptr(label, disp); } static ASMJIT_INLINE_NODEBUG Mem mem_ptr(const Gp& base, int32_t disp = 0) noexcept { return x86::ptr(base, disp); } static ASMJIT_INLINE_NODEBUG Mem mem_ptr(const Gp& base, const Gp& index, uint32_t shift = 0, int32_t disp = 0) noexcept { return x86::ptr(base, index, shift, disp); } #endif #if defined(ASMJIT_UJIT_AARCH64) +static ASMJIT_INLINE_NODEBUG Mem mem_ptr(const Label& label, int32_t disp = 0) noexcept { return a64::ptr(label, disp); } static ASMJIT_INLINE_NODEBUG Mem mem_ptr(const Gp& base, int32_t disp = 0) noexcept { return a64::ptr(base, disp); } static ASMJIT_INLINE_NODEBUG Mem mem_ptr(const Gp& base, const Gp& index, uint32_t shift = 0) noexcept { return a64::ptr(base, index, a64::lsl(shift)); } #endif @@ -38,6 +50,7 @@ static ASMJIT_INLINE_NODEBUG Mem mem_ptr(const Gp& base, const Gp& index, uint32 // Types & Enums // ------------- +//! Data alignment. enum class Alignment : uint32_t {}; //! The behavior of a floating point scalar operation. @@ -48,6 +61,16 @@ enum class ScalarOpBehavior : uint8_t { kPreservingVec128 }; +//! The behavior of floating point to int conversion. +enum class FloatToIntOutsideRangeBehavior : uint8_t { + //! In case that the floating point is outside of the integer range, the value is the smallest integer value, + //! which would be `0x80`, `0x8000`, `0x80000000`, or `0x8000000000000000` depending on the target integer width. + kSmallestValue, + //! In case that the floating point is outside of the integer range, the resulting integer will be saturated. If + //! the floating point is NaN, the resulting integer value would be zero. + kSaturatedValue +}; + //! The behavior of a floating point min/max instructions when comparing against NaN. enum class FMinFMaxOpBehavior : uint8_t { //! Min and max selects a finite value if one of the compared values is NaN. @@ -68,16 +91,21 @@ enum class FMAddOpBehavior : uint8_t { //! SIMD data width. enum class DataWidth : uint8_t { + //! 8-bit elements. 
k8 = 0, + //! 16-bit elements. k16 = 1, + //! 32-bit elements. k32 = 2, + //! 64-bit elements or 64-bit wide data is used. k64 = 3, + //! 128-bit elements or 128-bit wide data is used. k128 = 4 }; //! Vector register width. enum class VecWidth : uint8_t { - //! 128-bit vector register (baseline, SSE/AVX, NEON, ASIMD, etc...). + //! 128-bit vector register (baseline, SSE/AVX, NEON, etc...). k128 = 0, //! 256-bit vector register (AVX2+). k256 = 1, @@ -89,9 +117,13 @@ enum class VecWidth : uint8_t { //! Broadcast width. enum class Bcst : uint8_t { + //! Broadcast 8-bit elements. k8 = 0, + //! Broadcast 16-bit elements. k16 = 1, + //! Broadcast 32-bit elements. k32 = 2, + //! Broadcast 64-bit elements. k64 = 3, kNA = 0xFE, @@ -104,7 +136,7 @@ static ASMJIT_INLINE OperandSignature signature_of(VecWidth vw) noexcept { RegType reg_type = RegType(uint32_t(RegType::kVec128) + uint32_t(vw)); uint32_t reg_size = 16u << uint32_t(vw); - return OperandSignature::from_reg_type_and_group(reg_type, RegGroup::kVec) | OperandSignature::from_size(reg_size); + return OperandSignature::from_op_type(OperandType::kReg) | OperandSignature::from_reg_type_and_group(reg_type, RegGroup::kVec) | OperandSignature::from_size(reg_size); } static ASMJIT_INLINE TypeId type_id_of(VecWidth vw) noexcept { @@ -143,7 +175,7 @@ static ASMJIT_INLINE Vec clone_vec_as(const Vec& src, VecWidth vw) noexcept { // AsmJit Helpers // ============== -//! Operand array used by SIMD pipeline. +//! Operand array, mostly used for code generation that uses SIMD. //! //! Can hold up to `kMaxSize` registers, however, the number of actual registers is dynamic and depends //! on initialization. @@ -151,12 +183,15 @@ class OpArray { public: using Op = Operand_; + //! Maximum number of active operands `OpArray` can hold. static inline constexpr size_t kMaxSize = 8; //! \name Members //! \{ + //! Number of operands in OpArray size_t _size; + //! Underlying operand array. Operand_ v[kMaxSize]; //! \} @@ -405,6 +440,16 @@ public: ASMJIT_INLINE_NODEBUG OpArray even_odd(size_t from) const noexcept { return OpArray(*this, _size > 1u ? from : size_t(0), 2u, _size); } }; +//! Vector operand array. +//! +//! Used to model SIMD code generation where the code generator can use up to \ref OpArray::kMaxSize registers per +//! `VecArray`. The advantage of `VecArray` is that it allows to parametrize the ideal number of registers at runtime +//! and to use a single code-path to generate advanced SIMD code. +//! +//! In addition, \ref UniCompiler fully understands `VecArray` so it can be passed instead of a regular operand when +//! emitting code, which greatly simplifies designing high-performance SIMD code. +//! +//! \note VecArray is like \ref OpArray, just the whole API works with \ref Vec instead of \ref Operand_. class VecArray : public OpArray { public: //! \name Construction & Destruction @@ -587,7 +632,7 @@ static ASMJIT_INLINE void reset_var_array(T* array, size_t size) noexcept { template static ASMJIT_INLINE void reset_var_struct(T* data, size_t size = sizeof(T)) noexcept { - reset_var_array(reinterpret_cast(data), size / sizeof(asmjit::Reg)); + reset_var_array(reinterpret_cast(data), size / sizeof(Reg)); } static ASMJIT_INLINE_NODEBUG const Operand_& first_op(const Operand_& operand) noexcept { return operand; } @@ -615,10 +660,12 @@ struct Swizzle4 { ASMJIT_INLINE_CONSTEXPR bool operator!=(const Swizzle4& other) const noexcept { return value != other.value; } }; +//! Constructs a backend-independent 2-element vector swizzle parameter. 
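The swizzle helpers defined next pack their lane indices into a single immediate, with the first argument in the most significant byte and the last argument in the least significant byte. A small standalone check of that packing (hypothetical snippet that mirrors the formulas below, not code from the patch):

```cpp
// Mirrors the packing used by ujit::swizzle(b, a) and ujit::swizzle(d, c, b, a).
#include <cstdint>

constexpr uint32_t swizzle2_imm(uint8_t b, uint8_t a) noexcept {
  return (uint32_t(b) << 8) | a;
}

constexpr uint32_t swizzle4_imm(uint8_t d, uint8_t c, uint8_t b, uint8_t a) noexcept {
  return (uint32_t(d) << 24) | (uint32_t(c) << 16) | (uint32_t(b) << 8) | a;
}

static_assert(swizzle2_imm(1, 0) == 0x0100u, "first argument lands in the high byte");
static_assert(swizzle4_imm(3, 2, 1, 0) == 0x03020100u, "lane 0 selector occupies the lowest byte");
static_assert(swizzle4_imm(0, 1, 2, 3) == 0x00010203u, "reversed lane order");
```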
static ASMJIT_INLINE_CONSTEXPR Swizzle2 swizzle(uint8_t b, uint8_t a) noexcept { return Swizzle2{(uint32_t(b) << 8) | a}; } +//! Constructs a backend-independent 4-element vector swizzle parameter. static ASMJIT_INLINE_CONSTEXPR Swizzle4 swizzle(uint8_t d, uint8_t c, uint8_t b, uint8_t a) noexcept { return Swizzle4{(uint32_t(d) << 24) | (uint32_t(c) << 16) | (uint32_t(b) << 8) | a}; } @@ -631,6 +678,9 @@ enum class Perm2x128 : uint32_t { kZero = 8 }; +//! Constructs a backend-independent permutation of 128-bit lanes. +//! +//! \note This is currently only used by AVX2 and AVX-512 backends. static ASMJIT_INLINE_CONSTEXPR uint8_t perm_2x128_imm(Perm2x128 hi, Perm2x128 lo) noexcept { return uint8_t((uint32_t(hi) << 4) | (uint32_t(lo))); } diff --git a/src/asmjit/ujit/unicompiler.h b/src/asmjit/ujit/unicompiler.h index 34126c1..26ee3e4 100644 --- a/src/asmjit/ujit/unicompiler.h +++ b/src/asmjit/ujit/unicompiler.h @@ -17,159 +17,9 @@ ASMJIT_BEGIN_SUB_NAMESPACE(ujit) //! \addtogroup asmjit_ujit //! \{ -//! Condition represents either a condition or an assignment operation that can be checked. -class Condition { -public: - //! \name Members - //! \{ +class UniCondition; - UniOpCond op; - CondCode cond; - Operand a; - Operand b; - - //! \} - - //! \name Construction & Destruction - //! \{ - - ASMJIT_INLINE_NODEBUG Condition(UniOpCond op, CondCode cond, const Operand& a, const Operand& b) noexcept - : op(op), - cond(cond), - a(a), - b(b) {} - - ASMJIT_INLINE_NODEBUG Condition(const Condition& other) noexcept = default; - - //! \} - - //! \name Overloaded Operators - //! \{ - - ASMJIT_INLINE_NODEBUG Condition& operator=(const Condition& other) noexcept = default; - - //! \} -}; - -static ASMJIT_INLINE Condition and_z(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignAnd, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition and_z(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignAnd, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition and_z(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignAnd, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition and_nz(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignAnd, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition and_nz(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignAnd, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition and_nz(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignAnd, CondCode::kNotZero, a, b); } - -static ASMJIT_INLINE Condition or_z(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignOr, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition or_z(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignOr, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition or_z(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignOr, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition or_nz(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignOr, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition or_nz(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignOr, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition or_nz(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignOr, CondCode::kNotZero, a, b); } - -static ASMJIT_INLINE Condition xor_z(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignXor, CondCode::kZero, a, b); 
} -static ASMJIT_INLINE Condition xor_z(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignXor, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition xor_z(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignXor, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition xor_nz(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignXor, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition xor_nz(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignXor, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition xor_nz(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignXor, CondCode::kNotZero, a, b); } - -static ASMJIT_INLINE Condition add_z(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition add_z(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition add_z(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition add_nz(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition add_nz(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition add_nz(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition add_c(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kCarry, a, b); } -static ASMJIT_INLINE Condition add_c(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kCarry, a, b); } -static ASMJIT_INLINE Condition add_c(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kCarry, a, b); } -static ASMJIT_INLINE Condition add_nc(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotCarry, a, b); } -static ASMJIT_INLINE Condition add_nc(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotCarry, a, b); } -static ASMJIT_INLINE Condition add_nc(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotCarry, a, b); } -static ASMJIT_INLINE Condition add_s(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kSign, a, b); } -static ASMJIT_INLINE Condition add_s(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kSign, a, b); } -static ASMJIT_INLINE Condition add_s(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kSign, a, b); } -static ASMJIT_INLINE Condition add_ns(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotSign, a, b); } -static ASMJIT_INLINE Condition add_ns(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotSign, a, b); } -static ASMJIT_INLINE Condition add_ns(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignAdd, CondCode::kNotSign, a, b); } - -static ASMJIT_INLINE Condition sub_z(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition sub_z(const Gp& a, const Mem& b) noexcept { return 
Condition(UniOpCond::kAssignSub, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition sub_z(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition sub_nz(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition sub_nz(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition sub_nz(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition sub_c(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedLT, a, b); } -static ASMJIT_INLINE Condition sub_c(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedLT, a, b); } -static ASMJIT_INLINE Condition sub_c(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedLT, a, b); } -static ASMJIT_INLINE Condition sub_nc(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedGE, a, b); } -static ASMJIT_INLINE Condition sub_nc(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedGE, a, b); } -static ASMJIT_INLINE Condition sub_nc(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedGE, a, b); } -static ASMJIT_INLINE Condition sub_s(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kSign, a, b); } -static ASMJIT_INLINE Condition sub_s(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kSign, a, b); } -static ASMJIT_INLINE Condition sub_s(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kSign, a, b); } -static ASMJIT_INLINE Condition sub_ns(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kNotSign, a, b); } -static ASMJIT_INLINE Condition sub_ns(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kNotSign, a, b); } -static ASMJIT_INLINE Condition sub_ns(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kNotSign, a, b); } - -static ASMJIT_INLINE Condition sub_ugt(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedGT, a, b); } -static ASMJIT_INLINE Condition sub_ugt(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedGT, a, b); } -static ASMJIT_INLINE Condition sub_ugt(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignSub, CondCode::kUnsignedGT, a, b); } - -static ASMJIT_INLINE Condition shr_z(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignShr, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition shr_z(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignShr, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition shr_z(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignShr, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition shr_nz(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kAssignShr, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition shr_nz(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kAssignShr, CondCode::kNotZero, a, 
b); } -static ASMJIT_INLINE Condition shr_nz(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kAssignShr, CondCode::kNotZero, a, b); } - -static ASMJIT_INLINE Condition cmp_eq(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kEqual, a, b); } -static ASMJIT_INLINE Condition cmp_eq(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kEqual, a, b); } -static ASMJIT_INLINE Condition cmp_eq(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kEqual, a, b); } -static ASMJIT_INLINE Condition cmp_ne(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kNotEqual, a, b); } -static ASMJIT_INLINE Condition cmp_ne(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kNotEqual, a, b); } -static ASMJIT_INLINE Condition cmp_ne(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kNotEqual, a, b); } -static ASMJIT_INLINE Condition scmp_lt(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedLT, a, b); } -static ASMJIT_INLINE Condition scmp_lt(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedLT, a, b); } -static ASMJIT_INLINE Condition scmp_lt(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedLT, a, b); } -static ASMJIT_INLINE Condition scmp_le(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedLE, a, b); } -static ASMJIT_INLINE Condition scmp_le(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedLE, a, b); } -static ASMJIT_INLINE Condition scmp_le(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedLE, a, b); } -static ASMJIT_INLINE Condition scmp_gt(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedGT, a, b); } -static ASMJIT_INLINE Condition scmp_gt(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedGT, a, b); } -static ASMJIT_INLINE Condition scmp_gt(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedGT, a, b); } -static ASMJIT_INLINE Condition scmp_ge(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedGE, a, b); } -static ASMJIT_INLINE Condition scmp_ge(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedGE, a, b); } -static ASMJIT_INLINE Condition scmp_ge(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kSignedGE, a, b); } -static ASMJIT_INLINE Condition ucmp_lt(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedLT, a, b); } -static ASMJIT_INLINE Condition ucmp_lt(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedLT, a, b); } -static ASMJIT_INLINE Condition ucmp_lt(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedLT, a, b); } -static ASMJIT_INLINE Condition ucmp_le(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedLE, a, b); } -static ASMJIT_INLINE Condition ucmp_le(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedLE, a, b); } -static ASMJIT_INLINE Condition ucmp_le(const Gp& a, 
const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedLE, a, b); } -static ASMJIT_INLINE Condition ucmp_gt(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedGT, a, b); } -static ASMJIT_INLINE Condition ucmp_gt(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedGT, a, b); } -static ASMJIT_INLINE Condition ucmp_gt(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedGT, a, b); } -static ASMJIT_INLINE Condition ucmp_ge(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedGE, a, b); } -static ASMJIT_INLINE Condition ucmp_ge(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedGE, a, b); } -static ASMJIT_INLINE Condition ucmp_ge(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kCompare, CondCode::kUnsignedGE, a, b); } - -static ASMJIT_INLINE Condition test_z(const Gp& a) noexcept { return Condition(UniOpCond::kCompare, CondCode::kEqual, a, Imm(0)); } -static ASMJIT_INLINE Condition test_nz(const Gp& a) noexcept { return Condition(UniOpCond::kCompare, CondCode::kNotEqual, a, Imm(0)); } - -static ASMJIT_INLINE Condition test_z(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kTest, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition test_z(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kTest, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition test_z(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kTest, CondCode::kZero, a, b); } -static ASMJIT_INLINE Condition test_nz(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kTest, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition test_nz(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kTest, CondCode::kNotZero, a, b); } -static ASMJIT_INLINE Condition test_nz(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kTest, CondCode::kNotZero, a, b); } - -static ASMJIT_INLINE Condition bt_z(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kBitTest, CondCode::kBTZero, a, b); } -static ASMJIT_INLINE Condition bt_z(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kBitTest, CondCode::kBTZero, a, b); } -static ASMJIT_INLINE Condition bt_z(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kBitTest, CondCode::kBTZero, a, b); } -static ASMJIT_INLINE Condition bt_nz(const Gp& a, const Gp& b) noexcept { return Condition(UniOpCond::kBitTest, CondCode::kBTNotZero, a, b); } -static ASMJIT_INLINE Condition bt_nz(const Gp& a, const Mem& b) noexcept { return Condition(UniOpCond::kBitTest, CondCode::kBTNotZero, a, b); } -static ASMJIT_INLINE Condition bt_nz(const Gp& a, const Imm& b) noexcept { return Condition(UniOpCond::kBitTest, CondCode::kBTNotZero, a, b); } - -//! Pipeline compiler. +//! Universal compiler. class UniCompiler { public: ASMJIT_NONCOPYABLE(UniCompiler) @@ -285,7 +135,10 @@ public: //! AsmJit compiler. BackendCompiler* cc = nullptr; - const VecConstTable& ct; + //! Reference to a table that provides global constants. + //! + //! \note This table can be extended by users so it fits a particular use-case, see \ref UniCompiler constructor. + VecConstTableRef _ct_ref; #if defined(ASMJIT_UJIT_X86) //! General purpose extension mask (X86 and X86_64 only). @@ -306,14 +159,16 @@ public: //! 
The behavior of scalar operations (mostly floating point). ScalarOpBehavior _scalar_op_behavior {}; //! The behavior of floating point min/max operation. - FMinFMaxOpBehavior _fmin_fmax_op_hehavior {}; + FMinFMaxOpBehavior _fmin_fmax_op_behavior {}; //! The behavior of floating point `madd` operation. FMAddOpBehavior _fmadd_op_behavior {}; + //! The behavior of a float-to-int conversion when the float is out of integer range, infinite, or NaN. + FloatToIntOutsideRangeBehavior _float_to_int_outside_range_behavior {}; //! Target CPU features. CpuFeatures _features {}; //! Optimization flags. - UniOptFlags _opt_flags = UniOptFlags::kNone; + CpuHints _cpu_hints {}; //! Number of available vector registers. uint32_t _vec_reg_count = 0; @@ -323,20 +178,20 @@ public: //! SIMD multiplier, derived from `_vec_width` (1, 2, 4). uint8_t _vec_multiplier = 0; //! SIMD register type (AsmJit). - asmjit::RegType _vec_reg_type = asmjit::RegType::kNone; + RegType _vec_reg_type = RegType::kNone; //! SIMD type id (AsmJit). - asmjit::TypeId _vec_type_id = asmjit::TypeId::kVoid; + TypeId _vec_type_id = TypeId::kVoid; //! Function node. - asmjit::FuncNode* _func_node = nullptr; + FuncNode* _func_node = nullptr; //! Function initialization hook. - asmjit::BaseNode* _func_init = nullptr; + BaseNode* _func_init = nullptr; //! Function end hook (to add 'unlikely' branches). - asmjit::BaseNode* _func_end = nullptr; + BaseNode* _func_end = nullptr; //! Invalid GP register. Gp _gp_none; - //! Temporary stack used to transfer SIMD regs to GP/MM. + //! Temporary stack used to transfer SIMD regs to GP. Mem _tmp_stack[size_t(StackId::kMaxValue) + 1]; //! Offset to the first constant to the `commonTable` global. @@ -359,15 +214,22 @@ public: uint32_t virt_reg_id; }; - asmjit::ArenaVector _vec_consts; - asmjit::ArenaVector _vec_consts_ex; + ArenaVector _vec_consts; + ArenaVector _vec_consts_ex; //! \} //! \name Construction & Destruction //! \{ - ASMJIT_API UniCompiler(BackendCompiler* cc, const CpuFeatures& features, UniOptFlags opt_flags) noexcept; + //! Creates `UniCompiler` that would use the existing BackendCompiler (it would keep the pointer to it). + ASMJIT_API UniCompiler(BackendCompiler* cc, const CpuFeatures& features, CpuHints cpu_hints, VecConstTableRef ct_ref) noexcept; + + //! Creates `UniCompiler` that would use the existing BackendCompiler (it would keep the pointer to it). + ASMJIT_INLINE UniCompiler(BackendCompiler* cc, const CpuFeatures& features, CpuHints cpu_hints) noexcept + : UniCompiler(cc, features, cpu_hints, VecConstTableRef{vec_const_table, sizeof(VecConstTable)}) {} + + //! Destroys `UniCompiler` - the existing BackendCompiler would be untouched. ASMJIT_API ~UniCompiler() noexcept; //! \} @@ -375,22 +237,39 @@ public: //! \name Allocators //! \{ - ASMJIT_INLINE_NODEBUG asmjit::Arena& arena() noexcept { return cc->_builder_arena; } + //! Returns the arena used by `UniCompiler`. + ASMJIT_INLINE_NODEBUG Arena& arena() noexcept { return cc->_builder_arena; } + + //! \} + + //! \name Constant Table + //! \{ + + template + ASMJIT_INLINE_NODEBUG const T& ct() const noexcept { return static_cast(_ct_ref.table); } + + template + ASMJIT_INLINE_NODEBUG const T* ct_ptr() const noexcept { return static_cast(&_ct_ref.table); } + + ASMJIT_INLINE_NODEBUG size_t ct_size() const noexcept { return _ct_ref.size; } //! \} //! \name CPU Architecture, Features and Optimization Options //! 
\{ - ASMJIT_API void _init_extensions(const asmjit::CpuFeatures& features) noexcept; + ASMJIT_API void _init_extensions(const CpuFeatures& features) noexcept; ASMJIT_INLINE_NODEBUG bool is_32bit() const noexcept { return cc->is_32bit(); } ASMJIT_INLINE_NODEBUG bool is_64bit() const noexcept { return cc->is_64bit(); } ASMJIT_INLINE_NODEBUG uint32_t register_size() const noexcept { return cc->register_size(); } #if defined(ASMJIT_UJIT_X86) + //! Tests whether a general purpose extension `ext` is available. ASMJIT_INLINE_NODEBUG bool has_gp_ext(GPExt ext) const noexcept { return (_gp_ext_mask & (1u << uint32_t(ext))) != 0; } + //! Tests whether an SSE extension `ext` is available. ASMJIT_INLINE_NODEBUG bool has_sse_ext(SSEExt ext) const noexcept { return (_sse_ext_mask & (1u << uint32_t(ext))) != 0; } + //! Tests whether an AVX or AVX-512 extension `ext` is available. ASMJIT_INLINE_NODEBUG bool has_avx_ext(AVXExt ext) const noexcept { return (_avx_ext_mask & (uint64_t(1) << uint32_t(ext))) != 0; } //! Tests whether ADX extension is available. @@ -468,7 +347,9 @@ public: #endif // ASMJIT_UJIT_X86 #if defined(ASMJIT_UJIT_AARCH64) + //! Tests whether a general purpose extension `ext` is available. ASMJIT_INLINE_NODEBUG bool has_gp_ext(GPExt ext) const noexcept { return (_gp_ext_mask & (uint64_t(1) << uint32_t(ext))) != 0; } + //! Tests whether an ASIMD extension `ext` is available. ASMJIT_INLINE_NODEBUG bool has_asimd_ext(ASIMDExt ext) const noexcept { return (_asimd_ext_mask & (uint64_t(1) << uint32_t(ext))) != 0; } //! Tests whether CSSC extension is available. @@ -535,9 +416,12 @@ public: //! Returns the behavior of scalar operations (mostly floating point). ASMJIT_INLINE_NODEBUG ScalarOpBehavior scalar_op_behavior() const noexcept { return _scalar_op_behavior; } //! Returns the behavior of floating point min/max operations. - ASMJIT_INLINE_NODEBUG FMinFMaxOpBehavior fmin_fmax_op_hehavior() const noexcept { return _fmin_fmax_op_hehavior; } + ASMJIT_INLINE_NODEBUG FMinFMaxOpBehavior fmin_fmax_op_behavior() const noexcept { return _fmin_fmax_op_behavior; } //! Returns the behavior of floating point mul+add (`madd`) operations. ASMJIT_INLINE_NODEBUG FMAddOpBehavior fmadd_op_behavior() const noexcept { return _fmadd_op_behavior; } + //! Returns the behavior of float-to-integer conversion when the floating point is outside of the integer representable + //! range, infinite, or NaN. + ASMJIT_INLINE_NODEBUG FloatToIntOutsideRangeBehavior float_to_int_outside_range_behavior() const noexcept { return _float_to_int_outside_range_behavior; } //! Tests whether a scalar operation is zeroing the rest of the destination register (AArch64). ASMJIT_INLINE_NODEBUG bool is_scalar_op_zeroing() const noexcept { return _scalar_op_behavior == ScalarOpBehavior::kZeroing; } @@ -545,9 +429,9 @@ public: ASMJIT_INLINE_NODEBUG bool is_scalar_op_preserving_vec128() const noexcept { return _scalar_op_behavior == ScalarOpBehavior::kPreservingVec128; } //! Tests whether a floating point min/max operation selects a finite value if one of the values is NaN (AArch64). - ASMJIT_INLINE_NODEBUG bool is_fmin_fmax_finite() const noexcept { return _fmin_fmax_op_hehavior == FMinFMaxOpBehavior::kFiniteValue; } + ASMJIT_INLINE_NODEBUG bool is_fmin_fmax_finite() const noexcept { return _fmin_fmax_op_behavior == FMinFMaxOpBehavior::kFiniteValue; } //! Tests whether a floating point min/max operation works as a ternary if - `if a <|> b ? a : b` (X86|X86_64). 
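To show how the reworked constructor is typically driven, here is a hypothetical end-to-end wiring through `JitRuntime`; the function name, the omitted body, and the error handling are illustrative, while the constructor signature, `cpu_features()`, and `cpu_hints()` come from this patch. It assumes UJIT is enabled for the target (`ASMJIT_NO_UJIT` not defined).

```cpp
// Sketch: passing both CpuFeatures and CpuHints from the runtime into UniCompiler.
#include <asmjit/core.h>
#include <asmjit/ujit.h>

using namespace asmjit;

static Error build_kernel(JitRuntime& rt) {
  CodeHolder code;
  ASMJIT_PROPAGATE(code.init(rt.environment(), rt.cpu_features()));

  ujit::BackendCompiler cc(&code);

  // The runtime now carries the hints detected for the host CPU next to its features.
  ujit::UniCompiler uc(&cc, rt.cpu_features(), rt.cpu_hints());

  // ... build the function through `uc` (init_function(), emit_*(), etc.) ...

  return cc.finalize();
}
```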
- ASMJIT_INLINE_NODEBUG bool is_fmin_fmax_ternary() const noexcept { return _fmin_fmax_op_hehavior == FMinFMaxOpBehavior::kTernaryLogic; } + ASMJIT_INLINE_NODEBUG bool is_fmin_fmax_ternary() const noexcept { return _fmin_fmax_op_behavior == FMinFMaxOpBehavior::kTernaryLogic; } //! Tests whether a floating point mul+add operation is fused (uses FMA). ASMJIT_INLINE_NODEBUG bool is_fmadd_fused() const noexcept { return _fmadd_op_behavior != FMAddOpBehavior::kNoFMA; } @@ -556,8 +440,10 @@ public: //! Tests whether a FMA operation is available and that it only stores the result to accumulator register. ASMJIT_INLINE_NODEBUG bool is_fma_storing_to_any_accumulator() const noexcept { return _fmadd_op_behavior == FMAddOpBehavior::kFMAStoreToAccumulator; } - ASMJIT_INLINE_NODEBUG UniOptFlags opt_flags() const noexcept { return _opt_flags; } - ASMJIT_INLINE_NODEBUG bool has_opt_flag(UniOptFlags flag) const noexcept { return Support::test(_opt_flags, flag); } + //! Returns CPU hints. + ASMJIT_INLINE_NODEBUG CpuHints cpu_hints() const noexcept { return _cpu_hints; } + //! Tests whether a CPU hint `hint` is enabled. + ASMJIT_INLINE_NODEBUG bool has_cpu_hint(CpuHints hint) const noexcept { return Support::test(_cpu_hints, hint); } //! Returns a native register signature, either 32-bit or 64-bit depending on the target architecture). ASMJIT_INLINE_NODEBUG OperandSignature gp_signature() const noexcept { return cc->gp_signature(); } @@ -599,7 +485,7 @@ public: //! \name Function //! \{ - ASMJIT_API void init_function(asmjit::FuncNode* func_node) noexcept; + ASMJIT_API void init_function(FuncNode* func_node) noexcept; //! \} @@ -607,13 +493,15 @@ public: //! \{ ASMJIT_INLINE void rename(const OpArray& op_array, const char* name) noexcept { - for (uint32_t i = 0; i < op_array.size(); i++) - cc->rename(op_array[i].as(), "%s%u", name, unsigned(i)); + for (uint32_t i = 0; i < op_array.size(); i++) { + cc->rename(op_array[i].as(), "%s%u", name, unsigned(i)); + } } ASMJIT_INLINE void rename(const OpArray& op_array, const char* prefix, const char* name) noexcept { - for (uint32_t i = 0; i < op_array.size(); i++) - cc->rename(op_array[i].as(), "%s%s%u", prefix, name, unsigned(i)); + for (uint32_t i = 0; i < op_array.size(); i++) { + cc->rename(op_array[i].as(), "%s%s%u", prefix, name, unsigned(i)); + } } //! \} @@ -631,84 +519,163 @@ public: //! \name Virtual Registers & Memory (Target Independent) //! \{ - ASMJIT_INLINE Gp new_gp32() noexcept { return cc->new_gp32(); } - ASMJIT_INLINE Gp new_gp64() noexcept { return cc->new_gp64(); } - ASMJIT_INLINE Gp new_gpz() noexcept { return cc->new_gpz(); } - - template - ASMJIT_INLINE Gp new_gp32(const char* name, Args&&... args) noexcept { return cc->new_gp32(name, std::forward(args)...); } - template - ASMJIT_INLINE Gp new_gp64(const char* name, Args&&... args) noexcept { return cc->new_gp64(name, std::forward(args)...); } - template - ASMJIT_INLINE Gp new_gpz(const char* name, Args&&... args) noexcept { return cc->new_gpz(name, std::forward(args)...); } - - template - ASMJIT_INLINE RegT new_similar_reg(const RegT& ref) noexcept { return cc->new_similar_reg(ref); } + //! Wraps `BackendCompiler::new_reg(type_id, args...)`. template - ASMJIT_INLINE RegT new_similar_reg(const RegT& ref, Args&&... args) noexcept { return cc->new_similar_reg(ref, std::forward(args)...); } + ASMJIT_INLINE RegT new_reg(TypeId type_id, Args&&... args) noexcept { + return cc->new_reg(type_id, std::forward(args)...); + } + //! Wraps `BackendCompiler::new_similar_reg(ref, args...)`. 
+ template + ASMJIT_INLINE RegT new_similar_reg(const RegT& ref, Args&&... args) noexcept { + return cc->new_similar_reg(ref, std::forward(args)...); + } + + //! Wraps `BackendCompiler::new_gp32(args...)`. template - ASMJIT_INLINE Vec new_vec(const char* name, Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), _vec_type_id, name, std::forward(args)...); - return reg; + ASMJIT_INLINE Gp new_gp32(Args&&... args) noexcept { + return cc->new_gp32(std::forward(args)...); + } + + //! Wraps `BackendCompiler::new_gp64(args...)`. + template + ASMJIT_INLINE Gp new_gp64(Args&&... args) noexcept { + return cc->new_gp64(std::forward(args)...); + } + + //! Wraps `BackendCompiler::new_gpz(args...)`. + template + ASMJIT_INLINE Gp new_gpz(Args&&... args) noexcept { + return cc->new_gpz(std::forward(args)...); + } + + //! Wraps `BackendCompiler::new_gpz(args...)`. + template + ASMJIT_INLINE Gp new_gp_ptr(Args&&... args) noexcept { + return cc->new_gp_ptr(std::forward(args)...); } template - ASMJIT_INLINE Vec new_vec(VecWidth vw, const char* name, Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), VecWidthUtils::type_id_of(vw), name, std::forward(args)...); - return reg; + ASMJIT_INLINE Vec new_vec(Args&&... args) noexcept { + return cc->new_vec(_vec_type_id, std::forward(args)...); } - ASMJIT_NOINLINE void new_reg_array(OpArray& dst, uint32_t n, asmjit::TypeId type_id, const char* name) noexcept { + template + ASMJIT_INLINE Vec new_vec_with_width(VecWidth vw, Args&&... args) noexcept { + return cc->new_reg(VecWidthUtils::type_id_of(vw), std::forward(args)...); + } + + template + ASMJIT_INLINE Vec new_vec128(Args&&... args) noexcept { + return cc->new_vec128(std::forward(args)...); + } + + template + ASMJIT_INLINE Vec new_vec128_f32x1(Args&&... args) noexcept { + return cc->new_vec128_f32x1(std::forward(args)...); + } + + template + ASMJIT_INLINE Vec new_vec128_f64x1(Args&&... args) noexcept { + return cc->new_vec128_f64x1(std::forward(args)...); + } + + template + ASMJIT_INLINE Vec new_vec128_f32x4(Args&&... args) noexcept { + return cc->new_vec128_f32x4(std::forward(args)...); + } + + template + ASMJIT_INLINE Vec new_vec128_f64x2(Args&&... args) noexcept { + return cc->new_vec128_f64x2(std::forward(args)...); + } + +#if defined(ASMJIT_UJIT_X86) + template + ASMJIT_INLINE Vec new_vec256(Args&&... args) noexcept { + return cc->new_vec256(std::forward(args)...); + } + + template + ASMJIT_INLINE Vec new_vec512(Args&&... 
args) noexcept { + return cc->new_vec512(std::forward(args)...); + } +#endif // ASMJIT_UJIT_X86 + + ASMJIT_NOINLINE void new_reg_array(OpArray& dst, size_t n, TypeId type_id, const char* name) noexcept { ASMJIT_ASSERT(n <= OpArray::kMaxSize); dst._size = n; - for (uint32_t i = 0; i < n; i++) { - cc->_new_reg_fmt(Out(dst[i].as()), type_id, "%s%u", name, i); + for (size_t i = 0; i < n; i++) { + cc->_new_reg(Out(dst[i].as()), type_id, "%s%u", name, i); } } - ASMJIT_NOINLINE void new_reg_array(OpArray& dst, uint32_t n, asmjit::TypeId type_id, const char* prefix, const char* name) noexcept { + ASMJIT_NOINLINE void new_reg_array(OpArray& dst, size_t n, TypeId type_id, const char* prefix, const char* name) noexcept { ASMJIT_ASSERT(n <= OpArray::kMaxSize); dst._size = n; - for (uint32_t i = 0; i < n; i++) { - cc->_new_reg_fmt(Out(dst[i].as()), type_id, "%s%s%u", prefix, name, i); + for (size_t i = 0; i < n; i++) { + cc->_new_reg(Out(dst[i].as()), type_id, "%s%s%u", prefix, name, i); } } - ASMJIT_NOINLINE void new_reg_array(OpArray& dst, uint32_t n, const asmjit::Reg& ref, const char* name) noexcept { + ASMJIT_NOINLINE void new_reg_array(OpArray& dst, size_t n, const Reg& ref, const char* name) noexcept { ASMJIT_ASSERT(n <= OpArray::kMaxSize); dst._size = n; - for (uint32_t i = 0; i < n; i++) { - cc->_new_reg_fmt(Out(dst[i].as()), ref, "%s%u", name, i); + for (size_t i = 0; i < n; i++) { + cc->_new_reg(Out(dst[i].as()), ref, "%s%u", name, i); } } - ASMJIT_NOINLINE void new_reg_array(OpArray& dst, uint32_t n, const asmjit::Reg& ref, const char* prefix, const char* name) noexcept { + ASMJIT_NOINLINE void new_reg_array(OpArray& dst, size_t n, const Reg& ref, const char* prefix, const char* name) noexcept { ASMJIT_ASSERT(n <= OpArray::kMaxSize); dst._size = n; - for (uint32_t i = 0; i < n; i++) { - cc->_new_reg_fmt(Out(dst[i].as()), ref, "%s%s%u", prefix, name, i); + for (size_t i = 0; i < n; i++) { + cc->_new_reg(Out(dst[i].as()), ref, "%s%s%u", prefix, name, i); } } - ASMJIT_INLINE void new_vec_array(OpArray& dst, uint32_t n, VecWidth vw, const char* name) noexcept { + ASMJIT_INLINE void new_vec_array(OpArray& dst, size_t n, VecWidth vw, const char* name) noexcept { new_reg_array(dst, n, VecWidthUtils::type_id_of(vw), name); } - ASMJIT_INLINE void new_vec_array(OpArray& dst, uint32_t n, VecWidth vw, const char* prefix, const char* name) noexcept { + ASMJIT_INLINE void new_vec_array(OpArray& dst, size_t n, VecWidth vw, const char* prefix, const char* name) noexcept { new_reg_array(dst, n, VecWidthUtils::type_id_of(vw), prefix, name); } - ASMJIT_INLINE void new_vec_array(OpArray& dst, uint32_t n, const Vec& ref, const char* name) noexcept { + ASMJIT_INLINE void new_vec_array(OpArray& dst, size_t n, const Vec& ref, const char* name) noexcept { new_reg_array(dst, n, ref, name); } - ASMJIT_INLINE void new_vec_array(OpArray& dst, uint32_t n, const Vec& ref, const char* prefix, const char* name) noexcept { + ASMJIT_INLINE void new_vec_array(OpArray& dst, size_t n, const Vec& ref, const char* prefix, const char* name) noexcept { new_reg_array(dst, n, ref, prefix, name); } + ASMJIT_INLINE void new_vec128_array(OpArray& dst, size_t n, const char* name) noexcept { + new_reg_array(dst, n, TypeId::kInt32x4, name); + } + + ASMJIT_INLINE void new_vec128_array(OpArray& dst, size_t n, const char* prefix, const char* name) noexcept { + new_reg_array(dst, n, TypeId::kInt32x4, prefix, name); + } + +#if defined(ASMJIT_UJIT_X86) + ASMJIT_INLINE void new_vec256_array(OpArray& dst, size_t n, const char* name) noexcept { + 
new_reg_array(dst, n, TypeId::kInt32x8, name); + } + + ASMJIT_INLINE void new_vec256_array(OpArray& dst, size_t n, const char* prefix, const char* name) noexcept { + new_reg_array(dst, n, TypeId::kInt32x8, prefix, name); + } + + ASMJIT_INLINE void new_vec512_array(OpArray& dst, size_t n, const char* name) noexcept { + new_reg_array(dst, n, TypeId::kInt32x16, name); + } + + ASMJIT_INLINE void new_vec512_array(OpArray& dst, size_t n, const char* prefix, const char* name) noexcept { + new_reg_array(dst, n, TypeId::kInt32x16, prefix, name); + } +#endif // ASMJIT_UJIT_X86 + ASMJIT_API Mem tmp_stack(StackId id, uint32_t size) noexcept; //! \} @@ -722,165 +689,6 @@ public: ASMJIT_API void _init_vec_const_table_ptr() noexcept; - //! \name Virtual Registers - //! \{ - -#if defined(ASMJIT_UJIT_X86) - - ASMJIT_INLINE Vec new_vec128() noexcept { - Vec reg; - cc->_new_reg(Out(reg), asmjit::TypeId::kInt32x4); - return reg; - } - - ASMJIT_INLINE Vec new_vec128_1xf32() noexcept { - Vec reg; - cc->_new_reg(Out(reg), asmjit::TypeId::kFloat32x1); - return reg; - } - - ASMJIT_INLINE Vec new_vec128_1xf64() noexcept { - Vec reg; - cc->_new_reg(Out(reg), asmjit::TypeId::kFloat64x1); - return reg; - } - - ASMJIT_INLINE Vec new_vec128_4xf32() noexcept { - Vec reg; - cc->_new_reg(Out(reg), asmjit::TypeId::kFloat32x4); - return reg; - } - - ASMJIT_INLINE Vec new_vec128_2xf64() noexcept { - Vec reg; - cc->_new_reg(Out(reg), asmjit::TypeId::kFloat64x2); - return reg; - } - - template - ASMJIT_INLINE Vec new_vec128(Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kInt32x4, std::forward(args)...); - return reg; - } - - template - ASMJIT_INLINE Vec new_vec128_1xf32(Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kFloat32x1, std::forward(args)...); - return reg; - } - - template - ASMJIT_INLINE Vec new_vec128_1xf64(Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kFloat64x1, std::forward(args)...); - return reg; - } - - template - ASMJIT_INLINE Vec new_vec128_4xf32(Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kFloat32x4, std::forward(args)...); - return reg; - } - - template - ASMJIT_INLINE Vec new_vec128_2xf64(Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kFloat64x2, std::forward(args)...); - return reg; - } - - ASMJIT_INLINE void new_vec128_array(OpArray& dst, uint32_t n, const char* name) noexcept { - new_reg_array(dst, n, asmjit::TypeId::kInt32x4, name); - } - - ASMJIT_INLINE void new_vec128_array(OpArray& dst, uint32_t n, const char* prefix, const char* name) noexcept { - new_reg_array(dst, n, asmjit::TypeId::kInt32x4, prefix, name); - } - - template - ASMJIT_INLINE Vec new_vec256(const char* name, Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kInt32x8, name, std::forward(args)...); - return reg; - } - - ASMJIT_INLINE void new_vec256_array(OpArray& dst, uint32_t n, const char* name) noexcept { - new_reg_array(dst, n, asmjit::TypeId::kInt32x8, name); - } - - ASMJIT_INLINE void new_vec256_array(OpArray& dst, uint32_t n, const char* prefix, const char* name) noexcept { - new_reg_array(dst, n, asmjit::TypeId::kInt32x8, prefix, name); - } - - template - ASMJIT_INLINE Vec new_vec512(const char* name, Args&&... 
args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kInt32x16, name, std::forward(args)...); - return reg; - } - - ASMJIT_INLINE void new_vec512_array(OpArray& dst, uint32_t n, const char* name) noexcept { - new_reg_array(dst, n, asmjit::TypeId::kInt32x16, name); - } - - ASMJIT_INLINE void new_vec512_array(OpArray& dst, uint32_t n, const char* prefix, const char* name) noexcept { - new_reg_array(dst, n, asmjit::TypeId::kInt32x16, prefix, name); - } - -#endif // ASMJIT_UJIT_X86 - -#if defined(ASMJIT_UJIT_AARCH64) - - template - ASMJIT_INLINE Vec new_vec128(const char* name, Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kInt32x4, name, std::forward(args)...); - return reg; - } - - template - ASMJIT_INLINE Vec new_vec128_1xf32(const char* name, Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kFloat32x1, name, std::forward(args)...); - return reg.v128(); - } - - template - ASMJIT_INLINE Vec new_vec128_1xf64(const char* name, Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kFloat64x1, name, std::forward(args)...); - return reg.v128(); - } - - template - ASMJIT_INLINE Vec new_vec128_4xf32(const char* name, Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kFloat32x4, name, std::forward(args)...); - return reg; - } - - template - ASMJIT_INLINE Vec new_vec128_2xf64(const char* name, Args&&... args) noexcept { - Vec reg; - cc->_new_reg_fmt(Out(reg), asmjit::TypeId::kFloat64x2, name, std::forward(args)...); - return reg; - } - - ASMJIT_INLINE void new_vec128_array(OpArray& dst, uint32_t n, const char* name) noexcept { - new_reg_array(dst, n, asmjit::TypeId::kInt32x4, name); - } - - ASMJIT_INLINE void new_vec128_array(OpArray& dst, uint32_t n, const char* prefix, const char* name) noexcept { - new_reg_array(dst, n, asmjit::TypeId::kInt32x4, prefix, name); - } - -#endif - - //! \} - //! \name Constants (X86|X86_64) //! \{ @@ -901,14 +709,14 @@ public: ASMJIT_API Mem simd_mem_const(const void* c, Bcst bcst_width, const VecArray& similar_to) noexcept; ASMJIT_API Mem _get_mem_const(const void* c) noexcept; - ASMJIT_API Vec _new_vecConst(const void* c, bool is_unique_const) noexcept; + ASMJIT_API Vec _new_vec_const(const void* c, bool is_unique_const) noexcept; #if defined(ASMJIT_UJIT_AARCH64) ASMJIT_API Vec simd_const_16b(const void* data16) noexcept; #endif // ASMJIT_UJIT_AARCH64 #if defined(ASMJIT_UJIT_AARCH64) - inline Vec simd_vec_zero(const Vec& similar_to) noexcept { return simd_vec_const(&ct.p_0000000000000000, Bcst::k32, similar_to); } + inline Vec simd_vec_zero(const Vec& similar_to) noexcept { return simd_vec_const(&ct().p_0000000000000000, Bcst::k32, similar_to); } #endif // ASMJIT_UJIT_AARCH64 //! 
\} @@ -920,12 +728,12 @@ public: ASMJIT_API void emit_m(UniOpM op, const Mem& m) noexcept; ASMJIT_API void emit_rm(UniOpRM op, const Gp& dst, const Mem& src) noexcept; ASMJIT_API void emit_mr(UniOpMR op, const Mem& dst, const Gp& src) noexcept; - ASMJIT_API void emit_cmov(const Gp& dst, const Operand_& sel, const Condition& condition) noexcept; - ASMJIT_API void emit_select(const Gp& dst, const Operand_& sel1_, const Operand_& sel2_, const Condition& condition) noexcept; + ASMJIT_API void emit_cmov(const Gp& dst, const Operand_& sel, const UniCondition& condition) noexcept; + ASMJIT_API void emit_select(const Gp& dst, const Operand_& sel1_, const Operand_& sel2_, const UniCondition& condition) noexcept; ASMJIT_API void emit_2i(UniOpRR op, const Gp& dst, const Operand_& src_) noexcept; ASMJIT_API void emit_3i(UniOpRRR op, const Gp& dst, const Operand_& src1_, const Operand_& src2_) noexcept; ASMJIT_API void emit_j(const Operand_& target) noexcept; - ASMJIT_API void emit_j_if(const Label& target, const Condition& condition) noexcept; + ASMJIT_API void emit_j_if(const Label& target, const UniCondition& condition) noexcept; ASMJIT_INLINE void mov(const Gp& dst, const Gp& src) noexcept { return emit_mov(dst, src); } ASMJIT_INLINE void mov(const Gp& dst, const Imm& src) noexcept { return emit_mov(dst, src); } @@ -963,11 +771,11 @@ public: ASMJIT_INLINE void mem_add_u32(const Mem& dst, const Gp& src) noexcept { return emit_mr(UniOpMR::kAddU32, dst, src); } ASMJIT_INLINE void mem_add_u64(const Mem& dst, const Gp& src) noexcept { return emit_mr(UniOpMR::kAddU64, dst, src); } - ASMJIT_INLINE void cmov(const Gp& dst, const Gp& sel, const Condition& condition) noexcept { emit_cmov(dst, sel, condition); } - ASMJIT_INLINE void cmov(const Gp& dst, const Mem& sel, const Condition& condition) noexcept { emit_cmov(dst, sel, condition); } + ASMJIT_INLINE void cmov(const Gp& dst, const Gp& sel, const UniCondition& condition) noexcept { emit_cmov(dst, sel, condition); } + ASMJIT_INLINE void cmov(const Gp& dst, const Mem& sel, const UniCondition& condition) noexcept { emit_cmov(dst, sel, condition); } template - ASMJIT_INLINE void select(const Gp& dst, const Sel1& sel1, const Sel2& sel2, const Condition& condition) noexcept { emit_select(dst, sel1, sel2, condition); } + ASMJIT_INLINE void select(const Gp& dst, const Sel1& sel1, const Sel2& sel2, const UniCondition& condition) noexcept { emit_select(dst, sel1, sel2, condition); } ASMJIT_INLINE void abs(const Gp& dst, const Gp& src) noexcept { emit_2i(UniOpRR::kAbs, dst, src); } ASMJIT_INLINE void abs(const Gp& dst, const Mem& src) noexcept { emit_2i(UniOpRR::kAbs, dst, src); } @@ -1101,7 +909,7 @@ public: ASMJIT_INLINE void j(const Gp& target) noexcept { emit_j(target); } ASMJIT_INLINE void j(const Label& target) noexcept { emit_j(target); } - ASMJIT_INLINE void j(const Label& target, const Condition& condition) noexcept { emit_j_if(target, condition); } + ASMJIT_INLINE void j(const Label& target, const UniCondition& condition) noexcept { emit_j_if(target, condition); } ASMJIT_API void adds_u8(const Gp& dst, const Gp& src1, const Gp& src2) noexcept; @@ -1283,8 +1091,12 @@ public: DEFINE_OP_2V(v_cvt_i32_hi_to_i64, UniOpVV::kCvtI32HiToI64) DEFINE_OP_2V(v_cvt_u32_lo_to_u64, UniOpVV::kCvtU32LoToU64) DEFINE_OP_2V(v_cvt_u32_hi_to_u64, UniOpVV::kCvtU32HiToU64) + DEFINE_OP_2V(s_abs_f32, UniOpVV::kAbsF32S) + DEFINE_OP_2V(s_abs_f64, UniOpVV::kAbsF64S) DEFINE_OP_2V(v_abs_f32, UniOpVV::kAbsF32) DEFINE_OP_2V(v_abs_f64, UniOpVV::kAbsF64) + DEFINE_OP_2V(s_neg_f32, 
UniOpVV::kNegF32S) + DEFINE_OP_2V(s_neg_f64, UniOpVV::kNegF64S) DEFINE_OP_2V(v_neg_f32, UniOpVV::kNegF32) DEFINE_OP_2V(v_neg_f64, UniOpVV::kNegF64) DEFINE_OP_2V(v_not_f32, UniOpVV::kNotF32) @@ -1301,10 +1113,18 @@ public: DEFINE_OP_2V(s_ceil_f64, UniOpVV::kCeilF64S) DEFINE_OP_2V(v_ceil_f32, UniOpVV::kCeilF32) DEFINE_OP_2V(v_ceil_f64, UniOpVV::kCeilF64) - DEFINE_OP_2V(s_round_f32, UniOpVV::kRoundF32S) - DEFINE_OP_2V(s_round_f64, UniOpVV::kRoundF64S) - DEFINE_OP_2V(v_round_f32, UniOpVV::kRoundF32) - DEFINE_OP_2V(v_round_f64, UniOpVV::kRoundF64) + DEFINE_OP_2V(s_round_even_f32, UniOpVV::kRoundEvenF32S) + DEFINE_OP_2V(s_round_even_f64, UniOpVV::kRoundEvenF64S) + DEFINE_OP_2V(v_round_even_f32, UniOpVV::kRoundEvenF32) + DEFINE_OP_2V(v_round_even_f64, UniOpVV::kRoundEvenF64) + DEFINE_OP_2V(s_round_half_away_f32, UniOpVV::kRoundHalfAwayF32S) + DEFINE_OP_2V(s_round_half_away_f64, UniOpVV::kRoundHalfAwayF64S) + DEFINE_OP_2V(v_round_half_away_f32, UniOpVV::kRoundHalfAwayF32) + DEFINE_OP_2V(v_round_half_away_f64, UniOpVV::kRoundHalfAwayF64) + DEFINE_OP_2V(s_round_half_up_f32, UniOpVV::kRoundHalfUpF32S) + DEFINE_OP_2V(s_round_half_up_f64, UniOpVV::kRoundHalfUpF64S) + DEFINE_OP_2V(v_round_half_up_f32, UniOpVV::kRoundHalfUpF32) + DEFINE_OP_2V(v_round_half_up_f64, UniOpVV::kRoundHalfUpF64) DEFINE_OP_2V(v_rcp_f32, UniOpVV::kRcpF32) DEFINE_OP_2V(v_rcp_f64, UniOpVV::kRcpF64) DEFINE_OP_2V(s_sqrt_f32, UniOpVV::kSqrtF32S) @@ -1743,6 +1563,10 @@ public: DEFINE_OP_3V(s_div_f64, UniOpVVV::kDivF64S) DEFINE_OP_3V(v_div_f32, UniOpVVV::kDivF32) DEFINE_OP_3V(v_div_f64, UniOpVVV::kDivF64) + DEFINE_OP_3V(s_mod_f32, UniOpVVV::kModF32S) + DEFINE_OP_3V(s_mod_f64, UniOpVVV::kModF64S) + DEFINE_OP_3V(v_mod_f32, UniOpVVV::kModF32) + DEFINE_OP_3V(v_mod_f64, UniOpVVV::kModF64) DEFINE_OP_3V(s_min_f32, UniOpVVV::kMinF32S) DEFINE_OP_3V(s_min_f64, UniOpVVV::kMinF64S) DEFINE_OP_3V(v_min_f32, UniOpVVV::kMinF32) @@ -1856,7 +1680,6 @@ public: DEFINE_OP_3VI(v_insert_v256_u64, UniOpVVVI::kInsertV256_U64) DEFINE_OP_3VI(v_insert_v256_f64, UniOpVVVI::kInsertV256_F64) - DEFINE_OP_4V(v_blendv_u8, UniOpVVVV::kBlendV_U8) DEFINE_OP_4V(v_madd_i16, UniOpVVVV::kMAddU16) DEFINE_OP_4V(v_madd_u16, UniOpVVVV::kMAddU16) @@ -1944,7 +1767,7 @@ public: //! \name Memory Loads & Stores with Parameterized Size //! \{ - ASMJIT_NOINLINE void v_load_iany(const Vec& dst, const Mem& src, uint32_t n_bytes, Alignment alignment) noexcept { + ASMJIT_NOINLINE void v_load_iany(const Vec& dst, const Mem& src, size_t n_bytes, Alignment alignment) noexcept { switch (n_bytes) { case 1: v_load8(dst, src); break; case 2: v_loada16(dst, src, alignment); break; @@ -1959,7 +1782,7 @@ public: } } - ASMJIT_NOINLINE void v_store_iany(const Mem& dst, const Vec& src, uint32_t n_bytes, Alignment alignment) noexcept { + ASMJIT_NOINLINE void v_store_iany(const Mem& dst, const Vec& src, size_t n_bytes, Alignment alignment) noexcept { switch (n_bytes) { case 1: v_store8(dst, src); break; case 2: v_storea16(dst, src, alignment); break; @@ -2005,27 +1828,6 @@ public: #endif } - // d = int(floor(a / b) * b). - template - ASMJIT_NOINLINE void v_mod_pd(const Vec& d, const Vec& a, const VecOrMem& b) noexcept { -#if defined(ASMJIT_UJIT_X86) - if (!has_sse4_1()) { - Vec t = new_vec128("vModTmp"); - - v_div_f64(d, a, b); - v_cvt_trunc_f64_to_i32_lo(t, d); - v_cvt_i32_lo_to_f64(d, t); - v_mul_f64(d, d, b); - } - else -#endif // ASMJIT_UJIT_X86 - { - v_div_f64(d, a, b); - v_trunc_f64(d, d); - v_mul_f64(d, d, b); - } - } - //! 
\} }; diff --git a/src/asmjit/ujit/unicompiler_a64.cpp b/src/asmjit/ujit/unicompiler_a64.cpp index b68b3de..292ef0d 100644 --- a/src/asmjit/ujit/unicompiler_a64.cpp +++ b/src/asmjit/ujit/unicompiler_a64.cpp @@ -9,6 +9,8 @@ #if defined(ASMJIT_UJIT_AARCH64) #include "unicompiler.h" +#include "unicompiler_utils_p.h" +#include "unicondition.h" ASMJIT_BEGIN_SUB_NAMESPACE(ujit) @@ -19,17 +21,19 @@ namespace Inst { using namespace a64::Inst; } // ujit::UniCompiler - Construction & Destruction // ============================================== -UniCompiler::UniCompiler(BackendCompiler* cc, const CpuFeatures& features, UniOptFlags opt_flags) noexcept +UniCompiler::UniCompiler(BackendCompiler* cc, const CpuFeatures& features, CpuHints cpu_hints, VecConstTableRef ct_ref) noexcept : cc(cc), - ct(vec_const_table), + _ct_ref(ct_ref), _features(features), - _opt_flags(opt_flags), + _cpu_hints(cpu_hints), _vec_reg_count(32), _common_table_offset(0) { _scalar_op_behavior = ScalarOpBehavior::kZeroing; - _fmin_fmax_op_hehavior = FMinFMaxOpBehavior::kFiniteValue; + _fmin_fmax_op_behavior = FMinFMaxOpBehavior::kFiniteValue; _fmadd_op_behavior = FMAddOpBehavior::kFMAStoreToAccumulator; + _float_to_int_outside_range_behavior = FloatToIntOutsideRangeBehavior::kSaturatedValue; + _init_extensions(features); } @@ -38,7 +42,7 @@ UniCompiler::~UniCompiler() noexcept {} // ujit::UniCompiler - CPU Architecture, Features and Optimization Options // ======================================================================= -void UniCompiler::_init_extensions(const asmjit::CpuFeatures& features) noexcept { +void UniCompiler::_init_extensions(const CpuFeatures& features) noexcept { uint64_t gp_ext_mask = 0; uint64_t asimd_ext_mask = 0; @@ -85,16 +89,16 @@ void UniCompiler::init_vec_width(VecWidth vw) noexcept { _vec_width = VecWidth::k128; _vec_reg_type = RegType::kVec128; - _vec_type_id = asmjit::TypeId::kInt32x4; + _vec_type_id = TypeId::kInt32x4; _vec_multiplier = 1u; } bool UniCompiler::has_masked_access_of(uint32_t data_size) const noexcept { switch (data_size) { - case 1: return has_opt_flag(UniOptFlags::kMaskOps8Bit); - case 2: return has_opt_flag(UniOptFlags::kMaskOps16Bit); - case 4: return has_opt_flag(UniOptFlags::kMaskOps32Bit); - case 8: return has_opt_flag(UniOptFlags::kMaskOps64Bit); + case 1: return has_cpu_hint(CpuHints::kVecMaskedOps8); + case 2: return has_cpu_hint(CpuHints::kVecMaskedOps16); + case 4: return has_cpu_hint(CpuHints::kVecMaskedOps32); + case 8: return has_cpu_hint(CpuHints::kVecMaskedOps64); default: return false; @@ -104,7 +108,7 @@ bool UniCompiler::has_masked_access_of(uint32_t data_size) const noexcept { // ujit::UniCompiler - Function // ============================ -void UniCompiler::init_function(asmjit::FuncNode* func_node) noexcept { +void UniCompiler::init_function(FuncNode* func_node) noexcept { cc->add_func(func_node); _func_node = func_node; @@ -116,7 +120,7 @@ void UniCompiler::init_function(asmjit::FuncNode* func_node) noexcept { // ============================= void UniCompiler::_init_vec_const_table_ptr() noexcept { - const void* global = &vec_const_table; + const void* global = ct_ptr(); if (!_common_table_ptr.is_valid()) { ScopedInjector injector(cc, &_func_init); @@ -152,7 +156,7 @@ Vec UniCompiler::simd_vec_const(const void* c, Bcst bcst_width, VecWidth const_w } } - return Vec(OperandSignature{RegTraits::kSignature}, _new_vecConst(c, true).id()); + return Vec(OperandSignature{RegTraits::kSignature}, _new_vec_const(c, true).id()); } Vec UniCompiler::simd_vec_const(const 
void* c, Bcst bcst_width, const Vec& similar_to) noexcept { @@ -181,39 +185,39 @@ Mem UniCompiler::simd_mem_const(const void* c, Bcst bcst_width, const VecArray& } Mem UniCompiler::_get_mem_const(const void* c) noexcept { - // Make sure we are addressing a constant from the `vec_const_table` constant pool. - const void* global = &vec_const_table; - ASMJIT_ASSERT((uintptr_t)c >= (uintptr_t)global && - (uintptr_t)c < (uintptr_t)global + sizeof(VecConstTable)); + // Make sure we are addressing a constant from the `ct` constant pool. + const void* ct_addr = ct_ptr(); + ASMJIT_ASSERT((uintptr_t)c >= (uintptr_t)ct_addr && + (uintptr_t)c < (uintptr_t)ct_addr + _ct_ref.size); - // One GP register is sacrificed to hold the pointer to the `vec_const_table`. + // One GP register is sacrificed to hold the pointer to the `ct`. _init_vec_const_table_ptr(); - int32_t disp = int32_t((intptr_t)c - (intptr_t)global); + int32_t disp = int32_t((intptr_t)c - (intptr_t)ct_addr); return mem_ptr(_common_table_ptr, disp - _common_table_offset); } -Vec UniCompiler::_new_vecConst(const void* c, bool is_unique_const) noexcept { +Vec UniCompiler::_new_vec_const(const void* c, bool is_unique_const) noexcept { Support::maybe_unused(is_unique_const); Vec vec; const char* special_const_name = nullptr; if (special_const_name) { - vec = new_vec(vec_width(), special_const_name); + vec = new_vec_with_width(vec_width(), special_const_name); } else { uint64_t u0 = static_cast(c)[0]; uint64_t u1 = static_cast(c)[1]; if (u0 != u1) - vec = new_vec(vec_width(), "c_0x%016llX%016llX", (unsigned long long)u1, (unsigned long long)u0); + vec = new_vec_with_width(vec_width(), "c_0x%016llX%016llX", (unsigned long long)u1, (unsigned long long)u0); else if ((u0 >> 32) != (u0 & 0xFFFFFFFFu)) - vec = new_vec(vec_width(), "c_0x%016llX", (unsigned long long)u0); + vec = new_vec_with_width(vec_width(), "c_0x%016llX", (unsigned long long)u0); else if (((u0 >> 16) & 0xFFFFu) != (u0 & 0xFFFFu)) - vec = new_vec(vec_width(), "c_0x%08X", (unsigned)(u0 & 0xFFFFFFFFu)); + vec = new_vec_with_width(vec_width(), "c_0x%08X", (unsigned)(u0 & 0xFFFFFFFFu)); else - vec = new_vec(vec_width(), "c_0x%04X", (unsigned)(u0 & 0xFFFFu)); + vec = new_vec_with_width(vec_width(), "c_0x%04X", (unsigned)(u0 & 0xFFFFu)); } VecConstData const_data; @@ -221,7 +225,7 @@ Vec UniCompiler::_new_vecConst(const void* c, bool is_unique_const) noexcept { const_data.virt_reg_id = vec.id(); _vec_consts.append(arena(), const_data); - if (c == &ct.p_0000000000000000) { + if (c == &ct().p_0000000000000000) { ScopedInjector inject(cc, &_func_init); v_zero_i(vec.v128()); } @@ -245,14 +249,14 @@ Vec UniCompiler::simd_const_16b(const void* data16) noexcept { } } - Vec vec = new_vec(VecWidth::k128, "const"); + Vec vec = new_vec128("const"); VecConstDataEx entry; memcpy(entry.data, data16, 16); entry.virt_reg_id = vec.id(); _vec_consts_ex.append(arena(), entry); - Mem mem = cc->new_const(asmjit::ConstPoolScope::kLocal, data16, 16); + Mem mem = cc->new_const(ConstPoolScope::kLocal, data16, 16); { ScopedInjector inject(cc, &_func_init); v_loadavec(vec, mem); @@ -302,8 +306,8 @@ struct MemInst { uint16_t mem_size; }; -static ASMJIT_NOINLINE void gp_emit_mem_op(UniCompiler* pc, Gp r, Mem m, MemInst ii) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void gp_emit_mem_op(UniCompiler& uc, Gp r, Mem m, MemInst ii) noexcept { + BackendCompiler* cc = uc.cc; InstId inst_id = ii.inst_id; if (m.has_index() && m.has_shift()) { @@ -313,7 +317,7 @@ static ASMJIT_NOINLINE void 
gp_emit_mem_op(UniCompiler* pc, Gp r, Mem m, MemInst uint32_t shift = m.shift(); if (mem_size != (1u << shift)) { - Gp tmp = pc->new_gpz("@mem_addr"); + Gp tmp = uc.new_gpz("@mem_addr"); cc->add(tmp, m.base_reg().as(), m.index_reg().as(), a64::Shift(m.shift_op(), shift)); m = a64::ptr(tmp); } @@ -328,7 +332,7 @@ static ASMJIT_INLINE const Gp& gp_zero_as(const Gp& ref) noexcept { return gp_zero_regs[size_t(ref.is_gp64())]; } -static ASMJIT_NOINLINE Gp gp_force_reg(UniCompiler* pc, const Operand_& op, const Gp& ref) noexcept { +static ASMJIT_NOINLINE Gp gp_force_reg(UniCompiler& uc, const Operand_& op, const Gp& ref) noexcept { ASMJIT_ASSERT(op.is_gp() || op.is_mem() || op.is_imm()); Gp reg; @@ -343,11 +347,11 @@ static ASMJIT_NOINLINE Gp gp_force_reg(UniCompiler* pc, const Operand_& op, cons return gp_zero_as(ref); } - BackendCompiler* cc = pc->cc; - reg = pc->new_similar_reg(ref, "@tmp"); + BackendCompiler* cc = uc.cc; + reg = uc.new_similar_reg(ref, "@tmp"); if (op.is_mem()) { - gp_emit_mem_op(pc, reg, op.as(), MemInst{uint16_t(Inst::kIdLdr), uint16_t(reg.size())}); + gp_emit_mem_op(uc, reg, op.as(), MemInst{uint16_t(Inst::kIdLdr), uint16_t(reg.size())}); } else { cc->mov(reg, op.as()); @@ -375,15 +379,15 @@ static constexpr ConditionOpInfo condition_op_info[size_t(UniOpCond::kMaxValue) { Inst::kIdCmp , 0 } // UniOpCond::kCompare }; -class ConditionApplier : public Condition { +class ConditionApplier : public UniCondition { public: - ASMJIT_INLINE ConditionApplier(const Condition& condition) noexcept : Condition(condition) { + ASMJIT_INLINE ConditionApplier(const UniCondition& condition) noexcept : UniCondition(condition) { // The first operand must always be a register. ASMJIT_ASSERT(a.is_gp()); } - ASMJIT_NOINLINE void optimize(UniCompiler* pc) noexcept { - Support::maybe_unused(pc); + ASMJIT_NOINLINE void optimize(UniCompiler& uc) noexcept { + Support::maybe_unused(uc); switch (op) { case UniOpCond::kCompare: @@ -412,8 +416,8 @@ public: cond = a64::reverse_cond(cond); } - ASMJIT_NOINLINE void emit(UniCompiler* pc) noexcept { - BackendCompiler* cc = pc->cc; + ASMJIT_NOINLINE void emit(UniCompiler& uc) noexcept { + BackendCompiler* cc = uc.cc; ConditionOpInfo info = condition_op_info[size_t(op)]; Gp aGp = a.as(); @@ -424,7 +428,7 @@ public: cc->emit(info.inst_id, aGp, aGp, b.as()); } else { - cc->emit(info.inst_id, aGp, aGp, gp_force_reg(pc, b, aGp)); + cc->emit(info.inst_id, aGp, aGp, gp_force_reg(uc, b, aGp)); } return; } @@ -435,7 +439,7 @@ public: cc->emit(info.inst_id, aGp, aGp, b.as()); } else { - cc->emit(info.inst_id, aGp, aGp, gp_force_reg(pc, b, aGp)); + cc->emit(info.inst_id, aGp, aGp, gp_force_reg(uc, b, aGp)); } return; } @@ -457,7 +461,7 @@ public: cc->tst(aGp, aGp); } else { - cc->emit(info.inst_id, aGp, aGp, gp_force_reg(pc, b, aGp)); + cc->emit(info.inst_id, aGp, aGp, gp_force_reg(uc, b, aGp)); cc->tst(aGp, aGp); } return; @@ -468,7 +472,7 @@ public: cc->adds(aGp, gp_zero_as(aGp), aGp, a64::lsr(b.as().value_as())); } else { - cc->lsr(aGp, aGp, gp_force_reg(pc, b, aGp)); + cc->lsr(aGp, aGp, gp_force_reg(uc, b, aGp)); cc->tst(aGp, aGp); } return; @@ -479,7 +483,7 @@ public: cc->emit(info.inst_id, aGp, b.as()); } else { - cc->emit(info.inst_id, aGp, gp_force_reg(pc, b, aGp)); + cc->emit(info.inst_id, aGp, gp_force_reg(uc, b, aGp)); } return; } @@ -489,14 +493,14 @@ public: cc->emit(info.inst_id, aGp, b.as()); } else { - cc->emit(info.inst_id, aGp, gp_force_reg(pc, b, aGp)); + cc->emit(info.inst_id, aGp, gp_force_reg(uc, b, aGp)); } return; } case UniOpCond::kBitTest: { - Gp 
tmp = pc->new_similar_reg(aGp); - cc->lsr(tmp, aGp, gp_force_reg(pc, b, aGp)); + Gp tmp = uc.new_similar_reg(aGp); + cc->lsr(tmp, aGp, gp_force_reg(uc, b, aGp)); cc->tst(tmp, Imm(1)); return; } @@ -512,7 +516,7 @@ public: void UniCompiler::emit_mov(const Gp& dst, const Operand_& src) noexcept { if (src.is_mem()) { - gp_emit_mem_op(this, dst, src.as(), MemInst{uint16_t(Inst::kIdLdr), uint16_t(dst.size())}); + gp_emit_mem_op(*this, dst, src.as(), MemInst{uint16_t(Inst::kIdLdr), uint16_t(dst.size())}); } else { cc->emit(Inst::kIdMov, dst, src); @@ -531,7 +535,7 @@ void UniCompiler::emit_m(UniOpM op, const Mem& m_) noexcept { Gp zero = gp_zero_regs[size_t(op == UniOpM::kStoreZeroReg || op == UniOpM::kStoreZeroU64)]; MemInst ii = st_inst[size_t(op)]; - gp_emit_mem_op(this, zero, m_, ii); + gp_emit_mem_op(*this, zero, m_, ii); } void UniCompiler::emit_rm(UniOpRM op, const Gp& dst, const Mem& src) noexcept { @@ -578,14 +582,14 @@ void UniCompiler::emit_rm(UniOpRM op, const Gp& dst, const Mem& src) noexcept { r = r.w(); } - gp_emit_mem_op(this, r, m, ii); + gp_emit_mem_op(*this, r, m, ii); return; } case UniOpRM::kLoadShiftU8: case UniOpRM::kLoadShiftU16: { Gp tmp = new_similar_reg(r); - gp_emit_mem_op(this, tmp.r32(), m, ii); + gp_emit_mem_op(*this, tmp.r32(), m, ii); cc->orr(r, tmp, r, a64::lsl(ii.mem_size * 8)); return; } @@ -593,7 +597,7 @@ void UniCompiler::emit_rm(UniOpRM op, const Gp& dst, const Mem& src) noexcept { case UniOpRM::kLoadMergeU8: case UniOpRM::kLoadMergeU16: { Gp tmp = new_similar_reg(r); - gp_emit_mem_op(this, tmp.r32(), m, ii); + gp_emit_mem_op(*this, tmp.r32(), m, ii); cc->orr(r, r, tmp); return; } @@ -649,20 +653,20 @@ void UniCompiler::emit_mr(UniOpMR op, const Mem& dst, const Gp& src) noexcept { } } -void UniCompiler::emit_cmov(const Gp& dst, const Operand_& sel, const Condition& condition) noexcept { +void UniCompiler::emit_cmov(const Gp& dst, const Operand_& sel, const UniCondition& condition) noexcept { ConditionApplier ca(condition); - ca.optimize(this); - ca.emit(this); - cc->csel(dst, gp_force_reg(this, sel, dst), dst, condition.cond); + ca.optimize(*this); + ca.emit(*this); + cc->csel(dst, gp_force_reg(*this, sel, dst), dst, condition.cond); } -void UniCompiler::emit_select(const Gp& dst, const Operand_& sel1_, const Operand_& sel2_, const Condition& condition) noexcept { +void UniCompiler::emit_select(const Gp& dst, const Operand_& sel1_, const Operand_& sel2_, const UniCondition& condition) noexcept { ConditionApplier ca(condition); - ca.optimize(this); - ca.emit(this); + ca.optimize(*this); + ca.emit(*this); - Gp sel1 = gp_force_reg(this, sel1_, dst); - Gp sel2 = gp_force_reg(this, sel2_, dst); + Gp sel1 = gp_force_reg(*this, sel1_, dst); + Gp sel2 = gp_force_reg(*this, sel2_, dst); cc->csel(dst, sel1, sel2, condition.cond); } @@ -671,7 +675,7 @@ void UniCompiler::emit_2i(UniOpRR op, const Gp& dst, const Operand_& src_) noexc // ---------------- if (src_.is_reg_or_mem()) { - Gp src = gp_force_reg(this, src_, dst); + Gp src = gp_force_reg(*this, src_, dst); switch (op) { case UniOpRR::kAbs: { @@ -755,7 +759,7 @@ void UniCompiler::emit_3i(UniOpRRR op, const Gp& dst, const Operand_& src1_, con std::swap(src1, src2); } else { - src1 = gp_force_reg(this, src1, dst); + src1 = gp_force_reg(*this, src1, dst); } } @@ -805,13 +809,13 @@ void UniCompiler::emit_3i(UniOpRRR op, const Gp& dst, const Operand_& src1_, con // TODO: [JIT] Just testing the idea of patching the previous instruction to have a post-index addressing. 
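// In practice this means: if the previous node is an ldr/str (GP or vector) whose memory
// operand uses `a` as the base with no index and no offset, and the add's destination is `a`
// itself, the explicit `add dst, a, #value` can be folded into that load/store as post-index
// addressing. A pointer bump such as:
//
//   ldr x0, [x1]        ; load
//   add x1, x1, #16     ; advance pointer
//
// becomes a single `ldr x0, [x1], #16`. The `value < 256` guard keeps the immediate within
// the signed 9-bit post-index range these load/store forms support.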
if (!reverse && uint64_t(value) < 256 && dst.id() == a.id()) { - if (cc->cursor()->type() == asmjit::NodeType::kInst) { - asmjit::InstNode* prev_inst = cc->cursor()->as(); + if (cc->cursor()->type() == NodeType::kInst) { + InstNode* prev_inst = cc->cursor()->as(); if (prev_inst->inst_id() == Inst::kIdLdr || prev_inst->inst_id() == Inst::kIdStr || prev_inst->inst_id() == Inst::kIdLdr_v || prev_inst->inst_id() == Inst::kIdStr_v) { Mem& mem_op = prev_inst->op(prev_inst->op_count() - 1).as(); if (mem_op.base_reg() == a && !mem_op.has_index() && !mem_op.has_offset()) { - mem_op.set_offset_mode(asmjit::arm::OffsetMode::kPostIndex); + mem_op.set_offset_mode(arm::OffsetMode::kPostIndex); mem_op.add_offset(int64_t(value)); return; } @@ -819,7 +823,7 @@ void UniCompiler::emit_3i(UniOpRRR op, const Gp& dst, const Operand_& src1_, con } } - if (asmjit::Support::is_uint_n<12>(value)) { + if (Support::is_uint_n<12>(value)) { cc->emit(addsub_inst[reverse], dst, a, Imm(value)); return; } @@ -831,15 +835,15 @@ void UniCompiler::emit_3i(UniOpRRR op, const Gp& dst, const Operand_& src1_, con case UniOpRRR::kMul: { uint64_t value = b.value_as(); if (value > 0u) { - if (asmjit::Support::is_power_of_2(value)) { - uint32_t shift = asmjit::Support::ctz(value); + if (Support::is_power_of_2(value)) { + uint32_t shift = Support::ctz(value); cc->lsl(dst, a, Imm(shift)); return; } // We can still support multiplication with `power_of_2 + 1` - if (asmjit::Support::is_power_of_2(--value)) { - uint32_t shift = asmjit::Support::ctz(value); + if (Support::is_power_of_2(--value)) { + uint32_t shift = Support::ctz(value); cc->add(dst, a, a, a64::lsl(shift)); return; } @@ -921,7 +925,7 @@ void UniCompiler::emit_3i(UniOpRRR op, const Gp& dst, const Operand_& src1_, con // --------------------- { - src2 = gp_force_reg(this, src2, dst); + src2 = gp_force_reg(*this, src2, dst); Gp a = src1.as(); Gp b = src2.as(); @@ -1020,10 +1024,10 @@ void UniCompiler::emit_j(const Operand_& target) noexcept { cc->emit(Inst::kIdB, target); } -void UniCompiler::emit_j_if(const Label& target, const Condition& condition) noexcept { +void UniCompiler::emit_j_if(const Label& target, const UniCondition& condition) noexcept { ConditionApplier ca(condition); - ca.optimize(this); - ca.emit(this); + ca.optimize(*this); + ca.emit(*this); cc->b(ca.cond, target); } @@ -1066,8 +1070,8 @@ void UniCompiler::mul_257_hu16(const Gp& dst, const Gp& src) noexcept { void UniCompiler::add_scaled(const Gp& dst, const Gp& a_, int b) noexcept { Gp a = a_.clone_as(dst); - if (asmjit::Support::is_power_of_2(b)) { - uint32_t shift = asmjit::Support::ctz(b); + if (Support::is_power_of_2(b)) { + uint32_t shift = Support::ctz(b); cc->add(dst, dst, a, a64::lsl(shift)); } else if (b == 3 && dst.id() == a.id()) { @@ -1086,8 +1090,8 @@ void UniCompiler::add_ext(const Gp& dst, const Gp& src_, const Gp& idx_, uint32_ Gp src = src_.clone_as(dst); Gp idx = idx_.clone_as(dst); - if (asmjit::Support::is_power_of_2(scale)) { - cc->add(dst, src, idx, a64::lsl(asmjit::Support::ctz(scale))); + if (Support::is_power_of_2(scale)) { + cc->add(dst, src, idx, a64::lsl(Support::ctz(scale))); } else { Gp tmp = new_similar_reg(dst, "@tmp"); @@ -1206,20 +1210,20 @@ struct UniOpVInfo { }; #define DEFINE_OP(inst_id, ext, commutative, comparison, reverse, same_vec_op, float_mode, dst_element, dst_part, src_element, src_part, imm) \ - UniOpVInfo { \ + UniOpVInfo { \ inst_id, \ - ASIMDExt::ext, \ - commutative, \ - comparison, \ - reverse, \ - SameVecOp::same_vec_op, \ + ASIMDExt::ext, \ + commutative, 
\ + comparison, \ + reverse, \ + SameVecOp::same_vec_op, \ FloatMode::float_mode, \ ElementSize::dst_element, \ VecPart::dst_part, \ ElementSize::src_element, \ VecPart::src_part, \ - imm, \ - 0 \ + imm, \ + 0 \ } static constexpr UniOpVInfo opcode_info_2v[size_t(UniOpVV::kMaxValue) + 1] = { @@ -1261,8 +1265,12 @@ static constexpr UniOpVInfo opcode_info_2v[size_t(UniOpVV::kMaxValue) + 1] = { DEFINE_OP(Inst::kIdSshll2_v , kIntrin, 0, 0, 0, kNone, kNone, k64, kNA, k32, kHi, 0x00u), // kCvtI32HiToI64 DEFINE_OP(Inst::kIdUshll_v , kIntrin, 0, 0, 0, kNone, kNone, k64, kNA, k32, kLo, 0x00u), // kCvtU32LoToU64 DEFINE_OP(Inst::kIdUshll2_v , kIntrin, 0, 0, 0, kNone, kNone, k64, kNA, k32, kHi, 0x00u), // kCvtU32HiToU64 + DEFINE_OP(Inst::kIdFabs_v , kASIMD , 0, 0, 0, kNone, kF32S, k32, kNA, k32, kNA, 0x00u), // kAbsF32S. + DEFINE_OP(Inst::kIdFabs_v , kASIMD , 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kAbsF64S. DEFINE_OP(Inst::kIdFabs_v , kASIMD , 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kAbsF32. DEFINE_OP(Inst::kIdFabs_v , kASIMD , 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kAbsF64. + DEFINE_OP(Inst::kIdFneg_v , kASIMD , 0, 0, 0, kNone, kF32S, k32, kNA, k32, kNA, 0x00u), // kNegF32S. + DEFINE_OP(Inst::kIdFneg_v , kASIMD , 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kNegF64S. DEFINE_OP(Inst::kIdFneg_v , kASIMD , 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kNegF32. DEFINE_OP(Inst::kIdFneg_v , kASIMD , 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kNegF64. DEFINE_OP(Inst::kIdMvn_v , kASIMD , 0, 0, 0, kNone, kNone, k8 , kNA, k8 , kNA, 0x00u), // kNotF32. @@ -1279,10 +1287,18 @@ static constexpr UniOpVInfo opcode_info_2v[size_t(UniOpVV::kMaxValue) + 1] = { DEFINE_OP(Inst::kIdFrintp_v , kASIMD , 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kCeilF64S. DEFINE_OP(Inst::kIdFrintp_v , kASIMD , 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kCeilF32. DEFINE_OP(Inst::kIdFrintp_v , kASIMD , 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kCeilF64. - DEFINE_OP(Inst::kIdFrintn_v , kASIMD , 0, 0, 0, kNone, kF32S, k32, kNA, k32, kNA, 0x00u), // kRoundF32S. - DEFINE_OP(Inst::kIdFrintn_v , kASIMD , 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kRoundF64S. - DEFINE_OP(Inst::kIdFrintn_v , kASIMD , 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kRoundF32. - DEFINE_OP(Inst::kIdFrintn_v , kASIMD , 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kRoundF64. + DEFINE_OP(Inst::kIdFrintn_v , kASIMD , 0, 0, 0, kNone, kF32S, k32, kNA, k32, kNA, 0x00u), // kRoundEvenF32S. + DEFINE_OP(Inst::kIdFrintn_v , kASIMD , 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kRoundEvenF64S. + DEFINE_OP(Inst::kIdFrintn_v , kASIMD , 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kRoundEvenF32. + DEFINE_OP(Inst::kIdFrintn_v , kASIMD , 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kRoundEvenF64. + DEFINE_OP(Inst::kIdFrinta_v , kASIMD , 0, 0, 0, kNone, kF32S, k32, kNA, k32, kNA, 0x00u), // kRoundHalfAwayF32S. + DEFINE_OP(Inst::kIdFrinta_v , kASIMD , 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kRoundHalfAwayF64S. + DEFINE_OP(Inst::kIdFrinta_v , kASIMD , 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kRoundHalfAwayF32. + DEFINE_OP(Inst::kIdFrinta_v , kASIMD , 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kRoundHalfAwayF64. + DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kF32S, k32, kNA, k32, kNA, 0x00u), // kRoundHalfUpF32S. 
+ DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kRoundHalfUpF64S. + DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kRoundHalfUpF32. + DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kRoundHalfUpF64. DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kNone, k32, kNA, k32, kNA, 0x00u), // kRcpF32. DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kNone, k64, kNA, k64, kNA, 0x00u), // kRcpF64. DEFINE_OP(Inst::kIdFsqrt_v , kASIMD , 0, 0, 0, kNone, kF32S, k32, kNA, k32, kNA, 0x00u), // kSqrtF32S. @@ -1492,6 +1508,10 @@ static constexpr UniOpVInfo opcode_info_3v[size_t(UniOpVVV::kMaxValue) + 1] = { DEFINE_OP(Inst::kIdFdiv_v , kASIMD , 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kDivF64S. DEFINE_OP(Inst::kIdFdiv_v , kASIMD , 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kDivF32. DEFINE_OP(Inst::kIdFdiv_v , kASIMD , 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kDivF64. + DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kF32S, k32, kNA, k32, kNA, 0x00u), // kModF32S. + DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kF64S, k64, kNA, k64, kNA, 0x00u), // kModF64S. + DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kF32V, k32, kNA, k32, kNA, 0x00u), // kModF32. + DEFINE_OP(Inst::kIdNone , kIntrin, 0, 0, 0, kNone, kF64V, k64, kNA, k64, kNA, 0x00u), // kModF64. DEFINE_OP(Inst::kIdFminnm_v , kASIMD , 1, 0, 0, kSrc , kF32S, k32, kNA, k32, kNA, 0x00u), // kMinF32S. DEFINE_OP(Inst::kIdFminnm_v , kASIMD , 1, 0, 0, kSrc , kF64S, k64, kNA, k64, kNA, 0x00u), // kMinF64S. DEFINE_OP(Inst::kIdFminnm_v , kASIMD , 1, 0, 0, kSrc , kF32V, k32, kNA, k32, kNA, 0x00u), // kMinF32. @@ -1809,8 +1829,8 @@ static ASMJIT_INLINE void vec_set_type_and_index(Vec& vec, ElementSize sz, uint3 vec.set_element_index(idx); } -static ASMJIT_NOINLINE void vec_load_mem(UniCompiler* pc, const Vec& dst, Mem src, uint32_t mem_size) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void vec_load_mem(UniCompiler& uc, const Vec& dst, Mem src, uint32_t mem_size) noexcept { + BackendCompiler* cc = uc.cc; if (src.has_index() && src.has_shift()) { // AArch64 limitation: index shift can be the same size as the size of the read operation, so H << 1, S << 2, @@ -1825,7 +1845,7 @@ static ASMJIT_NOINLINE void vec_load_mem(UniCompiler* pc, const Vec& dst, Mem sr src = a64::ptr(base, src.offset_lo32()); } else { - Gp tmp = pc->new_gpz("@mem_addr"); + Gp tmp = uc.new_gpz("@mem_addr"); cc->add(tmp, base, index, a64::Shift(src.shift_op(), shift)); src = a64::ptr(tmp, src.offset_lo32()); } @@ -1843,30 +1863,30 @@ static ASMJIT_NOINLINE void vec_load_mem(UniCompiler* pc, const Vec& dst, Mem sr } } -static ASMJIT_NOINLINE Vec vec_from_mem(UniCompiler* pc, const Mem& op, const Vec& ref, uint32_t mem_size = 0) noexcept { - Vec vec = pc->new_vec128("@tmp"); +static ASMJIT_NOINLINE Vec vec_from_mem(UniCompiler& uc, const Mem& op, const Vec& ref, uint32_t mem_size = 0) noexcept { + Vec vec = uc.new_vec128("@tmp"); if (mem_size == 0) mem_size = ref.size(); - vec_load_mem(pc, vec, op, mem_size); + vec_load_mem(uc, vec, op, mem_size); return vec.clone_as(ref); } -static ASMJIT_INLINE Vec as_vec(UniCompiler* pc, const Operand_& op, const Vec& ref, uint32_t mem_size = 0) noexcept { +static ASMJIT_INLINE Vec as_vec(UniCompiler& uc, const Operand_& op, const Vec& ref, uint32_t mem_size = 0) noexcept { if (op.is_vec()) return op.as().clone_as(ref); else - return vec_from_mem(pc, op.as(), ref, 
mem_size); + return vec_from_mem(uc, op.as(), ref, mem_size); } -static ASMJIT_INLINE Vec as_vec(UniCompiler* pc, const Operand_& op, const Vec& ref, FloatMode fm) noexcept { +static ASMJIT_INLINE Vec as_vec(UniCompiler& uc, const Operand_& op, const Vec& ref, FloatMode fm) noexcept { if (op.is_vec()) return op.as().clone_as(ref); else - return vec_from_mem(pc, op.as(), ref, float_mode_mem_size_table[size_t(fm)]); + return vec_from_mem(uc, op.as(), ref, float_mode_mem_size_table[size_t(fm)]); } -static ASMJIT_NOINLINE Vec vec_mov(UniCompiler* pc, const Vec& dst_, const Operand_& src_) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE Vec vec_mov(UniCompiler& uc, const Vec& dst_, const Operand_& src_) noexcept { + BackendCompiler* cc = uc.cc; Vec dst(dst_); vec_set_type(dst, ElementSize::k8); @@ -1881,15 +1901,15 @@ static ASMJIT_NOINLINE Vec vec_mov(UniCompiler* pc, const Vec& dst_, const Opera } if (src_.is_mem()) { - vec_load_mem(pc, dst, src_.as(), dst.size()); + vec_load_mem(uc, dst, src_.as(), dst.size()); return dst; } ASMJIT_NOT_REACHED(); } -static ASMJIT_NOINLINE void vec_neg(UniCompiler* pc, const Vec& dst, const Vec& src, FloatMode fm) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void vec_neg(UniCompiler& uc, const Vec& dst, const Vec& src, FloatMode fm) noexcept { + BackendCompiler* cc = uc.cc; if (fm == FloatMode::kF32S) cc->mvn_(dst.s(), src.s()); @@ -2109,10 +2129,10 @@ static constexpr Swizzle32Data swizzle_32_data[256] = { #undef OP -static void emit_swizzle32_impl(UniCompiler* pc, const Vec& dst, const Vec& src, uint32_t imm) noexcept { +static void emit_swizzle32_impl(UniCompiler& uc, const Vec& dst, const Vec& src, uint32_t imm) noexcept { ASMJIT_ASSERT((imm & 0xFCFCFCFC) == 0); - BackendCompiler* cc = pc->cc; + BackendCompiler* cc = uc.cc; uint32_t table_index = ((imm & 0x03000000) >> (24 - 6)) | ((imm & 0x00030000) >> (16 - 4)) | @@ -2134,12 +2154,12 @@ static void emit_swizzle32_impl(UniCompiler* pc, const Vec& dst, const Vec& src, op_dst = dst; } else { - op_dst = pc->new_similar_reg(dst, "@tmp"); + op_dst = uc.new_similar_reg(dst, "@tmp"); } switch (op.type()) { case Swizzle32Data::Op::kMov: { - vec_mov(pc, op_dst, op_src[0]); + vec_mov(uc, op_dst, op_src[0]); break; } @@ -2226,7 +2246,7 @@ static void emit_swizzle32_impl(UniCompiler* pc, const Vec& dst, const Vec& src, pred_data[14] = uint8_t(d + 2u); pred_data[15] = uint8_t(d + 3u); - Vec pred = pc->simd_const_16b(pred_data); + Vec pred = uc.simd_const_16b(pred_data); cc->tbl(dst.b16(), src.b16(), pred.b16()); } } @@ -2508,13 +2528,13 @@ static constexpr InterleavedShuffle32Ops interleaved_shuffle32_ops_dst_same_as_b #undef OP -static void emit_interleaved_shuffle32_impl(UniCompiler* pc, const Vec& dst, const Vec& src1, const Vec& src2, uint32_t imm) noexcept { +static void emit_interleaved_shuffle32_impl(UniCompiler& uc, const Vec& dst, const Vec& src1, const Vec& src2, uint32_t imm) noexcept { ASMJIT_ASSERT((imm & 0xFCFCFCFC) == 0); if (src1.id() == src2.id()) - return emit_swizzle32_impl(pc, dst, src1, imm); + return emit_swizzle32_impl(uc, dst, src1, imm); - BackendCompiler* cc = pc->cc; + BackendCompiler* cc = uc.cc; uint32_t table_index = ((imm & 0x03000000) >> (24 - 6)) | ((imm & 0x00030000) >> (16 - 4)) | @@ -2575,7 +2595,7 @@ static void emit_interleaved_shuffle32_impl(UniCompiler* pc, const Vec& dst, con op_dst = regs[op_index]; } else { - op_dst = pc->new_similar_reg(dst, "@shuf_tmp_%u", op_index - 2); + op_dst = uc.new_similar_reg(dst, "@shuf_tmp_%u", op_index - 
2); } } else { @@ -2592,7 +2612,7 @@ static void emit_interleaved_shuffle32_impl(UniCompiler* pc, const Vec& dst, con // In this case the destination is in conflict with one of the source registers. We have to // create a new virtual register and then move it to the real `dst` to not mess up the shuffle. ASMJIT_ASSERT(!regs[op_index].is_valid()); - final_dst = pc->new_similar_reg(dst, "@shuf_dst"); + final_dst = uc.new_similar_reg(dst, "@shuf_dst"); } else { // Perfect - the destination is not in conflict with any source register. @@ -2661,7 +2681,7 @@ static void emit_interleaved_shuffle32_impl(UniCompiler* pc, const Vec& dst, con regs[op_index] = op_dst; } - vec_mov(pc, dst, final_dst); + vec_mov(uc, dst, final_dst); } // ujit::UniCompiler - Vector Instructions - OpArray Iterator @@ -2690,62 +2710,62 @@ public: }; template -static ASMJIT_INLINE void emit_2v_t(UniCompiler* pc, UniOpVV op, const OpArray& dst_, const Src& src_) noexcept { +static ASMJIT_INLINE void emit_2v_t(UniCompiler& uc, UniOpVV op, const OpArray& dst_, const Src& src_) noexcept { size_t n = dst_.size(); OpArrayIter src(src_); for (size_t i = 0; i < n; i++) { - pc->emit_2v(op, dst_[i], src.op()); + uc.emit_2v(op, dst_[i], src.op()); src.next(); } } template -static ASMJIT_INLINE void emit_2vi_t(UniCompiler* pc, UniOpVVI op, const OpArray& dst_, const Src& src_, uint32_t imm) noexcept { +static ASMJIT_INLINE void emit_2vi_t(UniCompiler& uc, UniOpVVI op, const OpArray& dst_, const Src& src_, uint32_t imm) noexcept { size_t n = dst_.size(); OpArrayIter src(src_); for (size_t i = 0; i < n; i++) { - pc->emit_2vi(op, dst_[i], src.op(), imm); + uc.emit_2vi(op, dst_[i], src.op(), imm); src.next(); } } template -static ASMJIT_INLINE void emit_3v_t(UniCompiler* pc, UniOpVVV op, const OpArray& dst_, const Src1& src1_, const Src2& src2_) noexcept { +static ASMJIT_INLINE void emit_3v_t(UniCompiler& uc, UniOpVVV op, const OpArray& dst_, const Src1& src1_, const Src2& src2_) noexcept { size_t n = dst_.size(); OpArrayIter src1(src1_); OpArrayIter src2(src2_); for (size_t i = 0; i < n; i++) { - pc->emit_3v(op, dst_[i], src1.op(), src2.op()); + uc.emit_3v(op, dst_[i], src1.op(), src2.op()); src1.next(); src2.next(); } } template -static ASMJIT_INLINE void emit_3vi_t(UniCompiler* pc, UniOpVVVI op, const OpArray& dst_, const Src1& src1_, const Src2& src2_, uint32_t imm) noexcept { +static ASMJIT_INLINE void emit_3vi_t(UniCompiler& uc, UniOpVVVI op, const OpArray& dst_, const Src1& src1_, const Src2& src2_, uint32_t imm) noexcept { size_t n = dst_.size(); OpArrayIter src1(src1_); OpArrayIter src2(src2_); for (size_t i = 0; i < n; i++) { - pc->emit_3vi(op, dst_[i], src1.op(), src2.op(), imm); + uc.emit_3vi(op, dst_[i], src1.op(), src2.op(), imm); src1.next(); src2.next(); } } template -static ASMJIT_INLINE void emit_4v_t(UniCompiler* pc, UniOpVVVV op, const OpArray& dst_, const Src1& src1_, const Src2& src2_, const Src3& src3_) noexcept { +static ASMJIT_INLINE void emit_4v_t(UniCompiler& uc, UniOpVVVV op, const OpArray& dst_, const Src1& src1_, const Src2& src2_, const Src3& src3_) noexcept { size_t n = dst_.size(); OpArrayIter src1(src1_); OpArrayIter src2(src2_); OpArrayIter src3(src3_); for (size_t i = 0; i < n; i++) { - pc->emit_4v(op, dst_[i], src1.op(), src2.op(), src3.op()); + uc.emit_4v(op, dst_[i], src1.op(), src2.op(), src3.op()); src1.next(); src2.next(); src3.next(); @@ -2774,13 +2794,13 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ return; } - vec_mov(this, dst, src_); + vec_mov(*this, dst, 
src_); return; } case UniOpVV::kMovU64: { dst = dst.d(); - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); cc->mov(dst.b8(), src.b8()); return; @@ -2809,7 +2829,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ return; } - v_load_iany(dst, src.as(), 1u << uint32_t(op_info.src_element), Alignment{1}); + v_load_iany(dst, src.as(), size_t(1) << uint32_t(op_info.src_element), Alignment(1)); src = dst; } @@ -2855,7 +2875,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kNotU64: case UniOpVV::kNotF32: case UniOpVV::kNotF64: { - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); vec_set_type(dst, op_info.dst_element); vec_set_type(src, op_info.src_element); @@ -2866,7 +2886,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCvtI8ToI32: case UniOpVV::kCvtU8ToU32: { - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); cc->emit(inst_id, dst.h8(), src.b8(), 0); cc->emit(inst_id, dst.s4(), dst.h4(), 0); return; @@ -2889,7 +2909,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ Vec src; if (op_info.src_part == VecPart::kLo) { - src = as_vec(this, src_, dst, 8); + src = as_vec(*this, src_, dst, 8); src = src.v64(); } else if (src_.is_vec()) { @@ -2898,7 +2918,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ else { Mem m(src_.as()); m.add_offset(8); - src = vec_from_mem(this, m, dst, 8); + src = vec_from_mem(*this, m, dst, 8); src = src.v64(); // Since we have loaded from memory, we want to use the low-part variant of the instruction. @@ -2913,10 +2933,10 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kRcpF32: case UniOpVV::kRcpF64: { // Intrinsic. - const void* one_ptr = op == UniOpVV::kRcpF32 ? static_cast(&ct.f32_1) : static_cast(&ct.f64_1); + const void* one_ptr = op_info.src_element == ElementSize::k32 ? static_cast(&ct().f32_1) : static_cast(&ct().f64_1); Vec one = simd_vec_const(one_ptr, Bcst::kNA, dst); - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); vec_set_type(dst, op_info.dst_element); vec_set_type(one, op_info.dst_element); @@ -2926,30 +2946,71 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ return; } + case UniOpVV::kAbsF32S: + case UniOpVV::kNegF32S: case UniOpVV::kTruncF32S: case UniOpVV::kFloorF32S: case UniOpVV::kCeilF32S: - case UniOpVV::kRoundF32S: + case UniOpVV::kRoundEvenF32S: + case UniOpVV::kRoundHalfAwayF32S: case UniOpVV::kSqrtF32S: { dst.set_signature(RegTraits::kSignature); - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); cc->emit(inst_id, dst, src); return; } + case UniOpVV::kAbsF64S: + case UniOpVV::kNegF64S: case UniOpVV::kTruncF64S: case UniOpVV::kFloorF64S: case UniOpVV::kCeilF64S: - case UniOpVV::kRoundF64S: + case UniOpVV::kRoundEvenF64S: + case UniOpVV::kRoundHalfAwayF64S: case UniOpVV::kSqrtF64S: { dst.set_signature(RegTraits::kSignature); - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); cc->emit(inst_id, dst, src); return; } + case UniOpVV::kRoundHalfUpF32S: + case UniOpVV::kRoundHalfUpF64S: + case UniOpVV::kRoundHalfUpF32: + case UniOpVV::kRoundHalfUpF64: { + // Intrinsic. + const void* one_ptr = + op_info.src_element == ElementSize::k32 + ? 
static_cast(&ct().f32_0_5_minus_1ulp) + : static_cast(&ct().f64_0_5_minus_1ulp); + + Vec one = simd_vec_const(one_ptr, Bcst::kNA, dst); + Vec src = as_vec(*this, src_, dst); + + if (op == UniOpVV::kRoundHalfUpF32S) { + dst.set_signature(RegTraits::kSignature); + src.set_signature(RegTraits::kSignature); + one.set_signature(RegTraits::kSignature); + } + else if (op == UniOpVV::kRoundHalfUpF64S) { + dst.set_signature(RegTraits::kSignature); + src.set_signature(RegTraits::kSignature); + one.set_signature(RegTraits::kSignature); + } + else { + vec_set_type(dst, op_info.dst_element); + vec_set_type(one, op_info.src_element); + vec_set_type(src, op_info.src_element); + } + + cc->fadd(dst, src, one); + cc->frintm(dst, dst); + + return; + } + case UniOpVV::kAbsF32: case UniOpVV::kAbsF64: case UniOpVV::kNegF32: @@ -2960,14 +3021,16 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kFloorF64: case UniOpVV::kCeilF32: case UniOpVV::kCeilF64: - case UniOpVV::kRoundF32: - case UniOpVV::kRoundF64: + case UniOpVV::kRoundEvenF32: + case UniOpVV::kRoundEvenF64: + case UniOpVV::kRoundHalfAwayF32: + case UniOpVV::kRoundHalfAwayF64: case UniOpVV::kSqrtF32: case UniOpVV::kSqrtF64: case UniOpVV::kCvtI32ToF32: case UniOpVV::kCvtRoundF32ToI32: case UniOpVV::kCvtTruncF32ToI32: { - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); vec_set_type(dst, op_info.dst_element); vec_set_type(src, op_info.src_element); @@ -2978,7 +3041,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCvtF32ToF64S: case UniOpVV::kCvtF64ToF32S: { - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); vec_set_vec_type(dst, op_info.dst_element); vec_set_vec_type(src, op_info.src_element); @@ -2989,7 +3052,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCvtF32HiToF64: if (src_.is_mem()) { - Vec src = as_vec(this, src_.as().clone_adjusted(8), dst, 8).v64(); + Vec src = as_vec(*this, src_.as().clone_adjusted(8), dst, 8).v64(); vec_set_type(dst, op_info.dst_element); vec_set_type(src, op_info.src_element); @@ -3000,7 +3063,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ [[fallthrough]]; case UniOpVV::kCvtF32LoToF64: { - Vec src = as_vec(this, src_, dst, 8); + Vec src = as_vec(*this, src_, dst, 8); if (op_info.src_part == VecPart::kLo) { src = src.v64(); @@ -3015,7 +3078,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCvtI32HiToF64: if (src_.is_mem()) { - Vec src = as_vec(this, src_.as().clone_adjusted(8), dst, 8).v64(); + Vec src = as_vec(*this, src_.as().clone_adjusted(8), dst, 8).v64(); vec_set_type(dst, op_info.dst_element); vec_set_type(src, op_info.src_element); @@ -3027,7 +3090,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ [[fallthrough]]; case UniOpVV::kCvtI32LoToF64: { - Vec src = as_vec(this, src_, dst, 8); + Vec src = as_vec(*this, src_, dst, 8); if (op_info.src_part == VecPart::kLo) { src = src.v64(); @@ -3044,7 +3107,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCvtF64ToF32Lo: case UniOpVV::kCvtF64ToF32Hi: { dst = dst.q(); - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); if (op_info.dst_part == VecPart::kLo) { dst = dst.d(); @@ -3063,7 +3126,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case 
UniOpVV::kCvtRoundF64ToI32Hi: { dst = dst.q(); - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); Vec tmp = new_similar_reg(dst, "@tmp"); cc->emit(inst_id, tmp.d2(), src.d2()); @@ -3083,8 +3146,8 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ } } -void UniCompiler::emit_2v(UniOpVV op, const OpArray& dst_, const Operand_& src_) noexcept { emit_2v_t(this, op, dst_, src_); } -void UniCompiler::emit_2v(UniOpVV op, const OpArray& dst_, const OpArray& src_) noexcept { emit_2v_t(this, op, dst_, src_); } +void UniCompiler::emit_2v(UniOpVV op, const OpArray& dst_, const Operand_& src_) noexcept { emit_2v_t(*this, op, dst_, src_); } +void UniCompiler::emit_2v(UniOpVV op, const OpArray& dst_, const OpArray& src_) noexcept { emit_2v_t(*this, op, dst_, src_); } // ujit::UniCompiler - Vector Instructions - Emit 2VI // ================================================== @@ -3102,7 +3165,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr case UniOpVVI::kSrlbU128: { ASMJIT_ASSERT(imm < 16); - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); // If the shift is used to extract a high 64-bit element and zero the rest of the register. if (op == UniOpVVI::kSrlbU128 && imm == 8) { @@ -3131,7 +3194,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr case UniOpVVI::kSwizzleU16x4: case UniOpVVI::kSwizzleLoU16x4: case UniOpVVI::kSwizzleHiU16x4: { - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); uint8_t pred_data[16] = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF }; @@ -3169,14 +3232,14 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr case UniOpVVI::kSwizzleU32x4: case UniOpVVI::kSwizzleF32x4: { - Vec src = as_vec(this, src_, dst); - emit_swizzle32_impl(this, dst, src, imm); + Vec src = as_vec(*this, src_, dst); + emit_swizzle32_impl(*this, dst, src, imm); return; } case UniOpVVI::kSwizzleU64x2: case UniOpVVI::kSwizzleF64x2: { - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); // Use `dup` to broadcast one 64-bit elements. 
if (Swizzle2{imm} == swizzle(0, 0) || @@ -3215,7 +3278,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr ASMJIT_NOT_REACHED(); default: { - Vec src = as_vec(this, src_, dst); + Vec src = as_vec(*this, src_, dst); if (op_info.dst_part == VecPart::kLo) dst = dst.d(); if (op_info.src_part == VecPart::kLo) src = src.d(); @@ -3229,8 +3292,8 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr } } -void UniCompiler::emit_2vi(UniOpVVI op, const OpArray& dst_, const Operand_& src_, uint32_t imm) noexcept { emit_2vi_t(this, op, dst_, src_, imm); } -void UniCompiler::emit_2vi(UniOpVVI op, const OpArray& dst_, const OpArray& src_, uint32_t imm) noexcept { emit_2vi_t(this, op, dst_, src_, imm); } +void UniCompiler::emit_2vi(UniOpVVI op, const OpArray& dst_, const Operand_& src_, uint32_t imm) noexcept { emit_2vi_t(*this, op, dst_, src_, imm); } +void UniCompiler::emit_2vi(UniOpVVI op, const OpArray& dst_, const OpArray& src_, uint32_t imm) noexcept { emit_2vi_t(*this, op, dst_, src_, imm); } // ujit::UniCompiler - Vector Instructions - Emit 2VS // ================================================== @@ -3375,7 +3438,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen case UniOpVM::kLoad128_U64: case UniOpVM::kLoad128_F32: case UniOpVM::kLoad128_F64: { - vec_load_mem(this, dst, src, op_info.mem_size); + vec_load_mem(*this, dst, src, op_info.mem_size); return; } @@ -3383,7 +3446,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen case UniOpVM::kLoadN_U64: case UniOpVM::kLoadN_F32: case UniOpVM::kLoadN_F64: { - vec_load_mem(this, dst.q(), src, 16); + vec_load_mem(*this, dst.q(), src, 16); return; } @@ -3411,7 +3474,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen case UniOpVM::kLoadCvt64_U16ToU32: case UniOpVM::kLoadCvt64_I32ToI64: case UniOpVM::kLoadCvt64_U32ToU64: { - vec_load_mem(this, dst, src, op_info.mem_size); + vec_load_mem(*this, dst, src, op_info.mem_size); emit_2v(UniOpVV(op_info.cvt_op), dst, dst); return; } @@ -3424,7 +3487,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen case UniOpVM::kLoadCvtN_U16ToU32: case UniOpVM::kLoadCvtN_I32ToI64: case UniOpVM::kLoadCvtN_U32ToU64: { - vec_load_mem(this, dst, src, dst.size() / 2u); + vec_load_mem(*this, dst, src, dst.size() / 2u); emit_2v(UniOpVV(op_info.cvt_op), dst, dst); return; } @@ -3442,7 +3505,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen } else { Vec tmp = new_similar_reg(dst, "@tmp"); - v_load_iany(tmp, src, op_info.mem_size, Alignment{1}); + v_load_iany(tmp, src, op_info.mem_size, Alignment(1)); vec_set_type_and_index(dst, op_info.element, idx); vec_set_type_and_index(tmp, op_info.element, 0); @@ -3725,7 +3788,7 @@ void UniCompiler::emit_mv(UniOpMV op, const Mem& dst_, const OpArray& src_, Alig // ================================================= static void emit_3v_op( - UniCompiler* pc, + UniCompiler& uc, InstId inst_id, Vec dst, Vec src1, Operand_ src2_, FloatMode float_mode, @@ -3739,19 +3802,19 @@ static void emit_3v_op( case FloatMode::kF32S: { dst = dst.s(); src1 = src1.s(); - src2 = as_vec(pc, src2_, dst, 4); + src2 = as_vec(uc, src2_, dst, 4); break; } case FloatMode::kF64S: { dst = dst.d(); src1 = src1.d(); - src2 = as_vec(pc, src2_, dst, 8); + src2 = as_vec(uc, src2_, dst, 8); break; } default: { - src2 = as_vec(pc, src2_, dst); + src2 = as_vec(uc, src2_, dst); if (dst_part == 
VecPart::kLo) { dst = dst.d(); @@ -3769,7 +3832,7 @@ static void emit_3v_op( } } - BackendCompiler* cc = pc->cc; + BackendCompiler* cc = uc.cc; if (reversed) cc->emit(inst_id, dst, src2, src1); else @@ -3799,7 +3862,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } case SameVecOp::kSrc: { - vec_mov(this, dst, src1); + vec_mov(*this, dst, src1); return; } @@ -3809,8 +3872,29 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } switch (op) { + // dst = a - (floor(a / b) * b). + case UniOpVVV::kModF32S: + case UniOpVVV::kModF64S: + case UniOpVVV::kModF32: + case UniOpVVV::kModF64: { + Vec src2 = as_vec(*this, src2_, dst, op_info.float_mode); + Vec tmp = new_similar_reg(dst, "@tmp1"); + + UniOpVVV sub_op = translate_op(op, UniOpVVV::kModF32S, UniOpVVV::kSubF32S); + UniOpVVV mul_op = translate_op(op, UniOpVVV::kModF32S, UniOpVVV::kMulF32S); + UniOpVVV div_op = translate_op(op, UniOpVVV::kModF32S, UniOpVVV::kDivF32S); + UniOpVV trunc_op = translate_op(op, UniOpVVV::kModF32S, UniOpVV::kTruncF32S); + + emit_3v(div_op, tmp, src1, src2); + emit_2v(trunc_op, tmp, tmp); + emit_3v(mul_op, tmp, tmp, src2); + emit_3v(sub_op, dst, src1, tmp); + + return; + } + case UniOpVVV::kMulU64: { - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); Vec tmp1 = new_similar_reg(dst, "@tmp1"); Vec tmp2 = new_similar_reg(dst, "@tmp2"); Vec tmp3 = new_similar_reg(dst, "@tmp3"); @@ -3827,7 +3911,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } case UniOpVVV::kMulhI16: { - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); Vec tmp = new_similar_reg(dst, "@tmp"); cc->smull(tmp.s4(), src1.h4(), src2.h4()); @@ -3837,7 +3921,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } case UniOpVVV::kMulhU16: { - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); Vec tmp = new_similar_reg(dst, "@tmp"); cc->umull(tmp.s4(), src1.h4(), src2.h4()); @@ -3847,7 +3931,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } case UniOpVVV::kMulU64_LoU32: { - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); Vec tmp1 = new_similar_reg(dst, "@tmp1"); Vec tmp2 = new_similar_reg(dst, "@tmp2"); Vec tmp3 = dst; @@ -3865,7 +3949,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } case UniOpVVV::kMHAddI16_I32: { - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); Vec al = new_similar_reg(dst, "@al"); Vec ah = new_similar_reg(dst, "@ah"); @@ -3888,7 +3972,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kMinU64: case UniOpVVV::kMaxI64: case UniOpVVV::kMaxU64: { - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); // Min/Max is commutative, so let's make dst only overlap src1. 
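// (ASIMD smin/smax/umin/umax only provide 8/16/32-bit element forms, so the 64-bit
// min/max variants cannot map to a single instruction and are expanded here as an
// intrinsic sequence instead.)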
if (dst.id() == src2.id()) { @@ -3923,8 +4007,8 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpNeF64S: case UniOpVVV::kCmpNeF32: case UniOpVVV::kCmpNeF64: { - emit_3v_op(this, inst_id, dst, src1, src2_, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, 0); - vec_neg(this, dst, dst, op_info.float_mode); + emit_3v_op(*this, inst_id, dst, src1, src2_, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, 0); + vec_neg(*this, dst, dst, op_info.float_mode); return; } @@ -3937,7 +4021,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpUnordF32: case UniOpVVV::kCmpUnordF64: { if (is_same_vec(src1, src2_)) { - emit_3v_op(this, Inst::kIdFcmeq_v, dst, src1, src1, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, 0); + emit_3v_op(*this, Inst::kIdFcmeq_v, dst, src1, src1, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, 0); } else { // This takes advantage of the following: @@ -3952,20 +4036,20 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src // // - If both elements are zeros, regardless of the sign of either zero, the result is the second element. // - If either element is a NaN, regardless of the value of FPCR.DN, the result is the second element. - Vec src2 = as_vec(this, src2_, dst, op_info.float_mode); - emit_3v_op(this, Inst::kIdFmin_v, dst, src1, src2, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, 0); - emit_3v_op(this, Inst::kIdFcmeq_v, dst, dst, dst, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, 0); + Vec src2 = as_vec(*this, src2_, dst, op_info.float_mode); + emit_3v_op(*this, Inst::kIdFmin_v, dst, src1, src2, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, 0); + emit_3v_op(*this, Inst::kIdFcmeq_v, dst, dst, dst, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, 0); } if (op_info.imm) - vec_neg(this, dst, dst, op_info.float_mode); + vec_neg(*this, dst, dst, op_info.float_mode); return; } case UniOpVVV::kHAddF64: { Vec tmp = new_similar_reg(dst, "@tmp"); - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); if (src1.id() == src2.id()) { cc->ext(tmp.b16(), src1.b16(), src1.b16(), 8); @@ -3982,7 +4066,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCombineLoHiU64: case UniOpVVV::kCombineLoHiF64: { // Intrinsic - dst = {src1.u64[0], src2.64[1]} - combining low part of src1 and high part of src1. - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); vec_set_type(dst, ElementSize::k8); vec_set_type(src1, ElementSize::k8); @@ -3996,7 +4080,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCombineHiLoU64: case UniOpVVV::kCombineHiLoF64: { // Intrinsic - dst = {src1.u64[1], src2.64[0]} - combining high part of src1 and low part of src2. 
- Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); if (is_same_vec(dst, src1)) { if (is_same_vec(dst, src2)) @@ -4023,7 +4107,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src size_t id = size_t(op) - size_t(UniOpVVV::kPacksI16_I8); - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); vec_set_type(src1, op_info.src_element); vec_set_type(src2, op_info.src_element); @@ -4058,15 +4142,15 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } default: { - emit_3v_op(this, inst_id, dst, src1, src2_, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, op_info.reverse); + emit_3v_op(*this, inst_id, dst, src1, src2_, op_info.float_mode, op_info.dst_element, op_info.dst_part, op_info.src_element, op_info.src_part, op_info.reverse); return; } } } -void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_) noexcept { emit_3v_t(this, op, dst_, src1_, src2_); } -void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_) noexcept { emit_3v_t(this, op, dst_, src1_, src2_); } -void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_) noexcept { emit_3v_t(this, op, dst_, src1_, src2_); } +void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_) noexcept { emit_3v_t(*this, op, dst_, src1_, src2_); } +void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_) noexcept { emit_3v_t(*this, op, dst_, src1_, src2_); } +void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_) noexcept { emit_3v_t(*this, op, dst_, src1_, src2_); } // ujit::UniCompiler - Vector Instructions - Emit 3VI // ================================================== @@ -4087,11 +4171,11 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s ASMJIT_ASSERT(imm < 16); if (imm == 0) { - vec_mov(this, dst, src2_); + vec_mov(*this, dst, src2_); return; } - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); vec_set_type(dst, ElementSize::k8); vec_set_type(src1, ElementSize::k8); vec_set_type(src2, ElementSize::k8); @@ -4103,8 +4187,8 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s case UniOpVVVI::kInterleaveShuffleF32x4: { ASMJIT_ASSERT((imm & 0xFCFCFCFC) == 0); - Vec src2 = as_vec(this, src2_, dst); - emit_interleaved_shuffle32_impl(this, dst, src1, src2, imm); + Vec src2 = as_vec(*this, src2_, dst); + emit_interleaved_shuffle32_impl(*this, dst, src1, src2, imm); return; } @@ -4112,7 +4196,7 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s case UniOpVVVI::kInterleaveShuffleF64x2: { ASMJIT_ASSERT((imm & 0xFFFCFEFE) == 0); - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); if (src1.id() == src2.id()) { v_swizzle_u64x2(dst, src1, Swizzle2{imm}); @@ -4147,9 +4231,9 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s } } -void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, uint32_t imm) noexcept { emit_3vi_t(this, op, dst_, src1_, src2_, imm); } -void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, uint32_t imm) noexcept { 
emit_3vi_t(this, op, dst_, src1_, src2_, imm); } -void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, uint32_t imm) noexcept { emit_3vi_t(this, op, dst_, src1_, src2_, imm); } +void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, uint32_t imm) noexcept { emit_3vi_t(*this, op, dst_, src1_, src2_, imm); } +void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, uint32_t imm) noexcept { emit_3vi_t(*this, op, dst_, src1_, src2_, imm); } +void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, uint32_t imm) noexcept { emit_3vi_t(*this, op, dst_, src1_, src2_, imm); } // ujit::UniCompiler - Vector Instructions - Emit 4V // ================================================= @@ -4166,8 +4250,8 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr switch (op) { case UniOpVVVV::kBlendV_U8: { - Vec src2 = as_vec(this, src2_, dst); - Vec src3 = as_vec(this, src3_, dst); + Vec src2 = as_vec(*this, src2_, dst); + Vec src3 = as_vec(*this, src3_, dst); vec_set_type(dst, op_info.dst_element); vec_set_type(src1, op_info.src_element); @@ -4189,7 +4273,7 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr return; } - vec_mov(this, dst, src3); + vec_mov(*this, dst, src3); cc->bsl(dst, src2, src1); return; } @@ -4208,14 +4292,14 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr if (op_info.float_mode == FloatMode::kF32S) { dst = dst.s(); src1 = src1.s(); - src2 = as_vec(this, src2_, dst, 4); - src3 = as_vec(this, src3_, dst, 4); + src2 = as_vec(*this, src2_, dst, 4); + src3 = as_vec(*this, src3_, dst, 4); } else { dst = dst.d(); src1 = src1.d(); - src2 = as_vec(this, src2_, dst, 8); - src3 = as_vec(this, src3_, dst, 8); + src2 = as_vec(*this, src2_, dst, 8); + src3 = as_vec(*this, src3_, dst, 8); } cc->emit(inst_id, dst, src1, src2, src3); @@ -4232,7 +4316,7 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr case UniOpVVVV::kNMAddF64: case UniOpVVVV::kNMSubF32: case UniOpVVVV::kNMSubF64: { - Vec src2 = as_vec(this, src2_, dst); + Vec src2 = as_vec(*this, src2_, dst); Vec src3; bool negate_acc = op_info.imm != 0; @@ -4240,11 +4324,11 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr bool destructible = is_same_vec(dst, src3_) || !src3_.is_reg(); if (!dst_overlaps && src3_.is_mem()) { - vec_load_mem(this, dst, src3_.as(), dst.size()); + vec_load_mem(*this, dst, src3_.as(), dst.size()); src3 = dst; } else { - src3 = as_vec(this, src3_, dst); + src3 = as_vec(*this, src3_, dst); } vec_set_type(dst, op_info.dst_element); @@ -4288,13 +4372,13 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr } } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const Operand_& src2_, const OpArray& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, const Operand& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, const OpArray& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, 
const OpArray& src1_, const Operand_& src2_, const Operand& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); }
-void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, const OpArray& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); }
-void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const Operand& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); }
-void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const OpArray& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); }
+void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const Operand_& src2_, const OpArray& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); }
+void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, const Operand& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); }
+void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, const OpArray& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); }
+void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, const Operand& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); }
+void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, const OpArray& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); }
+void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const Operand& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); }
+void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const OpArray& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); }
ASMJIT_END_SUB_NAMESPACE
diff --git a/src/asmjit/ujit/unicompiler_utils_p.h b/src/asmjit/ujit/unicompiler_utils_p.h
new file mode 100644
index 0000000..b01cc1c
--- /dev/null
+++ b/src/asmjit/ujit/unicompiler_utils_p.h
@@ -0,0 +1,32 @@
+// This file is part of AsmJit project <https://asmjit.com>
+//
+// See <asmjit.com> or LICENSE.md for license and copyright information
+// SPDX-License-Identifier: Zlib
+
+#ifndef ASMJIT_UJIT_UNICOMPILER_UTILS_P_H_INCLUDED
+#define ASMJIT_UJIT_UNICOMPILER_UTILS_P_H_INCLUDED
+
+#include "ujitbase.h"
+
+#if !defined(ASMJIT_NO_UJIT)
+
+#include "uniop.h"
+
+ASMJIT_BEGIN_SUB_NAMESPACE(ujit)
+
+//! \addtogroup asmjit_ujit
+//! \{
+
+template<typename UniOpSrc, typename UniOpDst>
+static ASMJIT_INLINE UniOpDst translate_op(UniOpSrc op, UniOpSrc begin, UniOpDst target) noexcept {
+  ASMJIT_ASSERT(begin <= op);
+  uint32_t offset = uint32_t(op) - uint32_t(begin);
+  return UniOpDst(uint32_t(target) + offset);
+}
+
+//!
\} + +ASMJIT_END_SUB_NAMESPACE + +#endif // !ASMJIT_NO_UJIT +#endif // ASMJIT_UJIT_UNICOMPILER_UTILS_P_H_INCLUDED diff --git a/src/asmjit/ujit/unicompiler_x86.cpp b/src/asmjit/ujit/unicompiler_x86.cpp index 57b247b..a0c8939 100644 --- a/src/asmjit/ujit/unicompiler_x86.cpp +++ b/src/asmjit/ujit/unicompiler_x86.cpp @@ -9,6 +9,8 @@ #if defined(ASMJIT_UJIT_X86) #include "unicompiler.h" +#include "unicompiler_utils_p.h" +#include "unicondition.h" ASMJIT_BEGIN_SUB_NAMESPACE(ujit) @@ -17,8 +19,8 @@ using SSEExt = UniCompiler::SSEExt; using AVXExt = UniCompiler::AVXExt; namespace Inst { using namespace x86::Inst; } -// ujit::UniCompiler - Constants -// ============================== +// ujit::UniCompiler - Constants +// ============================= static constexpr OperandSignature signature_of_xmm_ymm_zmm[] = { OperandSignature{RegTraits::kSignature}, @@ -30,30 +32,31 @@ static ASMJIT_INLINE RegType vec_reg_type_from_width(VecWidth vw) noexcept { return RegType(uint32_t(RegType::kVec128) + uint32_t(vw)); } -// ujit::UniCompiler - Construction & Destruction -// =============================================== +// ujit::UniCompiler - Construction & Destruction +// ============================================== -UniCompiler::UniCompiler(BackendCompiler* cc, const CpuFeatures& features, UniOptFlags opt_flags) noexcept +UniCompiler::UniCompiler(BackendCompiler* cc, const CpuFeatures& features, CpuHints cpu_hints, VecConstTableRef ct_ref) noexcept : cc(cc), - ct(vec_const_table), + _ct_ref(ct_ref), _features(features), - _opt_flags(opt_flags), + _cpu_hints(cpu_hints), _vec_reg_count(16), _common_table_offset(128) { _scalar_op_behavior = ScalarOpBehavior::kPreservingVec128; - _fmin_fmax_op_hehavior = FMinFMaxOpBehavior::kTernaryLogic; + _fmin_fmax_op_behavior = FMinFMaxOpBehavior::kTernaryLogic; _fmadd_op_behavior = FMAddOpBehavior::kNoFMA; // Will be changed by _init_extensions() if supported. 
+ _float_to_int_outside_range_behavior = FloatToIntOutsideRangeBehavior::kSmallestValue; _init_extensions(features); } UniCompiler::~UniCompiler() noexcept {} -// ujit::UniCompiler - CPU Architecture, Features and Optimization Options -// ======================================================================== +// ujit::UniCompiler - CPU Architecture, Features and Optimization Options +// ======================================================================= -void UniCompiler::_init_extensions(const asmjit::CpuFeatures& features) noexcept { +void UniCompiler::_init_extensions(const CpuFeatures& features) noexcept { uint32_t gp_ext_mask = 0; uint32_t sse_ext_mask = 0; uint64_t avx_ext_mask = 0; @@ -134,20 +137,20 @@ void UniCompiler::init_vec_width(VecWidth vw) noexcept { bool UniCompiler::has_masked_access_of(uint32_t data_size) const noexcept { switch (data_size) { - case 1: return has_opt_flag(UniOptFlags::kMaskOps8Bit); - case 2: return has_opt_flag(UniOptFlags::kMaskOps16Bit); - case 4: return has_opt_flag(UniOptFlags::kMaskOps32Bit); - case 8: return has_opt_flag(UniOptFlags::kMaskOps64Bit); + case 1: return has_cpu_hint(CpuHints::kVecMaskedOps8); + case 2: return has_cpu_hint(CpuHints::kVecMaskedOps16); + case 4: return has_cpu_hint(CpuHints::kVecMaskedOps32); + case 8: return has_cpu_hint(CpuHints::kVecMaskedOps64); default: return false; } } -// ujit::UniCompiler - Function -// ============================= +// ujit::UniCompiler - Function +// ============================ -void UniCompiler::init_function(asmjit::FuncNode* func_node) noexcept { +void UniCompiler::init_function(FuncNode* func_node) noexcept { cc->add_func(func_node); _func_node = func_node; @@ -164,16 +167,16 @@ void UniCompiler::init_function(asmjit::FuncNode* func_node) noexcept { } } -// ujit::UniCompiler - Constants -// ============================== +// ujit::UniCompiler - Constants +// ============================= void UniCompiler::_init_vec_const_table_ptr() noexcept { - const void* global = &vec_const_table; + const void* ct_addr = ct_ptr(); if (!_common_table_ptr.is_valid()) { ScopedInjector injector(cc, &_func_init); _common_table_ptr = new_gpz("common_table_ptr"); - cc->mov(_common_table_ptr, (int64_t)global + _common_table_offset); + cc->mov(_common_table_ptr, (int64_t)ct_addr + _common_table_offset); } } @@ -183,7 +186,7 @@ x86::KReg UniCompiler::k_const(uint64_t value) noexcept { if (_k_reg[slot].is_valid() && _k_imm[slot] == value) return _k_reg[slot]; - asmjit::BaseNode* prevNode = nullptr; + BaseNode* prevNode = nullptr; Gp tmp; x86::KReg kReg; @@ -213,55 +216,58 @@ x86::KReg UniCompiler::k_const(uint64_t value) noexcept { } Operand UniCompiler::simd_const(const void* c, Bcst bcst_width, VecWidth const_width) noexcept { - size_t constCount = _vec_consts.size(); + size_t const_count = _vec_consts.size(); - for (size_t i = 0; i < constCount; i++) - if (_vec_consts[i].ptr == c) + for (size_t i = 0; i < const_count; i++) { + if (_vec_consts[i].ptr == c) { return Vec(signature_of_xmm_ymm_zmm[size_t(const_width)], _vec_consts[i].virt_reg_id); + } + } // We don't use memory constants when compiling for AVX-512, because we don't store 64-byte constants and AVX-512 // has enough registers to hold all the constants that we need. However, in SSE/AVX2 case, we don't want so many // constants in registers as that could limit registers that we need during fetching and composition. if (!has_avx512()) { - bool useVReg = (c == &ct.p_0000000000000000); // Required if the CPU doesn't have SSE4.1. 
- if (!useVReg) + bool use_vreg = (c == &ct().p_0000000000000000); // Required if the CPU doesn't have SSE4.1. + if (!use_vreg) { return simd_mem_const(c, bcst_width, const_width); + } } - return Vec(signature_of_xmm_ymm_zmm[size_t(const_width)], _new_vecConst(c, bcst_width == Bcst::kNA_Unique).id()); + return Vec(signature_of_xmm_ymm_zmm[size_t(const_width)], _new_vec_const(c, bcst_width == Bcst::kNA_Unique).id()); } Operand UniCompiler::simd_const(const void* c, Bcst bcst_width, const Vec& similar_to) noexcept { - VecWidth const_width = VecWidth(uint32_t(similar_to.reg_type()) - uint32_t(asmjit::RegType::kVec128)); + VecWidth const_width = VecWidth(uint32_t(similar_to.reg_type()) - uint32_t(RegType::kVec128)); return simd_const(c, bcst_width, const_width); } Operand UniCompiler::simd_const(const void* c, Bcst bcst_width, const VecArray& similar_to) noexcept { ASMJIT_ASSERT(!similar_to.is_empty()); - VecWidth const_width = VecWidth(uint32_t(similar_to[0].reg_type()) - uint32_t(asmjit::RegType::kVec128)); + VecWidth const_width = VecWidth(uint32_t(similar_to[0].reg_type()) - uint32_t(RegType::kVec128)); return simd_const(c, bcst_width, const_width); } Vec UniCompiler::simd_vec_const(const void* c, Bcst bcst_width, VecWidth const_width) noexcept { - size_t constCount = _vec_consts.size(); + size_t const_count = _vec_consts.size(); - for (size_t i = 0; i < constCount; i++) + for (size_t i = 0; i < const_count; i++) if (_vec_consts[i].ptr == c) return Vec(signature_of_xmm_ymm_zmm[size_t(const_width)], _vec_consts[i].virt_reg_id); - return Vec(signature_of_xmm_ymm_zmm[size_t(const_width)], _new_vecConst(c, bcst_width == Bcst::kNA_Unique).id()); + return Vec(signature_of_xmm_ymm_zmm[size_t(const_width)], _new_vec_const(c, bcst_width == Bcst::kNA_Unique).id()); } Vec UniCompiler::simd_vec_const(const void* c, Bcst bcst_width, const Vec& similar_to) noexcept { - VecWidth const_width = VecWidth(uint32_t(similar_to.reg_type()) - uint32_t(asmjit::RegType::kVec128)); + VecWidth const_width = VecWidth(uint32_t(similar_to.reg_type()) - uint32_t(RegType::kVec128)); return simd_vec_const(c, bcst_width, const_width); } Vec UniCompiler::simd_vec_const(const void* c, Bcst bcst_width, const VecArray& similar_to) noexcept { ASMJIT_ASSERT(!similar_to.is_empty()); - VecWidth const_width = VecWidth(uint32_t(similar_to[0].reg_type()) - uint32_t(asmjit::RegType::kVec128)); + VecWidth const_width = VecWidth(uint32_t(similar_to[0].reg_type()) - uint32_t(RegType::kVec128)); return simd_vec_const(c, bcst_width, const_width); } @@ -284,22 +290,22 @@ x86::Mem UniCompiler::simd_mem_const(const void* c, Bcst bcst_width, VecWidth co } x86::Mem UniCompiler::simd_mem_const(const void* c, Bcst bcst_width, const Vec& similar_to) noexcept { - VecWidth const_width = VecWidth(uint32_t(similar_to.reg_type()) - uint32_t(asmjit::RegType::kVec128)); + VecWidth const_width = VecWidth(uint32_t(similar_to.reg_type()) - uint32_t(RegType::kVec128)); return simd_mem_const(c, bcst_width, const_width); } x86::Mem UniCompiler::simd_mem_const(const void* c, Bcst bcst_width, const VecArray& similar_to) noexcept { ASMJIT_ASSERT(!similar_to.is_empty()); - VecWidth const_width = VecWidth(uint32_t(similar_to[0].reg_type()) - uint32_t(asmjit::RegType::kVec128)); + VecWidth const_width = VecWidth(uint32_t(similar_to[0].reg_type()) - uint32_t(RegType::kVec128)); return simd_mem_const(c, bcst_width, const_width); } x86::Mem UniCompiler::_get_mem_const(const void* c) noexcept { // Make sure we are addressing a constant from the `commonTable` constant 
pool.
-  const void* global = &vec_const_table;
-  ASMJIT_ASSERT((uintptr_t)c >= (uintptr_t)global &&
-                (uintptr_t)c < (uintptr_t)global + sizeof(VecConstTable));
+  const void* ct_addr = ct_ptr();
+  ASMJIT_ASSERT((uintptr_t)c >= (uintptr_t)ct_addr &&
+                (uintptr_t)c < (uintptr_t)ct_addr + _ct_ref.size);

  if (is_32bit()) {
    // 32-bit mode - These constants will never move in memory so the absolute addressing is a win/win as we can save
@@ -307,34 +313,34 @@ x86::Mem UniCompiler::_get_mem_const(const void* c) noexcept {
    return x86::ptr((uint64_t)c);
  }
  else {
-    // 64-bit mode - One GP register is sacrificed to hold the pointer to the `vec_const_table`. This is probably the
-    // safest approach as relying on absolute addressing or anything else could lead to problems or performance issues.
+    // 64-bit mode - One GP register is sacrificed to hold the pointer to the `ct`. This is probably the safest
+    // approach as relying on absolute addressing or anything else could lead to problems or performance issues.
    _init_vec_const_table_ptr();
-    int32_t disp = int32_t((intptr_t)c - (intptr_t)global);
+    int32_t disp = int32_t((intptr_t)c - (intptr_t)ct_addr);
    return x86::ptr(_common_table_ptr, disp - _common_table_offset);
  }
}

-Vec UniCompiler::_new_vecConst(const void* c, bool is_unique_const) noexcept {
+Vec UniCompiler::_new_vec_const(const void* c, bool is_unique_const) noexcept {
  Vec vec;
  const char* special_const_name = nullptr;

  if (special_const_name) {
-    vec = new_vec(vec_width(), special_const_name);
+    vec = new_vec_with_width(vec_width(), special_const_name);
  }
  else {
    uint64_t u0 = static_cast<const uint64_t*>(c)[0];
    uint64_t u1 = static_cast<const uint64_t*>(c)[1];

    if (u0 != u1)
-      vec = new_vec(vec_width(), "c_0x%016llX%016llX", (unsigned long long)u1, (unsigned long long)u0);
+      vec = new_vec_with_width(vec_width(), "c_0x%016llX%016llX", (unsigned long long)u1, (unsigned long long)u0);
    else if ((u0 >> 32) != (u0 & 0xFFFFFFFFu))
-      vec = new_vec(vec_width(), "c_0x%016llX", (unsigned long long)u0);
+      vec = new_vec_with_width(vec_width(), "c_0x%016llX", (unsigned long long)u0);
    else if (((u0 >> 16) & 0xFFFFu) != (u0 & 0xFFFFu))
-      vec = new_vec(vec_width(), "c_0x%08X", (unsigned)(u0 & 0xFFFFFFFFu));
+      vec = new_vec_with_width(vec_width(), "c_0x%08X", (unsigned)(u0 & 0xFFFFFFFFu));
    else
-      vec = new_vec(vec_width(), "c_0x%04X", (unsigned)(u0 & 0xFFFFu));
  }

  VecConstData const_data;
@@ -342,7 +348,7 @@ Vec UniCompiler::_new_vecConst(const void* c, bool is_unique_const) noexcept {
  const_data.virt_reg_id = vec.id();
  _vec_consts.append(arena(), const_data);

-  if (c == &ct.p_0000000000000000) {
+  if (c == &ct().p_0000000000000000) {
    ScopedInjector inject(cc, &_func_init);
    v_zero_i(vec.xmm());
  }
@@ -363,8 +369,8 @@ Vec UniCompiler::_new_vecConst(const void* c, bool is_unique_const) noexcept {
  return vec;
}

-// ujit::UniCompiler - Stack
-// ==========================
+// ujit::UniCompiler - Stack
+// =========================

x86::Mem UniCompiler::tmp_stack(StackId id, uint32_t size) noexcept {
  ASMJIT_ASSERT(Support::is_power_of_2(size));
@@ -379,8 +385,8 @@ x86::Mem UniCompiler::tmp_stack(StackId id, uint32_t size) noexcept {
  return stack;
}

-// ujit::UniCompiler - Utilities
-// ==============================
+// ujit::UniCompiler - Utilities
+// =============================

void UniCompiler::embed_jump_table(const Label* jump_table, size_t jump_table_size, const Label& jump_table_base, uint32_t entry_size) noexcept {
  static const uint8_t zeros[8] {};
@@ -393,8
+399,8 @@ void UniCompiler::embed_jump_table(const Label* jump_table, size_t jump_table_si } } -// ujit::UniCompiler - General Purpose Instructions - Conditions -// ============================================================== +// ujit::UniCompiler - General Purpose Instructions - Conditions +// ============================================================= static constexpr InstId condition_to_inst_id[size_t(UniOpCond::kMaxValue) + 1] = { Inst::kIdAnd, // UniOpCond::kAssignAnd @@ -408,14 +414,14 @@ static constexpr InstId condition_to_inst_id[size_t(UniOpCond::kMaxValue) + 1] = Inst::kIdCmp // UniOpCond::kCompare }; -class ConditionApplier : public Condition { +class ConditionApplier : public UniCondition { public: - ASMJIT_INLINE ConditionApplier(const Condition& condition) noexcept : Condition(condition) { + ASMJIT_INLINE ConditionApplier(const UniCondition& condition) noexcept : UniCondition(condition) { // The first operand must always be a register. ASMJIT_ASSERT(a.is_gp()); } - ASMJIT_NOINLINE void optimize(UniCompiler* pc) noexcept { + ASMJIT_NOINLINE void optimize(UniCompiler& uc) noexcept { switch (op) { case UniOpCond::kAssignShr: if (b.is_imm() && b.as().value() == 0) { @@ -448,7 +454,7 @@ public: // test on 64-bit hardware as it's guaranteed that any register index is encodable. On 32-bit hardware only the // first 4 registers can be used, which could mean that the register would have to be moved just to be tested, // which is something we would like to avoid. - if (pc->is_64bit() && bit_index < 8) { + if (uc.is_64bit() && bit_index < 8) { op = UniOpCond::kTest; b = Imm(1u << bit_index); cond = cond == CondCode::kC ? CondCode::kNZ : CondCode::kZ; @@ -466,8 +472,8 @@ public: cond = x86::reverse_cond(cond); } - ASMJIT_NOINLINE void emit(UniCompiler* pc) noexcept { - BackendCompiler* cc = pc->cc; + ASMJIT_NOINLINE void emit(UniCompiler& uc) noexcept { + BackendCompiler* cc = uc.cc; InstId inst_id = condition_to_inst_id[size_t(op)]; if (inst_id == Inst::kIdTest && cc->is_64bit()) { @@ -494,8 +500,8 @@ public: } }; -// ujit::UniCompiler - General Purpose Instructions - Emit -// ======================================================== +// ujit::UniCompiler - General Purpose Instructions - Emit +// ======================================================= void UniCompiler::emit_mov(const Gp& dst, const Operand_& src) noexcept { if (src.is_imm() && src.as().value() == 0) { @@ -642,16 +648,16 @@ void UniCompiler::emit_mr(UniOpMR op, const Mem& dst, const Gp& src) noexcept { cc->emit(op_info.inst_id, m, r); } -void UniCompiler::emit_cmov(const Gp& dst, const Operand_& sel, const Condition& condition) noexcept { +void UniCompiler::emit_cmov(const Gp& dst, const Operand_& sel, const UniCondition& condition) noexcept { ConditionApplier ca(condition); - ca.optimize(this); - ca.emit(this); + ca.optimize(*this); + ca.emit(*this); cc->emit(Inst::cmovcc_from_cond(ca.cond), dst, sel); } -void UniCompiler::emit_select(const Gp& dst, const Operand_& sel1_, const Operand_& sel2_, const Condition& condition) noexcept { +void UniCompiler::emit_select(const Gp& dst, const Operand_& sel1_, const Operand_& sel2_, const UniCondition& condition) noexcept { ConditionApplier ca(condition); - ca.optimize(this); + ca.optimize(*this); bool dst_is_a = ca.a.is_reg() && dst.id() == ca.a.as().id(); bool dst_is_b = ca.b.is_reg() && dst.id() == ca.b.as().id(); @@ -674,17 +680,17 @@ void UniCompiler::emit_select(const Gp& dst, const Operand_& sel1_, const Operan if (sel1.is_imm() && sel1.as().value() == 0 && !dst_is_a 
&& !dst_is_b && !dst_is_sel) { cc->xor_(dst, dst); - ca.emit(this); + ca.emit(*this); } else { - ca.emit(this); + ca.emit(*this); if (!dst_is_sel) cc->emit(Inst::kIdMov, dst, sel1); } if (sel2.is_imm()) { int64_t value = sel2.as().value(); - Mem sel2_mem = cc->new_const(asmjit::ConstPoolScope::kLocal, &value, dst.size()); + Mem sel2_mem = cc->new_const(ConstPoolScope::kLocal, &value, dst.size()); sel2 = sel2_mem; } @@ -1045,7 +1051,7 @@ void UniCompiler::emit_3i(UniOpRRR op, const Gp& dst, const Operand_& src1_, con case UniOpRRR::kRol: { if (has_bmi2()) { uint32_t reg_size = dst.size() * 8u; - uint32_t imm = (reg_size - b.value_as()) & asmjit::Support::lsb_mask(reg_size); + uint32_t imm = (reg_size - b.value_as()) & Support::lsb_mask(reg_size); cc->rorx(dst, a, imm); } else { @@ -1479,10 +1485,10 @@ void UniCompiler::emit_j(const Operand_& target) noexcept { cc->emit(Inst::kIdJmp, target); } -void UniCompiler::emit_j_if(const Label& target, const Condition& condition) noexcept { +void UniCompiler::emit_j_if(const Label& target, const UniCondition& condition) noexcept { ConditionApplier ca(condition); - ca.optimize(this); - ca.emit(this); + ca.optimize(*this); + ca.emit(*this); cc->j(ca.cond, target); } @@ -1581,7 +1587,7 @@ void UniCompiler::add_ext(const Gp& dst, const Gp& src_, const Gp& idx_, uint32_ case 2: case 4: case 8: - lea(dst, x86::ptr(src, idx, asmjit::Support::ctz(scale), disp)); + lea(dst, x86::ptr(src, idx, Support::ctz(scale), disp)); return; default: @@ -1608,20 +1614,20 @@ void UniCompiler::lea(const Gp& dst, const Mem& src) noexcept { Mem m(src); if (is_64bit() && dst.size() == 4) { - if (m.base_type() == asmjit::RegType::kGp32) { - m.set_base_type(asmjit::RegType::kGp64); + if (m.base_type() == RegType::kGp32) { + m.set_base_type(RegType::kGp64); } - if (m.index_type() == asmjit::RegType::kGp32) { - m.set_index_type(asmjit::RegType::kGp64); + if (m.index_type() == RegType::kGp32) { + m.set_index_type(RegType::kGp64); } } cc->lea(dst, m); } -// ujit::UniCompiler - Vector Instructions - Constants -// ==================================================== +// ujit::UniCompiler - Vector Instructions - Constants +// =================================================== //! Floating point mode is used in places that are generic and implement various functionality that needs more //! than a single instruction. Typically implementing either higher level concepts or missing functionality. 
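// Illustrative helper for the UniOpRRR::kRol-with-immediate path above: BMI2 provides
// RORX (a non-destructive rotate right) but no rotate-left form, so a left rotate by n
// is emitted as a right rotate by (reg_size - n) modulo reg_size. A scalar sketch of
// the identity (not part of the emitter):
#include <cstdint>
static inline uint32_t rotl32(uint32_t x, uint32_t n) noexcept {
  n &= 31u;
  return (x << n) | (x >> ((32u - n) & 31u));
}
static inline uint32_t rotl32_via_rotr(uint32_t x, uint32_t n) noexcept {
  uint32_t r = (32u - n) & 31u;                   // the immediate that would be passed to RORX
  return (x >> r) | (x << ((32u - r) & 31u));     // rotate right by r == rotate left by n
}
// For any x and n, rotl32(x, n) == rotl32_via_rotr(x, n).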
@@ -1692,8 +1698,17 @@ enum class NarrowingMode : uint32_t { kSaturateUToU }; -// ujit::UniCompiler - Vector Instructions - Broadcast / Shuffle Data -// =================================================================== +[[maybe_unused]] +static ASMJIT_INLINE bool is_scalar_fp_op(FloatMode fm) noexcept { return fm <= kF64S; } + +[[maybe_unused]] +static ASMJIT_INLINE bool is_f32_op(FloatMode fm) noexcept { return fm == kF32S || fm == kF32V; } + +[[maybe_unused]] +static ASMJIT_INLINE bool is_f64_op(FloatMode fm) noexcept { return fm == kF64S || fm == kF64V; } + +// ujit::UniCompiler - Vector Instructions - Broadcast / Shuffle Data +// ================================================================== static constexpr uint16_t avx512_vinsert_128[] = { Inst::kIdVinserti32x4, @@ -1709,8 +1724,8 @@ static constexpr uint16_t avx512_vshuf_128[] = { Inst::kIdVshuff64x2 }; -// ujit::UniCompiler - Vector Instructions - Integer Cmp/Min/Max Data -// =================================================================== +// ujit::UniCompiler - Vector Instructions - Integer Cmp/Min/Max Data +// ================================================================== struct CmpMinMaxInst { uint16_t peq; @@ -1741,8 +1756,8 @@ static constexpr CmpMinMaxInst avx_cmp_min_max[] = { { Inst::kIdVpcmpeqq, Inst::kIdVpcmpgtq, Inst::kIdVpminuq, Inst::kIdVpmaxuq }, }; -// ujit::UniCompiler - Vector Instructions - Integer Conversion Data -// ================================================================== +// ujit::UniCompiler - Vector Instructions - Integer Conversion Data +// ================================================================= struct WideningOpInfo { uint32_t mov : 16; @@ -1773,12 +1788,13 @@ static constexpr WideningOpInfo sse_int_widening_op_info[] = { { Inst::kIdPmovzxdq , Inst::kIdPunpckldq , Inst::kIdPunpckhdq , 0, 0 } // kU32ToU64. 
}; -// ujit::UniCompiler - Vector Instructions - Float Instruction Data -// ================================================================= +// ujit::UniCompiler - Vector Instructions - Float Instruction Data +// ================================================================ struct FloatInst { uint16_t fmovs; - uint16_t fmov; + uint16_t fmova; + uint16_t fmovu; uint16_t fand; uint16_t for_; uint16_t fxor; @@ -1791,6 +1807,7 @@ struct FloatInst { uint16_t fmax; uint16_t fcmp; uint16_t fround; + uint16_t frndscale; uint16_t psrl; uint16_t psll; }; @@ -1799,6 +1816,7 @@ static constexpr FloatInst sse_float_inst[4] = { { Inst::kIdMovss, Inst::kIdMovaps, + Inst::kIdMovups, Inst::kIdAndps, Inst::kIdOrps, Inst::kIdXorps, @@ -1811,12 +1829,14 @@ static constexpr FloatInst sse_float_inst[4] = { Inst::kIdMaxss, Inst::kIdCmpss, Inst::kIdRoundss, + Inst::kIdNone, Inst::kIdPsrld, Inst::kIdPslld }, { Inst::kIdMovsd, Inst::kIdMovaps, + Inst::kIdMovups, Inst::kIdAndpd, Inst::kIdOrpd, Inst::kIdXorpd, @@ -1829,12 +1849,14 @@ static constexpr FloatInst sse_float_inst[4] = { Inst::kIdMaxsd, Inst::kIdCmpsd, Inst::kIdRoundsd, + Inst::kIdNone, Inst::kIdPsrlq, Inst::kIdPsllq }, { Inst::kIdMovaps, Inst::kIdMovaps, + Inst::kIdMovups, Inst::kIdAndps, Inst::kIdOrps, Inst::kIdXorps, @@ -1847,12 +1869,14 @@ static constexpr FloatInst sse_float_inst[4] = { Inst::kIdMaxps, Inst::kIdCmpps, Inst::kIdRoundps, + Inst::kIdNone, Inst::kIdPsrld, Inst::kIdPslld }, { Inst::kIdMovaps, Inst::kIdMovaps, + Inst::kIdMovups, Inst::kIdAndpd, Inst::kIdOrpd, Inst::kIdXorpd, @@ -1865,6 +1889,7 @@ static constexpr FloatInst sse_float_inst[4] = { Inst::kIdMaxpd, Inst::kIdCmppd, Inst::kIdRoundpd, + Inst::kIdNone, Inst::kIdPsrlq, Inst::kIdPsllq } @@ -1874,6 +1899,7 @@ static constexpr FloatInst avx_float_inst[4] = { { Inst::kIdVmovss, Inst::kIdVmovaps, + Inst::kIdVmovups, Inst::kIdVandps, Inst::kIdVorps, Inst::kIdVxorps, @@ -1886,12 +1912,14 @@ static constexpr FloatInst avx_float_inst[4] = { Inst::kIdVmaxss, Inst::kIdVcmpss, Inst::kIdVroundss, + Inst::kIdVrndscaless, Inst::kIdVpsrld, Inst::kIdVpslld }, { Inst::kIdVmovsd, Inst::kIdVmovaps, + Inst::kIdVmovups, Inst::kIdVandpd, Inst::kIdVorpd, Inst::kIdVxorpd, @@ -1904,12 +1932,14 @@ static constexpr FloatInst avx_float_inst[4] = { Inst::kIdVmaxsd, Inst::kIdVcmpsd, Inst::kIdVroundsd, + Inst::kIdVrndscalesd, Inst::kIdVpsrlq, Inst::kIdVpsllq }, { Inst::kIdVmovaps, Inst::kIdVmovaps, + Inst::kIdVmovups, Inst::kIdVandps, Inst::kIdVorps, Inst::kIdVxorps, @@ -1922,12 +1952,14 @@ static constexpr FloatInst avx_float_inst[4] = { Inst::kIdVmaxps, Inst::kIdVcmpps, Inst::kIdVroundps, + Inst::kIdVrndscaleps, Inst::kIdVpsrld, Inst::kIdVpslld }, { Inst::kIdVmovaps, Inst::kIdVmovaps, + Inst::kIdVmovups, Inst::kIdVandpd, Inst::kIdVorpd, Inst::kIdVxorpd, @@ -1940,13 +1972,14 @@ static constexpr FloatInst avx_float_inst[4] = { Inst::kIdVmaxpd, Inst::kIdVcmppd, Inst::kIdVroundpd, + Inst::kIdVrndscalepd, Inst::kIdVpsrlq, Inst::kIdVpsllq } }; -// ujit::UniCompiler - Vector Instructions - UniOp Information -// ============================================================= +// ujit::UniCompiler - Vector Instructions - UniOp Information +// =========================================================== struct UniOpVInfo { //! \name Members @@ -2029,8 +2062,12 @@ static constexpr UniOpVInfo opcode_info_2v[size_t(UniOpVV::kMaxValue) + 1] = { DEFINE_OP(kIdPmovsxdq , 0, kIntrin, kIdVpmovsxdq , kIntrin , 0, 0, kNone, 0, 0x00u, kNone, k64, 0, kNA), // kCvtI32HiToI64. 
DEFINE_OP(kIdPmovzxdq , 0, kIntrin, kIdVpmovzxdq , kIntrin , 0, 0, kNone, 0, 0x00u, kNone, k64, 0, kNA), // kCvtU32LoToU64. DEFINE_OP(kIdPmovzxdq , 0, kIntrin, kIdVpmovzxdq , kIntrin , 0, 0, kNone, 0, 0x00u, kNone, k64, 0, kNA), // kCvtU32HiToU64. + DEFINE_OP(kIdAndps , 0, kIntrin, kIdVandps , kIntrin , 0, 0, kNone, 0, 0x00u, kF32S, k32, 4, kNA), // kAbsF32S. + DEFINE_OP(kIdAndpd , 0, kIntrin, kIdVandpd , kIntrin , 0, 0, kNone, 0, 0x00u, kF64S, k64, 8, kNA), // kAbsF64S. DEFINE_OP(kIdAndps , 0, kIntrin, kIdVandps , kIntrin , 0, 0, kNone, 0, 0x00u, kF32V, k32, 4, kNA), // kAbsF32. DEFINE_OP(kIdAndpd , 0, kIntrin, kIdVandpd , kIntrin , 0, 0, kNone, 0, 0x00u, kF64V, k64, 8, kNA), // kAbsF64. + DEFINE_OP(kIdXorps , 0, kIntrin, kIdVxorps , kIntrin , 0, 0, kNone, 0, 0x00u, kF32S, k32, 4, kNA), // kNegF32S. + DEFINE_OP(kIdXorpd , 0, kIntrin, kIdVxorpd , kIntrin , 0, 0, kNone, 0, 0x00u, kF64S, k64, 8, kNA), // kNegF64S. DEFINE_OP(kIdXorps , 0, kIntrin, kIdVxorps , kIntrin , 0, 0, kNone, 0, 0x00u, kF32V, k32, 4, kNA), // kNegF32. DEFINE_OP(kIdXorpd , 0, kIntrin, kIdVxorpd , kIntrin , 0, 0, kNone, 0, 0x00u, kF64V, k64, 8, kNA), // kNegF64. DEFINE_OP(kIdNone , 0, kIntrin, kIdNone , kIntrin , 0, 0, kNone, 0, 0x00u, kNone, k32, 4, kNA), // kAbsU32. @@ -2047,10 +2084,18 @@ static constexpr UniOpVInfo opcode_info_2v[size_t(UniOpVV::kMaxValue) + 1] = { DEFINE_OP(kIdRoundsd , 2, kIntrin, kIdVroundsd , kIntrin , 0, 0, kNone, 1, 0x0Au, kF64S, k64, 8, kNA), // kCeilF64S. DEFINE_OP(kIdRoundps , 2, kIntrin, kIdVroundps , kIntrin , 0, 0, kNone, 1, 0x0Au, kF32V, k32, 4, kNA), // kCeilF32. DEFINE_OP(kIdRoundpd , 2, kIntrin, kIdVroundpd , kIntrin , 0, 0, kNone, 1, 0x0Au, kF64V, k64, 8, kNA), // kCeilF64. - DEFINE_OP(kIdRoundss , 2, kIntrin, kIdVroundss , kIntrin , 0, 0, kNone, 1, 0x08u, kF32S, k32, 4, kNA), // kRoundF32S. - DEFINE_OP(kIdRoundsd , 2, kIntrin, kIdVroundsd , kIntrin , 0, 0, kNone, 1, 0x08u, kF64S, k64, 8, kNA), // kRoundF64S. - DEFINE_OP(kIdRoundps , 2, kIntrin, kIdVroundps , kIntrin , 0, 0, kNone, 1, 0x08u, kF32V, k32, 4, kNA), // kRoundF32. - DEFINE_OP(kIdRoundpd , 2, kIntrin, kIdVroundpd , kIntrin , 0, 0, kNone, 1, 0x08u, kF64V, k64, 8, kNA), // kRoundF64. + DEFINE_OP(kIdRoundss , 2, kIntrin, kIdVroundss , kIntrin , 0, 0, kNone, 1, 0x08u, kF32S, k32, 4, kNA), // kRoundEvenF32S. + DEFINE_OP(kIdRoundsd , 2, kIntrin, kIdVroundsd , kIntrin , 0, 0, kNone, 1, 0x08u, kF64S, k64, 8, kNA), // kRoundEvenF64S. + DEFINE_OP(kIdRoundps , 2, kIntrin, kIdVroundps , kIntrin , 0, 0, kNone, 1, 0x08u, kF32V, k32, 4, kNA), // kRoundEvenF32. + DEFINE_OP(kIdRoundpd , 2, kIntrin, kIdVroundpd , kIntrin , 0, 0, kNone, 1, 0x08u, kF64V, k64, 8, kNA), // kRoundEvenF64. + DEFINE_OP(kIdRoundss , 2, kIntrin, kIdVroundss , kIntrin , 0, 0, kNone, 1, 0x0Bu, kF32S, k32, 4, kNA), // kRoundHalfAwayF32S. + DEFINE_OP(kIdRoundsd , 2, kIntrin, kIdVroundsd , kIntrin , 0, 0, kNone, 1, 0x0Bu, kF64S, k64, 8, kNA), // kRoundHalfAwayF64S. + DEFINE_OP(kIdRoundps , 2, kIntrin, kIdVroundps , kIntrin , 0, 0, kNone, 1, 0x0Bu, kF32V, k32, 4, kNA), // kRoundHalfAwayF32. + DEFINE_OP(kIdRoundpd , 2, kIntrin, kIdVroundpd , kIntrin , 0, 0, kNone, 1, 0x0Bu, kF64V, k64, 8, kNA), // kRoundHalfAwayF64. + DEFINE_OP(kIdRoundss , 2, kIntrin, kIdVroundss , kIntrin , 0, 0, kNone, 1, 0x09u, kF32S, k32, 4, kNA), // kRoundHalfUpF32S. + DEFINE_OP(kIdRoundsd , 2, kIntrin, kIdVroundsd , kIntrin , 0, 0, kNone, 1, 0x09u, kF64S, k64, 8, kNA), // kRoundHalfUpF64S. 
+ DEFINE_OP(kIdRoundps , 2, kIntrin, kIdVroundps , kIntrin , 0, 0, kNone, 1, 0x09u, kF32V, k32, 4, kNA), // kRoundHalfUpF32. + DEFINE_OP(kIdRoundpd , 2, kIntrin, kIdVroundpd , kIntrin , 0, 0, kNone, 1, 0x09u, kF64V, k64, 8, kNA), // kRoundHalfUpF64. DEFINE_OP(kIdNone , 0, kIntrin, kIdNone , kIntrin , 0, 0, kNone, 0, 0x00u, kNone, k32, 4, kNA), // kRcpF32. DEFINE_OP(kIdNone , 0, kIntrin, kIdNone , kIntrin , 0, 0, kNone, 0, 0x00u, kNone, k64, 8, kNA), // kRcpF64. DEFINE_OP(kIdSqrtss , 2, kIntrin, kIdVsqrtss , kIntrin , 0, 0, kNone, 0, 0x00u, kF32S, k32, 4, kNA), // kSqrtF32S. @@ -2239,6 +2284,10 @@ static constexpr UniOpVInfo opcode_info_3v[size_t(UniOpVVV::kMaxValue) + 1] = { DEFINE_OP(kIdDivsd , 2, kSSE2 , kIdVdivsd , kAVX , 0, 0, kNone, 0, 0x00u, kF64S, k64, 8, kNA), // kDivF64S. DEFINE_OP(kIdDivps , 2, kSSE2 , kIdVdivps , kAVX , 0, 0, kNone, 0, 0x00u, kF32V, k32, 4, kNA), // kDivF32. DEFINE_OP(kIdDivpd , 2, kSSE2 , kIdVdivpd , kAVX , 0, 0, kNone, 0, 0x00u, kF64V, k64, 8, kNA), // kDivF64. + DEFINE_OP(kIdNone , 0, kIntrin, kIdNone , kIntrin , 0, 0, kNone, 0, 0x00u, kF32S, k32, 4, kNA), // kModF32S. + DEFINE_OP(kIdNone , 0, kIntrin, kIdNone , kIntrin , 0, 0, kNone, 0, 0x00u, kF64S, k64, 8, kNA), // kModF64S. + DEFINE_OP(kIdNone , 0, kIntrin, kIdNone , kIntrin , 0, 0, kNone, 0, 0x00u, kF32V, k32, 4, kNA), // kModF32. + DEFINE_OP(kIdNone , 0, kIntrin, kIdNone , kIntrin , 0, 0, kNone, 0, 0x00u, kF64V, k64, 8, kNA), // kModF64. DEFINE_OP(kIdMinss , 2, kSSE2 , kIdVminss , kAVX , 0, 0, kSrc , 0, 0x00u, kF32S, k32, 4, kNA), // kMinF32S. DEFINE_OP(kIdMinsd , 2, kSSE2 , kIdVminsd , kAVX , 0, 0, kSrc , 0, 0x00u, kF64S, k64, 8, kNA), // kMinF64S. DEFINE_OP(kIdMinps , 2, kSSE2 , kIdVminps , kAVX , 0, 0, kSrc , 0, 0x00u, kF32V, k32, 4, kNA), // kMinF32. @@ -2537,11 +2586,11 @@ static constexpr UniOpVMInfo opcode_info_2mv[size_t(UniOpMV::kMaxValue) + 1] = { #undef DEFINE_OP -// ujit::UniCompiler - Vector Instructions - Utility Functions -// ============================================================ +// ujit::UniCompiler - Vector Instructions - Utility Functions +// =========================================================== -static ASMJIT_NOINLINE void UniCompiler_loadInto(UniCompiler* pc, const Vec& vec, const Mem& mem, uint32_t broadcast_size = 0) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void UniCompiler_load_into(UniCompiler& uc, const Vec& vec, const Mem& mem, uint32_t broadcast_size = 0) noexcept { + BackendCompiler* cc = uc.cc; Mem m(mem); if (mem.has_broadcast() && broadcast_size) { @@ -2559,7 +2608,7 @@ static ASMJIT_NOINLINE void UniCompiler_loadInto(UniCompiler* pc, const Vec& vec m.set_size(vec.size()); if (vec.is_vec512()) cc->vmovdqu32(vec, m); - else if (pc->has_avx()) + else if (uc.has_avx()) cc->vmovdqu(vec, m); else cc->movdqu(vec, m); @@ -2568,24 +2617,24 @@ static ASMJIT_NOINLINE void UniCompiler_loadInto(UniCompiler* pc, const Vec& vec // TODO: Unused for now... 
[[maybe_unused]] -static ASMJIT_NOINLINE void UniCompiler_moveToDst(UniCompiler* pc, const Vec& dst, const Operand_& src, uint32_t broadcast_size = 0) noexcept { +static ASMJIT_NOINLINE void UniCompiler_move_to_dst(UniCompiler& uc, const Vec& dst, const Operand_& src, uint32_t broadcast_size = 0) noexcept { if (src.is_reg()) { ASMJIT_ASSERT(src.is_vec()); if (dst.id() != src.as().id()) { - pc->v_mov(dst, src); + uc.v_mov(dst, src); } } else if (src.is_mem()) { - UniCompiler_loadInto(pc, dst, src.as(), broadcast_size); + UniCompiler_load_into(uc, dst, src.as(), broadcast_size); } else { ASMJIT_NOT_REACHED(); } } -static ASMJIT_NOINLINE Vec UniCompiler_loadNew(UniCompiler* pc, const Vec& ref, const Mem& mem, uint32_t broadcast_size = 0) noexcept { - Vec vec = pc->new_similar_reg(ref, "@vec_m"); - UniCompiler_loadInto(pc, vec, mem, broadcast_size); +static ASMJIT_NOINLINE Vec UniCompiler_load_new(UniCompiler& uc, const Vec& ref, const Mem& mem, uint32_t broadcast_size = 0) noexcept { + Vec vec = uc.new_similar_reg(ref, "@vec_m"); + UniCompiler_load_into(uc, vec, mem, broadcast_size); return vec; } @@ -2593,35 +2642,75 @@ static ASMJIT_INLINE bool is_same_vec(const Vec& a, const Operand_& b) noexcept return b.is_reg() && a.id() == b.as().id(); } -static ASMJIT_NOINLINE void sse_mov(UniCompiler* pc, const Vec& dst, const Operand_& src) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_INLINE Operand get_fop_one(UniCompiler& uc, const Vec& dst, FloatMode fm) noexcept { + Operand op; + if (is_f32_op(fm)) + op = uc.simd_const(&uc.ct().f32_1, Bcst::k32, dst); + else + op = uc.simd_const(&uc.ct().f64_1, Bcst::k64, dst); + return op; +} + +static ASMJIT_INLINE Operand get_fop_half_minus_1ulp(UniCompiler& uc, const Vec& dst, FloatMode fm) noexcept { + Operand op; + if (is_f32_op(fm)) + op = uc.simd_const(&uc.ct().f32_0_5_minus_1ulp, Bcst::k32, dst); + else + op = uc.simd_const(&uc.ct().f64_0_5_minus_1ulp, Bcst::k64, dst); + return op; +} + +static ASMJIT_INLINE Operand get_fop_round_magic(UniCompiler& uc, const Vec& dst, FloatMode fm) noexcept { + Operand op; + if (is_f32_op(fm)) + op = uc.simd_const(&uc.ct().f32_round_magic, Bcst::k32, dst); + else + op = uc.simd_const(&uc.ct().f64_round_magic, Bcst::k64, dst); + return op; +} + +static ASMJIT_INLINE Operand get_fop_msb_bit(UniCompiler& uc, const Vec& dst, FloatMode fm) noexcept { + Operand op; + if (is_f32_op(fm)) + op = uc.simd_const(&uc.ct().p_8000000080000000, Bcst::k32, dst); + else + op = uc.simd_const(&uc.ct().p_8000000000000000, Bcst::k64, dst); + return op; +} + +static ASMJIT_NOINLINE void sse_mov(UniCompiler& uc, const Vec& dst, const Operand_& src) noexcept { + BackendCompiler* cc = uc.cc; if (src.is_mem()) cc->emit(Inst::kIdMovups, dst, src); else if (dst.id() != src.id()) cc->emit(Inst::kIdMovaps, dst, src); } -static ASMJIT_NOINLINE void sse_fmov(UniCompiler* pc, const Vec& dst, const Operand_& src, FloatMode fm) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void sse_fmov(UniCompiler& uc, const Vec& dst, const Operand_& src, FloatMode fm) noexcept { + BackendCompiler* cc = uc.cc; if (src.is_reg()) { - if (dst.id() != src.id()) + if (dst.id() != src.id()) { cc->emit(Inst::kIdMovaps, dst, src); + } + } + else if (is_scalar_fp_op(fm)) { + cc->emit(sse_float_inst[size_t(fm)].fmovs, dst, src); } else { - cc->emit(sse_float_inst[size_t(fm)].fmovs, dst, src); + cc->emit(sse_float_inst[size_t(fm)].fmovu, dst, src); } } -static ASMJIT_NOINLINE Vec sse_copy(UniCompiler* pc, const Vec& vec, const char* name) noexcept 
{ - Vec copy = pc->new_similar_reg(vec, name); - pc->cc->emit(Inst::kIdMovaps, copy, vec); +static ASMJIT_NOINLINE Vec sse_copy(UniCompiler& uc, const Vec& vec, const char* name) noexcept { + Vec copy = uc.new_similar_reg(vec, name); + uc.cc->emit(Inst::kIdMovaps, copy, vec); return copy; } -static ASMJIT_NOINLINE void sse_make_vec(UniCompiler* pc, Operand_& op, const char* name) noexcept { +static ASMJIT_NOINLINE void sse_make_vec(UniCompiler& uc, Operand_& op, const char* name) noexcept { if (op.is_mem()) { - Vec tmp = pc->new_vec128(name); - sse_mov(pc, tmp, op); + Vec tmp = uc.new_vec128(name); + sse_mov(uc, tmp, op); op = tmp; } } @@ -2645,59 +2734,59 @@ static ASMJIT_INLINE uint32_t shuf_imm4_from_swizzle(Swizzle2 s) noexcept { return x86::shuffle_imm(imm1 * 2u + 1u, imm1 * 2u, imm0 * 2u + 1u, imm0 * 2u); } -static ASMJIT_NOINLINE void sse_bit_not(UniCompiler* pc, const Vec& dst, const Operand_& src) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void sse_bit_not(UniCompiler& uc, const Vec& dst, const Operand_& src) noexcept { + BackendCompiler* cc = uc.cc; - sse_mov(pc, dst, src); - Operand ones = pc->simd_const(&pc->ct.p_FFFFFFFFFFFFFFFF, Bcst::k32, dst); + sse_mov(uc, dst, src); + Operand ones = uc.simd_const(&uc.ct().p_FFFFFFFFFFFFFFFF, Bcst::k32, dst); cc->emit(Inst::kIdPxor, dst, ones); } -static ASMJIT_NOINLINE void sse_msb_flip(UniCompiler* pc, const Vec& dst, const Operand_& src, ElementSize sz) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void sse_msb_flip(UniCompiler& uc, const Vec& dst, const Operand_& src, ElementSize sz) noexcept { + BackendCompiler* cc = uc.cc; const void* msk_data {}; switch (sz) { - case ElementSize::k8 : msk_data = &pc->ct.p_8080808080808080; break; - case ElementSize::k16: msk_data = &pc->ct.p_8000800080008000; break; - case ElementSize::k32: msk_data = &pc->ct.p_8000000080000000; break; - case ElementSize::k64: msk_data = &pc->ct.p_8000000000000000; break; + case ElementSize::k8 : msk_data = &uc.ct().p_8080808080808080; break; + case ElementSize::k16: msk_data = &uc.ct().p_8000800080008000; break; + case ElementSize::k32: msk_data = &uc.ct().p_8000000080000000; break; + case ElementSize::k64: msk_data = &uc.ct().p_8000000000000000; break; default: ASMJIT_NOT_REACHED(); } - Operand msk = pc->simd_const(msk_data, Bcst::kNA, dst); - sse_mov(pc, dst, src); + Operand msk = uc.simd_const(msk_data, Bcst::kNA, dst); + sse_mov(uc, dst, src); cc->emit(Inst::kIdPxor, dst, msk); } -static ASMJIT_NOINLINE void sse_fsign_flip(UniCompiler* pc, const Vec& dst, const Operand_& src, FloatMode fm) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void sse_fsign_flip(UniCompiler& uc, const Vec& dst, const Operand_& src, FloatMode fm) noexcept { + BackendCompiler* cc = uc.cc; const FloatInst& fi = sse_float_inst[size_t(fm)]; Operand msk; switch (fm) { - case FloatMode::kF32S: msk = pc->simd_const(&pc->ct.sign32_scalar, Bcst::k32, dst); break; - case FloatMode::kF64S: msk = pc->simd_const(&pc->ct.sign64_scalar, Bcst::k64, dst); break; - case FloatMode::kF32V: msk = pc->simd_const(&pc->ct.p_8000000080000000, Bcst::k32, dst); break; - case FloatMode::kF64V: msk = pc->simd_const(&pc->ct.p_8000000000000000, Bcst::k64, dst); break; + case FloatMode::kF32S: msk = uc.simd_const(&uc.ct().sign32_scalar, Bcst::k32, dst); break; + case FloatMode::kF64S: msk = uc.simd_const(&uc.ct().sign64_scalar, Bcst::k64, dst); break; + case FloatMode::kF32V: msk = uc.simd_const(&uc.ct().p_8000000080000000, Bcst::k32, dst); break; + 
case FloatMode::kF64V: msk = uc.simd_const(&uc.ct().p_8000000000000000, Bcst::k64, dst); break; default: ASMJIT_NOT_REACHED(); } - sse_fmov(pc, dst, src, fm); + sse_fmov(uc, dst, src, fm); cc->emit(fi.fxor, dst, msk); } // Possibly the best solution: // https://stackoverflow.com/questions/65166174/how-to-simulate-pcmpgtq-on-sse2 -static ASMJIT_NOINLINE void sse_cmp_gt_i64(UniCompiler* pc, const Vec& dst, const Operand_& a, const Operand_& b) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void sse_cmp_gt_i64(UniCompiler& uc, const Vec& dst, const Operand_& a, const Operand_& b) noexcept { + BackendCompiler* cc = uc.cc; - if (pc->has_sse4_2()) { + if (uc.has_sse4_2()) { if (is_same_vec(dst, a)) { cc->emit(Inst::kIdPcmpgtq, dst, b); } @@ -2705,9 +2794,9 @@ static ASMJIT_NOINLINE void sse_cmp_gt_i64(UniCompiler* pc, const Vec& dst, cons Operand_ second = b; if (is_same_vec(dst, b)) { second = cc->new_similar_reg(dst, "@tmp"); - sse_mov(pc, second.as(), b); + sse_mov(uc, second.as(), b); } - sse_mov(pc, dst, a); + sse_mov(uc, dst, a); cc->emit(Inst::kIdPcmpgtq, dst, second); } } @@ -2722,13 +2811,13 @@ static ASMJIT_NOINLINE void sse_cmp_gt_i64(UniCompiler* pc, const Vec& dst, cons cc->emit(Inst::kIdPand, tmp1, tmp2); if (!is_same_vec(dst, b)) { - sse_mov(pc, dst, a); + sse_mov(uc, dst, a); cc->emit(Inst::kIdPcmpgtd, dst, b); cc->emit(Inst::kIdPor, dst, tmp1); cc->emit(Inst::kIdPshufd, dst, dst, x86::shuffle_imm(3, 3, 1, 1)); } else { - sse_mov(pc, tmp2, a); + sse_mov(uc, tmp2, a); cc->emit(Inst::kIdPcmpgtd, tmp2, b); cc->emit(Inst::kIdPor, tmp2, tmp1); cc->emit(Inst::kIdPshufd, dst, tmp2, x86::shuffle_imm(3, 3, 1, 1)); @@ -2738,22 +2827,22 @@ static ASMJIT_NOINLINE void sse_cmp_gt_i64(UniCompiler* pc, const Vec& dst, cons // Possibly the best solution: // https://stackoverflow.com/questions/65441496/what-is-the-most-efficient-way-to-do-unsigned-64-bit-comparison-on-sse2 -static ASMJIT_NOINLINE void sse_cmp_gt_u64(UniCompiler* pc, const Vec& dst, const Operand_& a, const Operand_& b) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void sse_cmp_gt_u64(UniCompiler& uc, const Vec& dst, const Operand_& a, const Operand_& b) noexcept { + BackendCompiler* cc = uc.cc; - if (pc->has_sse4_2()) { - Operand msk = pc->simd_const(&pc->ct.p_8000000000000000, Bcst::k64, dst); + if (uc.has_sse4_2()) { + Operand msk = uc.simd_const(&uc.ct().p_8000000000000000, Bcst::k64, dst); Vec tmp = cc->new_similar_reg(dst, "@tmp"); if (is_same_vec(dst, a)) { - sse_mov(pc, tmp, msk); + sse_mov(uc, tmp, msk); cc->emit(Inst::kIdPxor, dst, tmp); cc->emit(Inst::kIdPxor, tmp, b); cc->emit(Inst::kIdPcmpgtq, dst, tmp); } else { - sse_mov(pc, tmp, b); - sse_mov(pc, dst, a); + sse_mov(uc, tmp, b); + sse_mov(uc, dst, a); cc->emit(Inst::kIdPxor, dst, msk); cc->emit(Inst::kIdPxor, tmp, msk); cc->emit(Inst::kIdPcmpgtq, dst, tmp); @@ -2764,8 +2853,8 @@ static ASMJIT_NOINLINE void sse_cmp_gt_u64(UniCompiler* pc, const Vec& dst, cons Vec tmp2 = cc->new_similar_reg(dst, "@tmp2"); Vec tmp3 = cc->new_similar_reg(dst, "@tmp3"); - sse_mov(pc, tmp1, b); // tmp1 = b; - sse_mov(pc, tmp2, a); // tmp2 = a; + sse_mov(uc, tmp1, b); // tmp1 = b; + sse_mov(uc, tmp2, a); // tmp2 = a; cc->emit(Inst::kIdMovaps, tmp3, tmp1); // tmp3 = b; cc->emit(Inst::kIdPsubq, tmp3, tmp2); // tmp3 = b - a cc->emit(Inst::kIdPxor, tmp2, tmp1); // tmp2 = b ^ a @@ -2777,26 +2866,26 @@ static ASMJIT_NOINLINE void sse_cmp_gt_u64(UniCompiler* pc, const Vec& dst, cons } } -static ASMJIT_NOINLINE void sse_select(UniCompiler* pc, const Vec& 
dst, const Vec& a, const Operand_& b, const Vec& msk) noexcept { - BackendCompiler* cc = pc->cc; - sse_mov(pc, dst, a); +static ASMJIT_NOINLINE void sse_select(UniCompiler& uc, const Vec& dst, const Vec& a, const Operand_& b, const Vec& msk) noexcept { + BackendCompiler* cc = uc.cc; + sse_mov(uc, dst, a); cc->emit(Inst::kIdPand, dst, msk); cc->emit(Inst::kIdPandn, msk, b); cc->emit(Inst::kIdPor, dst, msk); } -static ASMJIT_NOINLINE void sse_int_widen(UniCompiler* pc, const Vec& dst, const Vec& src, WideningOp cvt) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void sse_int_widen(UniCompiler& uc, const Vec& dst, const Vec& src, WideningOp cvt) noexcept { + BackendCompiler* cc = uc.cc; WideningOpInfo cvt_info = sse_int_widening_op_info[size_t(cvt)]; - if (pc->has_sse4_1()) { + if (uc.has_sse4_1()) { cc->emit(cvt_info.mov, dst, src); return; } if (!cvt_info.sign_extends && cvt_info.unpack_lo != Inst::kIdNone) { - Operand zero = pc->simd_const(&pc->ct.p_0000000000000000, Bcst::kNA, dst); - sse_mov(pc, dst, src); + Operand zero = uc.simd_const(&uc.ct().p_0000000000000000, Bcst::kNA, dst); + sse_mov(uc, dst, src); cc->emit(cvt_info.unpack_lo, dst, zero); return; } @@ -2816,8 +2905,8 @@ static ASMJIT_NOINLINE void sse_int_widen(UniCompiler* pc, const Vec& dst, const } case WideningOp::kU8ToU32: { - Operand zero = pc->simd_const(&pc->ct.p_0000000000000000, Bcst::kNA, dst); - sse_mov(pc, dst, src); + Operand zero = uc.simd_const(&uc.ct().p_0000000000000000, Bcst::kNA, dst); + sse_mov(uc, dst, src); cc->emit(Inst::kIdPunpcklbw, dst, zero); cc->emit(Inst::kIdPunpcklwd, dst, zero); @@ -2825,8 +2914,8 @@ static ASMJIT_NOINLINE void sse_int_widen(UniCompiler* pc, const Vec& dst, const } case WideningOp::kU8ToU64: { - Operand zero = pc->simd_const(&pc->ct.p_0000000000000000, Bcst::kNA, dst); - sse_mov(pc, dst, src); + Operand zero = uc.simd_const(&uc.ct().p_0000000000000000, Bcst::kNA, dst); + sse_mov(uc, dst, src); cc->emit(Inst::kIdPunpcklbw, dst, zero); cc->emit(Inst::kIdPunpcklwd, dst, zero); @@ -2841,9 +2930,9 @@ static ASMJIT_NOINLINE void sse_int_widen(UniCompiler* pc, const Vec& dst, const } case WideningOp::kI32ToI64: { - Vec tmp = pc->new_similar_reg(dst, "@tmp"); - sse_mov(pc, tmp, src); - sse_mov(pc, dst, src); + Vec tmp = uc.new_similar_reg(dst, "@tmp"); + sse_mov(uc, tmp, src); + sse_mov(uc, dst, src); cc->psrad(tmp, 31); cc->punpckldq(dst, tmp); return; @@ -2854,55 +2943,49 @@ static ASMJIT_NOINLINE void sse_int_widen(UniCompiler* pc, const Vec& dst, const } } -static ASMJIT_NOINLINE void sse_round(UniCompiler* pc, const Vec& dst, const Operand& src, FloatMode fm, x86::RoundImm round_mode) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void sse_round(UniCompiler& uc, const Vec& dst, const Operand& src, FloatMode fm, x86::RoundImm round_mode) noexcept { + BackendCompiler* cc = uc.cc; uint32_t is_f32 = fm == FloatMode::kF32S || fm == FloatMode::kF32V; const FloatInst& fi = sse_float_inst[size_t(fm)]; // NOTE: This may be dead code as the compiler handles this case well, however, if this function is // called as a helper we don't want to emit a longer sequence if we can just use a single instruction. 
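// Illustrative scalar model of the sign-flip trick used by sse_cmp_gt_u64 above:
// PCMPGTQ is a signed comparison, so both operands are first XOR-ed with the
// 0x8000000000000000 constant, which maps unsigned order onto signed order.
// This sketch is for illustration only and is not part of the emitter:
#include <cstdint>
static inline bool cmp_gt_u64_model(uint64_t a, uint64_t b) noexcept {
  const uint64_t msb = 0x8000000000000000u;
  return int64_t(a ^ msb) > int64_t(b ^ msb);   // equivalent to (a > b) on the unsigned values
}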
- if (pc->has_sse4_1()) { + if (uc.has_sse4_1()) { cc->emit(fi.fround, dst, src, round_mode | x86::RoundImm::kSuppress); return; } - Operand maxn; - // round_max (f32) == 0x4B000000 // round_max (f64) == 0x4330000000000000 - if (fm == FloatMode::kF32S || fm == FloatMode::kF32V) - maxn = pc->simd_const(&pc->ct.f32_round_max, Bcst::k32, dst); - else - maxn = pc->simd_const(&pc->ct.f64_round_max, Bcst::k64, dst); + Operand maxn = get_fop_round_magic(uc, dst, fm); - Vec t1 = pc->new_similar_reg(dst, "@t1"); - Vec t2 = pc->new_similar_reg(dst, "@t2"); - Vec t3 = pc->new_similar_reg(dst, "@t3"); + Vec t1 = uc.new_similar_reg(dst, "@t1"); + Vec t2 = uc.new_similar_reg(dst, "@t2"); + Vec t3 = uc.new_similar_reg(dst, "@t3"); - // Special cases first - float32/float64 truncation can use float32->int32->float32 conversion. if (round_mode == x86::RoundImm::kTrunc) { if (fm == FloatMode::kF32S || (fm == FloatMode::kF64S && cc->is_64bit())) { Gp r; Operand msb; if (fm == FloatMode::kF32S) { - r = pc->new_gp32("@gp_tmp"); - msb = pc->simd_const(&pc->ct.p_8000000080000000, Bcst::k32, dst); + r = uc.new_gp32("@gp_tmp"); + msb = uc.simd_const(&uc.ct().p_8000000080000000, Bcst::k32, dst); } else { - r = pc->new_gp64("@gp_tmp"); - msb = pc->simd_const(&pc->ct.p_8000000000000000, Bcst::k64, dst); + r = uc.new_gp64("@gp_tmp"); + msb = uc.simd_const(&uc.ct().p_8000000000000000, Bcst::k64, dst); } - sse_fmov(pc, dst, src, fm); + sse_fmov(uc, dst, src, fm); if (fm == FloatMode::kF32S) cc->cvttss2si(r, dst); else cc->cvttsd2si(r, dst); - cc->emit(fi.fmov, t2, msb); + cc->emit(fi.fmova, t2, msb); cc->emit(fi.fandn, t2, dst); cc->emit(fi.fxor, t1, t1); @@ -2923,22 +3006,26 @@ static ASMJIT_NOINLINE void sse_round(UniCompiler* pc, const Vec& dst, const Ope if (round_mode == x86::RoundImm::kNearest) { // Pure SSE2 round-to-even implementation: // - // float roundeven(float x) { + // float round_even(float x) { // float magic = x >= 0 ? pow(2, 22) : pow(2, 22) + pow(2, 21); // return x >= magic ? x : x + magic - magic; // } // - // double roundeven(double x) { + // double round_even(double x) { // double magic = x >= 0 ? pow(2, 52) : pow(2, 52) + pow(2, 51); // return x >= magic ? x : x + magic - magic; // } - sse_fmov(pc, dst, src, fm); - cc->emit(fi.fmov, t3, dst); - cc->emit(fi.psrl, t3, Imm(is_f32 ? 31 : 63)); - cc->emit(fi.psll, t3, Imm(is_f32 ? 23 : 51)); - cc->emit(fi.for_, t3, maxn); + sse_fmov(uc, dst, src, fm); + cc->emit(fi.fmova, t3, dst); + // cc->emit(fi.psrl, t3, Imm(is_f32 ? 31 : 63)); + // cc->emit(fi.psll, t3, Imm(is_f32 ? 23 : 51)); + // cc->emit(fi.for_, t3, maxn); - cc->emit(fi.fmov, t1, dst); + cc->emit(fi.psrl, t3, Imm(is_f32 ? 31 : 63)); + cc->emit(fi.psll, t3, Imm(is_f32 ? 23 : 52)); + cc->emit(is_f32 ? Inst::kIdPaddd : Inst::kIdPaddq, t3, maxn); + + cc->emit(fi.fmova, t1, dst); cc->emit(fi.fcmp, t1, t3, x86::CmpImm::kLT); cc->emit(fi.fand, t1, t3); @@ -2947,11 +3034,7 @@ static ASMJIT_NOINLINE void sse_round(UniCompiler* pc, const Vec& dst, const Ope return; } - Operand one; - if (fm == FloatMode::kF32S || fm == FloatMode::kF32V) - one = pc->simd_const(&pc->ct.f32_1, Bcst::k32, dst); - else - one = pc->simd_const(&pc->ct.f64_1, Bcst::k64, dst); + Operand one = get_fop_one(uc, dst, fm); if (round_mode == x86::RoundImm::kTrunc) { // Should be handled earlier. 
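// Illustrative scalar model of the add/subtract "magic number" rounding used by the
// SSE2 fallback sketched in the comment above: adding and then subtracting a large
// power of two makes the FPU drop the fractional bits using its current rounding mode
// (round-to-nearest-even by default). Shown for double with 2^52; this is not part of
// the emitter and is only valid while the default rounding mode is active:
#include <cmath>
static inline double round_even_model(double x) noexcept {
  const double magic = 9007199254740992.0;          // 2^52: ulp == 1.0 at this magnitude
  if (std::fabs(x) >= magic) return x;              // already an exact integer
  return x < 0.0 ? (x - magic) + magic              // keep the intermediate in range for negatives
                 : (x + magic) - magic;
}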
@@ -2960,11 +3043,11 @@ static ASMJIT_NOINLINE void sse_round(UniCompiler* pc, const Vec& dst, const Ope Operand msb; if (fm == FloatMode::kF32V) { - msb = pc->simd_const(&pc->ct.p_8000000080000000, Bcst::k32, dst); - sse_fmov(pc, dst, src, fm); + msb = uc.simd_const(&uc.ct().p_8000000080000000, Bcst::k32, dst); + sse_fmov(uc, dst, src, fm); cc->cvttps2dq(t1, dst); - cc->emit(fi.fmov, t2, msb); + cc->emit(fi.fmova, t2, msb); cc->emit(fi.fandn, t2, dst); cc->cvtdq2ps(t1, t1); @@ -2972,18 +3055,18 @@ static ASMJIT_NOINLINE void sse_round(UniCompiler* pc, const Vec& dst, const Ope cc->emit(fi.fand, t1, t2); cc->emit(fi.fandn, t2, dst); cc->emit(fi.for_, t2, t1); - cc->emit(fi.fmov, dst, t2); + cc->emit(fi.fmova, dst, t2); } else { - msb = pc->simd_const(&pc->ct.p_8000000000000000, Bcst::k64, dst); + msb = uc.simd_const(&uc.ct().p_8000000000000000, Bcst::k64, dst); - sse_fmov(pc, dst, src, fm); - cc->emit(fi.fmov, t3, msb); + sse_fmov(uc, dst, src, fm); + cc->emit(fi.fmova, t3, msb); cc->emit(fi.fandn, t3, dst); - cc->emit(fi.fmov, t2, t3); + cc->emit(fi.fmova, t2, t3); cc->emit(fi.fcmp, t2, maxn, x86::CmpImm::kLT); cc->emit(fi.fand, t2, maxn); - cc->emit(fi.fmov, t1, t3); + cc->emit(fi.fmova, t1, t3); cc->emit(fi.fadd, t1, t2); cc->emit(fi.fsub, t1, t2); cc->emit(fi.fcmp, t3, t1, x86::CmpImm::kLT); @@ -3002,23 +3085,23 @@ static ASMJIT_NOINLINE void sse_round(UniCompiler* pc, const Vec& dst, const Ope InstId correction_inst_id = round_mode == x86::RoundImm::kDown ? fi.fsub : fi.fadd; x86::CmpImm correction_predicate = round_mode == x86::RoundImm::kDown ? x86::CmpImm::kLT : x86::CmpImm::kNLE; - sse_fmov(pc, dst, src, fm); + sse_fmov(uc, dst, src, fm); // maxn (f32) == 0x4B000000 (f64) == 0x4330000000000000 // t3 (f32) == 0x00800000 (f64) == 0x0008000000000000 - cc->emit(fi.fmov, t3, dst); + cc->emit(fi.fmova, t3, dst); cc->emit(fi.psrl, t3, Imm(is_f32 ? 31 : 63)); - cc->emit(fi.psll, t3, Imm(is_f32 ? 23 : 51)); - cc->emit(fi.for_, t3, maxn); + cc->emit(fi.psll, t3, Imm(is_f32 ? 23 : 52)); + cc->emit(is_f32 ? 
Inst::kIdPaddd : Inst::kIdPaddq, t3, maxn); - cc->emit(fi.fmov, t1, dst); - cc->emit(fi.fmov, t2, dst); + cc->emit(fi.fmova, t1, dst); + cc->emit(fi.fmova, t2, dst); cc->emit(fi.fadd, t2, t3); cc->emit(fi.fsub, t2, t3); cc->emit(fi.fcmp, t1, t3, x86::CmpImm::kNLT); - cc->emit(fi.fmov, t3, dst); + cc->emit(fi.fmova, t3, dst); cc->emit(fi.fcmp, t3, t2, correction_predicate); cc->emit(fi.fand, t3, one); @@ -3033,8 +3116,8 @@ static ASMJIT_NOINLINE void sse_round(UniCompiler* pc, const Vec& dst, const Ope ASMJIT_NOT_REACHED(); } -static ASMJIT_NOINLINE void avx_mov(UniCompiler* pc, const Vec& dst, const Operand_& src) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void avx_mov(UniCompiler& uc, const Vec& dst, const Operand_& src) noexcept { + BackendCompiler* cc = uc.cc; InstId inst_id = 0; if (dst.is_vec512()) { @@ -3047,8 +3130,8 @@ static ASMJIT_NOINLINE void avx_mov(UniCompiler* pc, const Vec& dst, const Opera cc->emit(inst_id, dst, src); } -static ASMJIT_NOINLINE void avx_fmov(UniCompiler* pc, const Vec& dst, const Operand_& src, FloatMode fm) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void avx_fmov(UniCompiler& uc, const Vec& dst, const Operand_& src, FloatMode fm) noexcept { + BackendCompiler* cc = uc.cc; if (src.is_reg()) { if (dst.id() != src.id()) { if (fm <= FloatMode::kF64S) @@ -3057,38 +3140,41 @@ static ASMJIT_NOINLINE void avx_fmov(UniCompiler* pc, const Vec& dst, const Oper cc->emit(Inst::kIdVmovaps, dst, src); } } - else { + else if (is_scalar_fp_op(fm)) { cc->emit(avx_float_inst[size_t(fm)].fmovs, dst, src); } + else { + cc->emit(avx_float_inst[size_t(fm)].fmovu, dst, src); + } } -static ASMJIT_NOINLINE void avx_make_vec(UniCompiler* pc, Operand_& op, const Vec& ref, const char* name) noexcept { +static ASMJIT_NOINLINE void avx_make_vec(UniCompiler& uc, Operand_& op, const Vec& ref, const char* name) noexcept { if (op.is_mem()) { - Vec tmp = pc->new_similar_reg(ref, name); - avx_mov(pc, tmp, op); + Vec tmp = uc.new_similar_reg(ref, name); + avx_mov(uc, tmp, op); op = tmp; } } -static ASMJIT_NOINLINE void avx_zero(UniCompiler* pc, const Vec& dst) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void avx_zero(UniCompiler& uc, const Vec& dst) noexcept { + BackendCompiler* cc = uc.cc; Vec x = dst.xmm(); cc->vpxor(x, x, x); return; } -static ASMJIT_NOINLINE void avx_ones(UniCompiler* pc, const Vec& dst) noexcept { - BackendCompiler* cc = pc->cc; - if (pc->has_avx512()) +static ASMJIT_NOINLINE void avx_ones(UniCompiler& uc, const Vec& dst) noexcept { + BackendCompiler* cc = uc.cc; + if (uc.has_avx512()) cc->emit(Inst::kIdVpternlogd, dst, dst, dst, 0xFF); else cc->emit(Inst::kIdVpcmpeqb, dst, dst, dst); } -static ASMJIT_NOINLINE void avx_bit_not(UniCompiler* pc, const Vec& dst, const Operand_& src) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void avx_bit_not(UniCompiler& uc, const Vec& dst, const Operand_& src) noexcept { + BackendCompiler* cc = uc.cc; - if (pc->has_avx512()) { + if (uc.has_avx512()) { if (src.is_reg()) cc->overwrite().emit(Inst::kIdVpternlogd, dst, src, src, 0x55); else @@ -3096,13 +3182,13 @@ static ASMJIT_NOINLINE void avx_bit_not(UniCompiler* pc, const Vec& dst, const O return; } - Operand ones = pc->simd_const(&pc->ct.p_FFFFFFFFFFFFFFFF, Bcst::k32, dst); + Operand ones = uc.simd_const(&uc.ct().p_FFFFFFFFFFFFFFFF, Bcst::k32, dst); if (!src.is_reg()) { if (ones.is_reg()) { cc->emit(Inst::kIdVpxor, dst, ones, src); } else { - avx_mov(pc, dst, src); + avx_mov(uc, dst, src); 
cc->emit(Inst::kIdVpxor, dst, dst, ones); } } @@ -3111,17 +3197,17 @@ static ASMJIT_NOINLINE void avx_bit_not(UniCompiler* pc, const Vec& dst, const O } } -static ASMJIT_NOINLINE void avx_isign_flip(UniCompiler* pc, const Vec& dst, const Operand_& src, ElementSize sz) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void avx_isign_flip(UniCompiler& uc, const Vec& dst, const Operand_& src, ElementSize sz) noexcept { + BackendCompiler* cc = uc.cc; Operand msk; - InstId xor_ = (pc->has_avx512() && dst.is_vec512()) ? Inst::kIdVpxord : Inst::kIdVpxor; + InstId xor_ = (uc.has_avx512() && dst.is_vec512()) ? Inst::kIdVpxord : Inst::kIdVpxor; switch (sz) { - case ElementSize::k8: msk = pc->simd_const(&pc->ct.p_8080808080808080, Bcst::kNA, dst); break; - case ElementSize::k16: msk = pc->simd_const(&pc->ct.p_8000800080008000, Bcst::kNA, dst); break; - case ElementSize::k32: msk = pc->simd_const(&pc->ct.p_8000000080000000, Bcst::k32, dst); break; - case ElementSize::k64: msk = pc->simd_const(&pc->ct.p_8000000000000000, Bcst::k64, dst); break; + case ElementSize::k8: msk = uc.simd_const(&uc.ct().p_8080808080808080, Bcst::kNA, dst); break; + case ElementSize::k16: msk = uc.simd_const(&uc.ct().p_8000800080008000, Bcst::kNA, dst); break; + case ElementSize::k32: msk = uc.simd_const(&uc.ct().p_8000000080000000, Bcst::k32, dst); break; + case ElementSize::k64: msk = uc.simd_const(&uc.ct().p_8000000000000000, Bcst::k64, dst); break; } if (src.is_reg()) { @@ -3131,22 +3217,22 @@ static ASMJIT_NOINLINE void avx_isign_flip(UniCompiler* pc, const Vec& dst, cons cc->emit(xor_, dst, msk, src); } else { - avx_mov(pc, dst, src); + avx_mov(uc, dst, src); cc->emit(xor_, dst, dst, msk); } } -static ASMJIT_NOINLINE void avx_fsign_flip(UniCompiler* pc, const Vec& dst, const Operand_& src, FloatMode fm) noexcept { - BackendCompiler* cc = pc->cc; +static ASMJIT_NOINLINE void avx_fsign_flip(UniCompiler& uc, const Vec& dst, const Operand_& src, FloatMode fm) noexcept { + BackendCompiler* cc = uc.cc; const FloatInst& fi = avx_float_inst[size_t(fm)]; Operand msk; switch (fm) { - case FloatMode::kF32S: msk = pc->simd_const(&pc->ct.sign32_scalar, Bcst::kNA, dst); break; - case FloatMode::kF64S: msk = pc->simd_const(&pc->ct.sign64_scalar, Bcst::kNA, dst); break; - case FloatMode::kF32V: msk = pc->simd_const(&pc->ct.p_8000000080000000, Bcst::k32, dst); break; - case FloatMode::kF64V: msk = pc->simd_const(&pc->ct.p_8000000000000000, Bcst::k64, dst); break; + case FloatMode::kF32S: msk = uc.simd_const(&uc.ct().sign32_scalar, Bcst::kNA, dst); break; + case FloatMode::kF64S: msk = uc.simd_const(&uc.ct().sign64_scalar, Bcst::kNA, dst); break; + case FloatMode::kF32V: msk = uc.simd_const(&uc.ct().p_8000000080000000, Bcst::k32, dst); break; + case FloatMode::kF64V: msk = uc.simd_const(&uc.ct().p_8000000000000000, Bcst::k64, dst); break; default: ASMJIT_NOT_REACHED(); @@ -3159,13 +3245,13 @@ static ASMJIT_NOINLINE void avx_fsign_flip(UniCompiler* pc, const Vec& dst, cons cc->emit(fi.fxor, dst, msk, src); } else { - avx_fmov(pc, dst, src, fm); + avx_fmov(uc, dst, src, fm); cc->emit(fi.fxor, dst, dst, msk); } } -// ujit::UniCompiler - Vector Instructions - OpArray Iterator -// =========================================================== +// ujit::UniCompiler - Vector Instructions - OpArray Iterator +// ========================================================== template class OpArrayIter { @@ -3190,70 +3276,70 @@ public: }; template -static ASMJIT_INLINE void emit_2v_t(UniCompiler* pc, UniOpVV op, const OpArray& dst_, const 
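The sign-flip helpers above rely on a standard identity: XOR-ing the per-lane MSB turns an unsigned comparison into a signed one, which is all PCMPGT* provides. A small scalar sketch of that identity (not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

static bool unsigned_gt_via_signed(uint32_t a, uint32_t b) {
  int32_t sa = int32_t(a ^ 0x80000000u);  // flip the sign bit (the msk constant)
  int32_t sb = int32_t(b ^ 0x80000000u);
  return sa > sb;                         // signed compare, e.g. PCMPGTD
}

int main() {
  const uint32_t values[] = { 0u, 1u, 0x7FFFFFFFu, 0x80000000u, 0xFFFFFFFEu, 0xFFFFFFFFu };
  for (uint32_t a : values)
    for (uint32_t b : values)
      assert(unsigned_gt_via_signed(a, b) == (a > b));
  return 0;
}
```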
Src& src_) noexcept { +static ASMJIT_INLINE void emit_2v_t(UniCompiler& uc, UniOpVV op, const OpArray& dst_, const Src& src_) noexcept { size_t n = dst_.size(); OpArrayIter src(src_); for (size_t i = 0; i < n; i++) { - pc->emit_2v(op, dst_[i], src.op()); + uc.emit_2v(op, dst_[i], src.op()); src.next(); } } template -static ASMJIT_INLINE void emit_2vi_t(UniCompiler* pc, UniOpVVI op, const OpArray& dst_, const Src& src_, uint32_t imm) noexcept { +static ASMJIT_INLINE void emit_2vi_t(UniCompiler& uc, UniOpVVI op, const OpArray& dst_, const Src& src_, uint32_t imm) noexcept { size_t n = dst_.size(); OpArrayIter src(src_); for (size_t i = 0; i < n; i++) { - pc->emit_2vi(op, dst_[i], src.op(), imm); + uc.emit_2vi(op, dst_[i], src.op(), imm); src.next(); } } template -static ASMJIT_INLINE void emit_3v_t(UniCompiler* pc, UniOpVVV op, const OpArray& dst_, const Src1& src1_, const Src2& src2_) noexcept { +static ASMJIT_INLINE void emit_3v_t(UniCompiler& uc, UniOpVVV op, const OpArray& dst_, const Src1& src1_, const Src2& src2_) noexcept { size_t n = dst_.size(); OpArrayIter src1(src1_); OpArrayIter src2(src2_); for (size_t i = 0; i < n; i++) { - pc->emit_3v(op, dst_[i], src1.op(), src2.op()); + uc.emit_3v(op, dst_[i], src1.op(), src2.op()); src1.next(); src2.next(); } } template -static ASMJIT_INLINE void emit_3vi_t(UniCompiler* pc, UniOpVVVI op, const OpArray& dst_, const Src1& src1_, const Src2& src2_, uint32_t imm) noexcept { +static ASMJIT_INLINE void emit_3vi_t(UniCompiler& uc, UniOpVVVI op, const OpArray& dst_, const Src1& src1_, const Src2& src2_, uint32_t imm) noexcept { size_t n = dst_.size(); OpArrayIter src1(src1_); OpArrayIter src2(src2_); for (size_t i = 0; i < n; i++) { - pc->emit_3vi(op, dst_[i], src1.op(), src2.op(), imm); + uc.emit_3vi(op, dst_[i], src1.op(), src2.op(), imm); src1.next(); src2.next(); } } template -static ASMJIT_INLINE void emit_4v_t(UniCompiler* pc, UniOpVVVV op, const OpArray& dst_, const Src1& src1_, const Src2& src2_, const Src3& src3_) noexcept { +static ASMJIT_INLINE void emit_4v_t(UniCompiler& uc, UniOpVVVV op, const OpArray& dst_, const Src1& src1_, const Src2& src2_, const Src3& src3_) noexcept { size_t n = dst_.size(); OpArrayIter src1(src1_); OpArrayIter src2(src2_); OpArrayIter src3(src3_); for (size_t i = 0; i < n; i++) { - pc->emit_4v(op, dst_[i], src1.op(), src2.op(), src3.op()); + uc.emit_4v(op, dst_[i], src1.op(), src2.op(), src3.op()); src1.next(); src2.next(); src3.next(); } } -// ujit::UniCompiler - Vector Instructions - Emit 2V -// ================================================== +// ujit::UniCompiler - Vector Instructions - Emit 2V +// ================================================= void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_) noexcept { ASMJIT_ASSERT(dst_.is_vec()); @@ -3347,8 +3433,8 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ // AVX doesn't provide 8-bit and 16-bit broadcasts - the simplest way is to just use VPSHUFB to repeat the byte. InstId insert_inst_id = element_size == ElementSize::k8 ? Inst::kIdVpinsrb : Inst::kIdVpinsrw; - const void* pred_data = element_size == ElementSize::k8 ? static_cast(&ct.p_0000000000000000) - : static_cast(&ct.p_0100010001000100); + const void* pred_data = element_size == ElementSize::k8 ? 
static_cast(&ct().p_0000000000000000) + : static_cast(&ct().p_0100010001000100); Vec pred = simd_vec_const(pred_data, Bcst::k32, dst_xmm); if (src.is_mem()) { @@ -3409,7 +3495,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ // 128-bit broadcast is like 128-bit mov in this case as we don't have a wider destination. if (dst.is_vec128()) { - avx_mov(this, dst, src); + avx_mov(*this, dst, src); return; } @@ -3455,7 +3541,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ // Cannot broadcast 256-bit vector to a 128-bit or 256-bit vector... if (!dst.is_vec512()) { - avx_mov(this, dst.ymm(), src); + avx_mov(*this, dst.ymm(), src); return; } @@ -3483,7 +3569,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kNotU64: case UniOpVV::kNotF32: case UniOpVV::kNotF64: { - avx_bit_not(this, dst, src); + avx_bit_not(*this, dst, src); return; } @@ -3547,19 +3633,30 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ return; } + case UniOpVV::kAbsF32S: + case UniOpVV::kAbsF64S: case UniOpVV::kAbsF32: case UniOpVV::kAbsF64: + case UniOpVV::kNegF32S: + case UniOpVV::kNegF64S: case UniOpVV::kNegF32: case UniOpVV::kNegF64: { // Intrinsic. - const void* msk_data = op == UniOpVV::kAbsF32 ? static_cast(&ct.p_7FFFFFFF7FFFFFFF) : - op == UniOpVV::kAbsF64 ? static_cast(&ct.p_7FFFFFFFFFFFFFFF) : - op == UniOpVV::kNegF32 ? static_cast(&ct.p_8000000080000000) : - static_cast(&ct.p_8000000000000000); + FloatMode fm = FloatMode(op_info.float_mode); + + const void* msk_data = + op == UniOpVV::kAbsF32 || op == UniOpVV::kAbsF32S ? static_cast(&ct().p_7FFFFFFF7FFFFFFF) : + op == UniOpVV::kAbsF64 || op == UniOpVV::kAbsF64S ? static_cast(&ct().p_7FFFFFFFFFFFFFFF) : + op == UniOpVV::kNegF32 || op == UniOpVV::kNegF32S ? static_cast(&ct().p_8000000080000000) : + static_cast(&ct().p_8000000000000000); Operand msk = simd_const(msk_data, Bcst(op_info.broadcast_size), dst); - if (src.is_mem() && msk.is_mem()) { - avx_mov(this, dst, msk); + if (src.is_mem() && is_scalar_fp_op(fm)) { + avx_fmov(*this, dst, src, fm); + cc->emit(inst_id, dst, dst, msk); + } + else if (src.is_mem() && msk.is_mem()) { + avx_fmov(*this, dst, msk, fm); cc->emit(inst_id, dst, dst, src); } else if (src.is_mem()) { @@ -3573,14 +3670,14 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kRcpF32: { // Intrinsic. - Vec one = simd_vec_const(&ct.f32_1, Bcst::k32, dst); + Vec one = simd_vec_const(&ct().f32_1, Bcst::k32, dst); cc->emit(Inst::kIdVdivps, dst, one, src); return; } case UniOpVV::kRcpF64: { // Intrinsic. - Vec one = simd_vec_const(&ct.f64_1, Bcst::k32, dst); + Vec one = simd_vec_const(&ct().f64_1, Bcst::k32, dst); cc->emit(Inst::kIdVdivpd, dst, one, src); return; } @@ -3597,11 +3694,17 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCeilF64S: case UniOpVV::kCeilF32: case UniOpVV::kCeilF64: - case UniOpVV::kRoundF32S: - case UniOpVV::kRoundF64S: - case UniOpVV::kRoundF32: - case UniOpVV::kRoundF64: { - if (has_avx512()) { + case UniOpVV::kRoundEvenF32S: + case UniOpVV::kRoundEvenF64S: + case UniOpVV::kRoundEvenF32: + case UniOpVV::kRoundEvenF64: { + FloatMode fm = FloatMode(op_info.float_mode); + + if (is_scalar_fp_op(fm)) { + dst = dst.xmm(); + } + + if (has_avx512() && dst.is_vec512()) { // AVX512 uses a different name. 
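The kAbsF32/kNegF32 handling above is purely bitwise: AND with 0x7FFFFFFF clears the sign, XOR with 0x80000000 flips it, which is what ANDPS/XORPS do with the table constants. A scalar sketch (assumes C++20 for `std::bit_cast`, not part of the patch):

```cpp
#include <bit>
#include <cassert>
#include <cstdint>

static float abs_via_mask(float x) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(x) & 0x7FFFFFFFu);  // ANDPS
}

static float neg_via_mask(float x) {
  return std::bit_cast<float>(std::bit_cast<uint32_t>(x) ^ 0x80000000u);  // XORPS
}

int main() {
  assert(abs_via_mask(-1.5f) == 1.5f && abs_via_mask(2.0f) == 2.0f);
  assert(neg_via_mask(3.0f) == -3.0f && neg_via_mask(-0.25f) == 0.25f);
  return 0;
}
```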
constexpr uint16_t avx512_rndscale[4] = { Inst::kIdVrndscaless, @@ -3612,10 +3715,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ inst_id = avx512_rndscale[(size_t(op) - size_t(UniOpVV::kTruncF32S)) & 0x3]; } - FloatMode fm = FloatMode(op_info.float_mode); - - if (fm == FloatMode::kF32S || fm == FloatMode::kF64S) { - dst = dst.xmm(); + if (is_scalar_fp_op(fm)) { // These instructions use 3 operand form for historical reasons. if (src.is_mem()) { cc->emit(avx_float_inst[size_t(op_info.float_mode)].fmovs, dst, src); @@ -3632,13 +3732,86 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ return; } + case UniOpVV::kRoundHalfAwayF32S: + case UniOpVV::kRoundHalfAwayF64S: + case UniOpVV::kRoundHalfAwayF32: + case UniOpVV::kRoundHalfAwayF64: { + // Intrinsic. + FloatMode fm = FloatMode(op_info.float_mode); + const FloatInst& fi = avx_float_inst[fm]; + + if (is_scalar_fp_op(fm)) { + dst = dst.xmm(); + if (src.is_vec()) { + src = src.as().clone_as(dst); + } + } + + if (src.is_mem()) { + avx_fmov(*this, dst, src, fm); + src = dst; + } + + Operand half = get_fop_half_minus_1ulp(*this, dst, fm); + Operand msb = get_fop_msb_bit(*this, dst, fm); + Vec tmp = new_similar_reg(dst, "@tmp"); + + if (has_avx512()) { + cc->emit(fi.fmova, tmp, msb); + cc->emit(Inst::kIdVpternlogd, tmp, src, half, 0xEAu); // tmp = (msb & src) | half + } + else { + cc->emit(fi.fand, tmp, src, msb); + cc->emit(fi.for_, tmp, tmp, half); + } + + cc->emit(fi.fadd, dst, src, tmp); + + if (is_scalar_fp_op(fm)) { + cc->emit(fi.fround, dst, dst, dst, x86::RoundImm::kTrunc | x86::RoundImm::kSuppress); + } + else { + InstId round_inst = dst.is_vec512() ? fi.frndscale : fi.fround; + cc->emit(round_inst, dst, dst, x86::RoundImm::kTrunc | x86::RoundImm::kSuppress); + } + return; + } + + case UniOpVV::kRoundHalfUpF32S: + case UniOpVV::kRoundHalfUpF64S: + case UniOpVV::kRoundHalfUpF32: + case UniOpVV::kRoundHalfUpF64: { + // Intrinsic. + FloatMode fm = FloatMode(op_info.float_mode); + + if (is_scalar_fp_op(fm)) { + dst = dst.xmm(); + } + + Operand half = get_fop_half_minus_1ulp(*this, dst, fm); + UniOpVVV add_op = translate_op(op, UniOpVV::kRoundHalfUpF32S, UniOpVVV::kAddF32S); + UniOpVV floor_op = translate_op(op, UniOpVV::kRoundHalfUpF32S, UniOpVV::kFloorF32S); + + if (src.is_mem()) { + Vec tmp = new_similar_reg(dst, "@tmp"); + avx_fmov(*this, tmp, src, fm); + emit_3v(add_op, tmp, tmp, half); + emit_2v(floor_op, dst, tmp); + } + else { + emit_3v(add_op, dst, src.as().clone_as(dst), half); + emit_2v(floor_op, dst, dst); + } + return; + } + case UniOpVV::kSqrtF32S: case UniOpVV::kSqrtF64S: { dst = dst.xmm(); // Intrinsic - these instructions use 3 operand form for historical reasons. if (src.is_mem()) { - avx_fmov(this, dst, src, FloatMode(op_info.float_mode)); + avx_fmov(*this, dst, src, FloatMode(op_info.float_mode)); cc->emit(inst_id, dst, dst, dst); } else { @@ -3655,7 +3828,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ src = src.as().xmm(); // Intrinsic - these instructions use 3 operand form for historical reasons. 
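The two new rounding intrinsics above add a bias of "one half minus 1 ulp" and then truncate or floor. A scalar sketch of the same recipe, assuming `get_fop_half_minus_1ulp()` yields the largest float below 0.5 (0x3EFFFFFF for f32); using 0.5-1ulp rather than 0.5 presumably keeps inputs just below a .5 boundary from being pushed over it when the addition itself rounds up (not part of the patch):

```cpp
#include <cmath>
#include <cstdio>

static const float kHalfMinus1Ulp = 0x1.FFFFFEp-2f;  // 0.49999997f

// kRoundHalfAwayF32: add a sign-matched bias, then truncate toward zero.
static float round_half_away(float x) {
  return std::truncf(x + std::copysignf(kHalfMinus1Ulp, x));
}

// kRoundHalfUpF32: floor(x + bias).
static float round_half_up(float x) {
  return std::floorf(x + kHalfMinus1Ulp);
}

int main() {
  std::printf("%g %g\n", round_half_away(2.5f), round_half_away(-2.5f));  // 3 -3
  std::printf("%g %g\n", round_half_up(2.5f), round_half_up(-2.5f));      // 3 -2
  return 0;
}
```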
- Vec zeros = simd_vec_const(&ct.p_0000000000000000, Bcst::k32, dst); + Vec zeros = simd_vec_const(&ct().p_0000000000000000, Bcst::k32, dst); cc->emit(inst_id, dst, zeros, src); return; } @@ -3680,7 +3853,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCvtI32HiToF64: { if (src.is_reg()) { uint32_t w = dst.size() >> 6; - Vec tmp = new_vec(VecWidth(w), "@tmp"); + Vec tmp = new_vec_with_width(VecWidth(w), "@tmp"); src.set_signature(signature_of_xmm_ymm_zmm[w]); if (dst.is_vec512()) { @@ -3731,7 +3904,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCvtTruncF64ToI32Hi: case UniOpVV::kCvtRoundF64ToI32Hi: { uint32_t w = dst.size() >> 6; - Vec tmp = new_vec(VecWidth(w), "@tmp"); + Vec tmp = new_vec_with_width(VecWidth(w), "@tmp"); if (src.is_mem()) src.as().set_size(dst.size()); @@ -3816,16 +3989,16 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ if (has_ssse3()) { if (element_size == ElementSize::k8 || (element_size == ElementSize::k16 && is_same_vec(dst, src))) { - Operand predicate = element_size == ElementSize::k8 ? simd_const(&ct.p_0000000000000000, Bcst::kNA, dst.as()) - : simd_const(&ct.p_0100010001000100, Bcst::kNA, dst.as()); - sse_mov(this, dst, src); + Operand predicate = element_size == ElementSize::k8 ? simd_const(&ct().p_0000000000000000, Bcst::kNA, dst.as()) + : simd_const(&ct().p_0100010001000100, Bcst::kNA, dst.as()); + sse_mov(*this, dst, src); cc->emit(Inst::kIdPshufb, dst, predicate); return; } } if (element_size == ElementSize::k8) { - sse_mov(this, dst, src); + sse_mov(*this, dst, src); cc->emit(Inst::kIdPunpcklbw, dst, dst); src = dst; } @@ -3885,7 +4058,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kBroadcastV128_F32: case UniOpVV::kBroadcastV128_F64: { // 128-bit broadcast is like 128-bit mov in this case as we don't have wider vectors. - sse_mov(this, dst, src); + sse_mov(*this, dst, src); return; } @@ -3926,7 +4099,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ Vec tmp = new_similar_reg(dst, "@tmp"); cc->emit(Inst::kIdMovaps, tmp, src); cc->emit(Inst::kIdPsrad, tmp, 31); - sse_mov(this, dst, src); + sse_mov(*this, dst, src); cc->emit(Inst::kIdPxor, dst, tmp); cc->emit(Inst::kIdPsubd, dst, tmp); return; @@ -3937,7 +4110,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ Vec tmp = new_similar_reg(dst, "@tmp"); cc->emit(Inst::kIdPshufd, tmp, src, x86::shuffle_imm(3, 3, 1, 1)); cc->emit(Inst::kIdPsrad, tmp, 31); - sse_mov(this, dst, src); + sse_mov(*this, dst, src); cc->emit(Inst::kIdPxor, dst, tmp); cc->emit(Inst::kIdPsubq, dst, tmp); return; @@ -3947,7 +4120,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kNotU64: case UniOpVV::kNotF32: case UniOpVV::kNotF64: { - sse_bit_not(this, dst, src); + sse_bit_not(*this, dst, src); return; } @@ -3967,7 +4140,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ } WideningOp cvt = (op == UniOpVV::kCvtI8ToI32) ? 
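The SSE2 absolute-value fallbacks above use the branchless identity abs(x) == (x ^ s) - s with s = x >> 31 arithmetic (PSRAD by 31); the 64-bit variant builds s from the high dword via PSHUFD(3,3,1,1) because SSE2 has no 64-bit arithmetic shift. A scalar sketch, assuming the compiler's signed right shift is arithmetic (not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

static int32_t abs32_branchless(int32_t x) {
  int32_t s = x >> 31;   // PSRAD x, 31: all-ones if negative, zero otherwise
  return (x ^ s) - s;    // PXOR + PSUBD
}

int main() {
  assert(abs32_branchless(-7) == 7);
  assert(abs32_branchless(7) == 7);
  assert(abs32_branchless(0) == 0);
  return 0;
}
```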
WideningOp::kI8ToI32 : WideningOp::kU8ToU32; - sse_int_widen(this, dst, src.as(), cvt); + sse_int_widen(*this, dst, src.as(), cvt); return; } @@ -3985,7 +4158,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kCvtI16HiToI32: case UniOpVV::kCvtI32HiToI64: if (src.is_vec()) { - sse_mov(this, dst, src); + sse_mov(*this, dst, src); switch (op) { case UniOpVV::kCvtI8HiToI16: { @@ -3995,7 +4168,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ } case UniOpVV::kCvtU8HiToU16: { - cc->emit(Inst::kIdPunpckhbw, dst, simd_const(&ct.p_0000000000000000, Bcst::kNA, dst)); + cc->emit(Inst::kIdPunpckhbw, dst, simd_const(&ct().p_0000000000000000, Bcst::kNA, dst)); break; } @@ -4006,20 +4179,20 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ } case UniOpVV::kCvtU16HiToU32: { - cc->emit(Inst::kIdPunpckhwd, dst, simd_const(&ct.p_0000000000000000, Bcst::kNA, dst)); + cc->emit(Inst::kIdPunpckhwd, dst, simd_const(&ct().p_0000000000000000, Bcst::kNA, dst)); break; } case UniOpVV::kCvtI32HiToI64: { Vec tmp = new_vec128("@tmp"); - sse_mov(this, tmp, dst); + sse_mov(*this, tmp, dst); cc->psrad(tmp, 31); cc->punpckhdq(dst, tmp); break; } case UniOpVV::kCvtU32HiToU64: { - cc->emit(Inst::kIdPunpckhdq, dst, simd_const(&ct.p_0000000000000000, Bcst::kNA, dst)); + cc->emit(Inst::kIdPunpckhdq, dst, simd_const(&ct().p_0000000000000000, Bcst::kNA, dst)); break; } @@ -4068,7 +4241,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ ASMJIT_NOT_REACHED(); } - sse_int_widen(this, dst, src.as(), cvt); + sse_int_widen(*this, dst, src.as(), cvt); return; } @@ -4078,8 +4251,8 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kFloorF64: case UniOpVV::kCeilF32: case UniOpVV::kCeilF64: - case UniOpVV::kRoundF32: - case UniOpVV::kRoundF64: + case UniOpVV::kRoundEvenF32: + case UniOpVV::kRoundEvenF64: // Native operation requires SSE4.1. if (has_sse4_1()) { cc->emit(inst_id, dst, src, Imm(op_info.imm)); @@ -4093,63 +4266,105 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kFloorF64S: case UniOpVV::kCeilF32S: case UniOpVV::kCeilF64S: - case UniOpVV::kRoundF32S: - case UniOpVV::kRoundF64S: { + case UniOpVV::kRoundEvenF32S: + case UniOpVV::kRoundEvenF64S: { // Native operation requires SSE4.1. if (has_sse4_1()) { - if (!is_same_vec(dst, src)) - sse_fmov(this, dst, src, FloatMode(op_info.float_mode)); + sse_fmov(*this, dst, src, FloatMode(op_info.float_mode)); cc->emit(inst_id, dst, dst, Imm(op_info.imm)); return; } - sse_round(this, dst, src, FloatMode(op_info.float_mode), x86::RoundImm(op_info.imm & 0x7)); + sse_round(*this, dst, src, FloatMode(op_info.float_mode), x86::RoundImm(op_info.imm & 0x7)); return; } + case UniOpVV::kRoundHalfAwayF32S: + case UniOpVV::kRoundHalfAwayF64S: + case UniOpVV::kRoundHalfAwayF32: + case UniOpVV::kRoundHalfAwayF64: { + // Intrinsic. 
+ FloatMode fm = FloatMode(op_info.float_mode); + const FloatInst& fi = sse_float_inst[fm]; + + Operand half = get_fop_half_minus_1ulp(*this, dst, fm); + Operand msb = get_fop_msb_bit(*this, dst, fm); + + Vec tmp = new_similar_reg(dst, "@tmp"); + + sse_fmov(*this, dst, src, fm); + sse_mov(*this, tmp, msb); + cc->emit(fi.fand, tmp, dst); + cc->emit(fi.for_, tmp, half); + cc->emit(fi.fadd, dst, tmp); + + sse_round(*this, dst, dst, fm, x86::RoundImm(op_info.imm & 0x7)); + return; + } + + case UniOpVV::kRoundHalfUpF32S: + case UniOpVV::kRoundHalfUpF64S: + case UniOpVV::kRoundHalfUpF32: + case UniOpVV::kRoundHalfUpF64: { + // Intrinsic. + FloatMode fm = FloatMode(op_info.float_mode); + const FloatInst& fi = sse_float_inst[fm]; + + Operand half = get_fop_half_minus_1ulp(*this, dst, fm); + sse_fmov(*this, dst, src, fm); + cc->emit(fi.fadd, dst, half); + sse_round(*this, dst, dst, fm, x86::RoundImm(op_info.imm & 0x7)); + return; + } + + case UniOpVV::kAbsF32S: + case UniOpVV::kAbsF64S: case UniOpVV::kAbsF32: case UniOpVV::kAbsF64: + case UniOpVV::kNegF32S: + case UniOpVV::kNegF64S: case UniOpVV::kNegF32: case UniOpVV::kNegF64: { // Intrinsic. - const void* msk_data = op == UniOpVV::kAbsF32 ? static_cast(&ct.p_7FFFFFFF7FFFFFFF) : - op == UniOpVV::kAbsF64 ? static_cast(&ct.p_7FFFFFFFFFFFFFFF) : - op == UniOpVV::kNegF32 ? static_cast(&ct.p_8000000080000000) : - static_cast(&ct.p_8000000000000000) ; - Operand msk = simd_const(msk_data, Bcst::k32, dst); + FloatMode fm = FloatMode(op_info.float_mode); - if (!is_same_vec(dst, src)) - sse_mov(this, dst, src); + const void* msk_data = + op == UniOpVV::kAbsF32 || op == UniOpVV::kAbsF32S ? static_cast(&ct().p_7FFFFFFF7FFFFFFF) : + op == UniOpVV::kAbsF64 || op == UniOpVV::kAbsF64S ? static_cast(&ct().p_7FFFFFFFFFFFFFFF) : + op == UniOpVV::kNegF32 || op == UniOpVV::kNegF32S ? 
static_cast(&ct().p_8000000080000000) : + static_cast(&ct().p_8000000000000000); + Operand msk = simd_const(msk_data, Bcst(op_info.broadcast_size), dst); + sse_fmov(*this, dst, src, fm); cc->emit(inst_id, dst, msk); return; } case UniOpVV::kRcpF32: { - Operand one = simd_const(&ct.f32_1, Bcst::k32, dst); + Operand one = simd_const(&ct().f32_1, Bcst::k32, dst); if (is_same_vec(dst, src)) { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, one); + sse_mov(*this, tmp, one); cc->emit(Inst::kIdDivps, tmp, src); - sse_mov(this, dst, tmp); + sse_mov(*this, dst, tmp); } else { - sse_mov(this, dst, one); + sse_mov(*this, dst, one); cc->emit(Inst::kIdDivps, dst, src); } return; } case UniOpVV::kRcpF64: { - Operand one = simd_const(&ct.f64_1, Bcst::k64, dst); + Operand one = simd_const(&ct().f64_1, Bcst::k64, dst); if (is_same_vec(dst, src)) { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, one); + sse_mov(*this, tmp, one); cc->emit(Inst::kIdDivpd, tmp, src); - sse_mov(this, dst, tmp); + sse_mov(*this, dst, tmp); } else { - sse_mov(this, dst, one); + sse_mov(*this, dst, one); cc->emit(Inst::kIdDivpd, dst, src); } return; @@ -4157,7 +4372,7 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ case UniOpVV::kSqrtF32S: case UniOpVV::kSqrtF64S: { - sse_mov(this, dst, src); + sse_mov(*this, dst, src); cc->emit(inst_id, dst, dst); return; } @@ -4210,11 +4425,11 @@ void UniCompiler::emit_2v(UniOpVV op, const Operand_& dst_, const Operand_& src_ } } -void UniCompiler::emit_2v(UniOpVV op, const OpArray& dst_, const Operand_& src_) noexcept { emit_2v_t(this, op, dst_, src_); } -void UniCompiler::emit_2v(UniOpVV op, const OpArray& dst_, const OpArray& src_) noexcept { emit_2v_t(this, op, dst_, src_); } +void UniCompiler::emit_2v(UniOpVV op, const OpArray& dst_, const Operand_& src_) noexcept { emit_2v_t(*this, op, dst_, src_); } +void UniCompiler::emit_2v(UniOpVV op, const OpArray& dst_, const OpArray& src_) noexcept { emit_2v_t(*this, op, dst_, src_); } -// ujit::UniCompiler - Vector Instructions - Emit 2VI -// =================================================== +// ujit::UniCompiler - Vector Instructions - Emit 2VI +// ================================================== void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& src_, uint32_t imm) noexcept { ASMJIT_ASSERT(dst_.is_vec()); @@ -4249,7 +4464,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr case UniOpVVI::kSrlbU128: { // This instruction requires AVX-512 if the source is a memory operand. if (src.is_mem()) { - avx_mov(this, dst, src); + avx_mov(*this, dst, src); cc->emit(inst_id, dst, dst, imm); } else { @@ -4261,7 +4476,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr case UniOpVVI::kSraI64: { // Native operation requires AVX-512, which is not supported by the target. 
if (imm == 0) { - avx_mov(this, dst, src); + avx_mov(*this, dst, src); return; } @@ -4274,7 +4489,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr Vec tmp = new_similar_reg(dst, "@tmp"); if (src.is_mem()) { - avx_mov(this, dst, src); + avx_mov(*this, dst, src); src = dst; } @@ -4433,7 +4648,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr ASMJIT_ASSERT(inst_id != Inst::kIdNone); if (op_info.sse_op_count == 2) { - sse_mov(this, dst, src); + sse_mov(*this, dst, src); cc->emit(inst_id, dst, imm); return; } @@ -4449,7 +4664,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr case UniOpVVI::kSraI64: { // Intrinsic (SSE2). if (imm == 0) { - sse_mov(this, dst, src); + sse_mov(*this, dst, src); return; } @@ -4462,15 +4677,15 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr Vec tmp = new_similar_reg(dst, "@tmp"); if (imm <= 32 && has_sse4_1()) { - sse_mov(this, dst, src); - sse_mov(this, tmp, src.is_reg() ? src.as() : dst); + sse_mov(*this, dst, src); + sse_mov(*this, tmp, src.is_reg() ? src.as() : dst); cc->emit(Inst::kIdPsrad, tmp, Support::min(imm, 31u)); cc->emit(Inst::kIdPsrlq, dst, imm); cc->emit(Inst::kIdPblendw, dst, tmp, 0xCC); return; } - sse_mov(this, dst, src); + sse_mov(*this, dst, src); cc->emit(Inst::kIdPshufd, tmp, src.is_reg() ? src.as() : dst, x86::shuffle_imm(3, 3, 1, 1)); cc->emit(Inst::kIdPsrad, tmp, 31); cc->emit(Inst::kIdPsrlq, dst, imm); @@ -4503,7 +4718,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr case UniOpVVI::kSwizzleU64x2: { // Intrinsic (SSE2 | SSE3). if (Swizzle2{imm} == swizzle(1, 0)) { - sse_mov(this, dst, src); + sse_mov(*this, dst, src); } else if (Swizzle2{imm} == swizzle(0, 0) && has_sse3()) { cc->emit(Inst::kIdMovddup, dst, src); @@ -4534,7 +4749,7 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr case UniOpVVI::kSwizzleF64x2: { // Intrinsic (SSE2 | SSE3). 
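The kSraI64 fallback above composes the missing 64-bit arithmetic shift from a logical shift (PSRLQ) plus sign bits derived from PSRAD(…, 31). The same decomposition in scalar form (a sketch, not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

static int64_t sra64_emulated(int64_t x, unsigned imm) {
  if (imm == 0)
    return x;                                                      // matches the early-out
  uint64_t logical   = uint64_t(x) >> imm;                         // PSRLQ
  uint64_t sign_fill = x < 0 ? ~uint64_t(0) << (64u - imm) : 0u;   // from PSRAD(31)
  return int64_t(logical | sign_fill);
}

int main() {
  assert(sra64_emulated(-256, 4) == -16);
  assert(sra64_emulated(256, 4) == 16);
  assert(sra64_emulated(-1, 63) == -1);
  return 0;
}
```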
if (Swizzle2{imm} == swizzle(1, 0)) { - sse_mov(this, dst, src); + sse_mov(*this, dst, src); } else if (Swizzle2{imm} == swizzle(0, 0) && has_sse3()) { cc->emit(Inst::kIdMovddup, dst, src); @@ -4575,11 +4790,11 @@ void UniCompiler::emit_2vi(UniOpVVI op, const Operand_& dst_, const Operand_& sr } } -void UniCompiler::emit_2vi(UniOpVVI op, const OpArray& dst_, const Operand_& src_, uint32_t imm) noexcept { emit_2vi_t(this, op, dst_, src_, imm); } -void UniCompiler::emit_2vi(UniOpVVI op, const OpArray& dst_, const OpArray& src_, uint32_t imm) noexcept { emit_2vi_t(this, op, dst_, src_, imm); } +void UniCompiler::emit_2vi(UniOpVVI op, const OpArray& dst_, const Operand_& src_, uint32_t imm) noexcept { emit_2vi_t(*this, op, dst_, src_, imm); } +void UniCompiler::emit_2vi(UniOpVVI op, const OpArray& dst_, const OpArray& src_, uint32_t imm) noexcept { emit_2vi_t(*this, op, dst_, src_, imm); } -// ujit::UniCompiler - Vector Instructions - Emit 2VS -// =================================================== +// ujit::UniCompiler - Vector Instructions - Emit 2VS +// ================================================== void UniCompiler::emit_2vs(UniOpVR op, const Operand_& dst_, const Operand_& src_, uint32_t idx) noexcept { UniOpVInfo op_info = opcode_info_2vs[size_t(op)]; @@ -4867,8 +5082,8 @@ void UniCompiler::emit_2vs(UniOpVR op, const Operand_& dst_, const Operand_& src } } -// ujit::UniCompiler - Vector Instructions - Emit 2VM -// =================================================== +// ujit::UniCompiler - Vector Instructions - Emit 2VM +// ================================================== void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignment alignment, uint32_t idx) noexcept { ASMJIT_ASSERT(dst_.is_vec()); @@ -4886,7 +5101,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen case UniOpVM::kLoad8: { dst = dst.xmm(); src.set_size(1); - avx_zero(this, dst); + avx_zero(*this, dst); cc->vpinsrb(dst, dst, src, 0); return; } @@ -4895,7 +5110,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen if (!has_avx512_fp16()) { dst = dst.xmm(); src.set_size(1); - avx_zero(this, dst); + avx_zero(*this, dst); cc->vpinsrw(dst, dst, src, 0); } [[fallthrough]]; @@ -5147,7 +5362,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen case UniOpVM::kLoadCvt32_U32ToU64: { src.set_size(4); cc->vmovd(dst, src); - sse_int_widen(this, dst, dst, WideningOp(op_info.cvt)); + sse_int_widen(*this, dst, dst, WideningOp(op_info.cvt)); return; } @@ -5170,7 +5385,7 @@ void UniCompiler::emit_vm(UniOpVM op, const Vec& dst_, const Mem& src_, Alignmen } else { cc->movq(dst, src); - sse_int_widen(this, dst, dst, WideningOp(op_info.cvt)); + sse_int_widen(*this, dst, dst, WideningOp(op_info.cvt)); } return; } @@ -5651,8 +5866,8 @@ void UniCompiler::emit_mv(UniOpMV op, const Mem& dst_, const OpArray& src_, Alig } } -// ujit::UniCompiler - Vector Instructions - Emit 3V -// ================================================== +// ujit::UniCompiler - Vector Instructions - Emit 3V +// ================================================= void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src1_, const Operand_& src2_) noexcept { ASMJIT_ASSERT(dst_.is_vec()); @@ -5678,9 +5893,9 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src if (is_same_vec(src1v, src2)) { switch (SameVecOp(op_info.same_vec_op)) { - case SameVecOp::kZero: avx_zero(this, dst); return; - case 
SameVecOp::kOnes: avx_ones(this, dst); return; - case SameVecOp::kSrc: avx_mov(this, dst, src1v); return; + case SameVecOp::kZero: avx_zero(*this, dst); return; + case SameVecOp::kOnes: avx_ones(*this, dst); return; + case SameVecOp::kSrc: avx_mov(*this, dst, src1v); return; default: break; @@ -5691,8 +5906,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src ASMJIT_ASSERT(inst_id != Inst::kIdNone); FloatMode fm = FloatMode(op_info.float_mode); - - if (fm == FloatMode::kF32S || fm == FloatMode::kF64S) { + if (is_scalar_fp_op(fm)) { dst.set_signature(signature_of_xmm_ymm_zmm[0]); src1v.set_signature(signature_of_xmm_ymm_zmm[0]); @@ -5752,17 +5966,35 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src inst_id = Inst::kIdVpandn; if (src2.is_mem()) { - src2 = UniCompiler_loadNew(this, dst, src2.as(), op_info.broadcast_size); + src2 = UniCompiler_load_new(*this, dst, src2.as(), op_info.broadcast_size); } cc->emit(inst_id, dst, src2, src1v); return; } + // dst = a - (floor(a / b) * b). + case UniOpVVV::kModF32S: + case UniOpVVV::kModF64S: + case UniOpVVV::kModF32: + case UniOpVVV::kModF64: { + FloatMode fm = FloatMode(op_info.float_mode); + UniOpVV trunc_op = translate_op(op, UniOpVVV::kModF32S, UniOpVV::kTruncF32); + const FloatInst& fi = avx_float_inst[fm]; + + x86::Vec tmp = new_similar_reg(dst, "@mod_tmp"); + cc->emit(fi.fdiv, tmp, src1v, src2); + emit_2v(trunc_op, tmp, tmp); + cc->emit(fi.fmul, tmp, tmp, src2); + cc->emit(fi.fsub, dst, src1v, tmp); + + return; + } + case UniOpVVV::kMulU64: { // Native operation requires AVX512, which is not supported by the target. if (src2.is_mem()) { - src2 = UniCompiler_loadNew(this, dst, src2.as(), op_info.broadcast_size); + src2 = UniCompiler_load_new(*this, dst, src2.as(), op_info.broadcast_size); } Vec src2v = src2.as().clone_as(dst); @@ -5789,7 +6021,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src Vec tmp = new_similar_reg(dst.as(), "@tmp"); if (has_avx512()) { - Vec msk = simd_vec_const(&ct.p_FFFFFFFF00000000, Bcst::k64, dst); + Vec msk = simd_vec_const(&ct().p_FFFFFFFF00000000, Bcst::k64, dst); cc->emit(Inst::kIdVpandnq, tmp, msk, src2); cc->emit(Inst::kIdVpmullq, dst, src1v, tmp); } @@ -5807,7 +6039,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kMaxI64: { // Native operation requires AVX512, which is not supported by the target. 
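The new kModF32/kModF64 expansion is a - trunc(a/b) * b; despite the "floor" wording in the comment, the sequence uses the truncating round op, so the remainder keeps the sign of the dividend like C's fmod(). A scalar sketch (not part of the patch; unlike a true fmod(), the division step can lose precision for very large quotients):

```cpp
#include <cassert>
#include <cmath>

static float mod_via_trunc(float a, float b) {
  return a - std::truncf(a / b) * b;  // fdiv, trunc, fmul, fsub
}

int main() {
  assert(mod_via_trunc(7.5f, 2.0f) == 1.5f);
  assert(mod_via_trunc(-7.5f, 2.0f) == -1.5f);  // sign follows the dividend
  assert(mod_via_trunc(7.5f, -2.0f) == 1.5f);
  return 0;
}
```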
if (src2.is_mem()) { - src2 = UniCompiler_loadNew(this, dst, src2.as(), op_info.broadcast_size); + src2 = UniCompiler_load_new(*this, dst, src2.as(), op_info.broadcast_size); } ASMJIT_ASSERT(src2.is_vec()); @@ -5829,7 +6061,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kMinU64: case UniOpVVV::kMaxU64: { if (src2.is_mem()) { - src2 = UniCompiler_loadNew(this, dst, src2.as(), op_info.broadcast_size); + src2 = UniCompiler_load_new(*this, dst, src2.as(), op_info.broadcast_size); } ASMJIT_ASSERT(src2.is_vec()); @@ -5842,8 +6074,8 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src tmp1 = new_similar_reg(dst, "@tmp1"); } - avx_isign_flip(this, tmp1, src1v, ElementSize::k64); - avx_isign_flip(this, tmp2, src2v, ElementSize::k64); + avx_isign_flip(*this, tmp1, src1v, ElementSize::k64); + avx_isign_flip(*this, tmp2, src2v, ElementSize::k64); cc->vpcmpgtq(tmp1, tmp1, tmp2); // tmp1 = src1 > src2 if (op == UniOpVVV::kMinU64) @@ -5867,19 +6099,19 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->emit(inst.pmin, dst, src1v, src2); cc->emit(inst.peq, dst, dst, src1v); } - avx_bit_not(this, dst, dst); + avx_bit_not(*this, dst, dst); return; } case UniOpVVV::kCmpGtU64: case UniOpVVV::kCmpLeU64: { Vec tmp = new_similar_reg(dst, "@tmp"); - avx_isign_flip(this, tmp, src2, ElementSize::k64); - avx_isign_flip(this, dst, src1v, ElementSize::k64); + avx_isign_flip(*this, tmp, src2, ElementSize::k64); + avx_isign_flip(*this, dst, src1v, ElementSize::k64); cc->emit(Inst::kIdVpcmpgtq, dst, dst, tmp); if (op == UniOpVVV::kCmpLeU64) { - avx_bit_not(this, dst, dst); + avx_bit_not(*this, dst, dst); } return; } @@ -5918,7 +6150,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpGeI64: { if (!src2.is_reg()) { Vec tmp = new_similar_reg(dst, "@tmp"); - avx_mov(this, tmp, src2); + avx_mov(*this, tmp, src2); src2 = tmp; } @@ -5926,7 +6158,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->emit(inst.pgt, dst, src2, src1v); if (op == UniOpVVV::kCmpGeI64) { - avx_bit_not(this, dst, dst); + avx_bit_not(*this, dst, dst); } return; } @@ -5937,14 +6169,14 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpLtU64: case UniOpVVV::kCmpGeU64: { Vec tmp = new_similar_reg(dst, "@tmp"); - avx_isign_flip(this, tmp, src2, ElementSize(op_info.element_size)); - avx_isign_flip(this, dst, src1v, ElementSize(op_info.element_size)); + avx_isign_flip(*this, tmp, src2, ElementSize(op_info.element_size)); + avx_isign_flip(*this, dst, src1v, ElementSize(op_info.element_size)); CmpMinMaxInst inst = avx_cmp_min_max[(size_t(op) - size_t(UniOpVVV::kCmpLtI8)) & 0x7u]; cc->emit(inst.pgt, dst, tmp, dst); if (op == UniOpVVV::kCmpGeU64) { - avx_bit_not(this, dst, dst); + avx_bit_not(*this, dst, dst); } return; } @@ -5979,7 +6211,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpLeI64: { cc->emit(Inst::kIdVpcmpgtq, dst, src1v, src2); - avx_bit_not(this, dst, dst); + avx_bit_not(*this, dst, dst); return; } @@ -6004,7 +6236,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src // Intrinsic - dst = {src1.u64[0], src2.64[1]} - combining low part of src1 and high part of src1. 
if (!src2.is_reg()) { Vec tmp = new_similar_reg(dst, "@tmp"); - avx_mov(this, tmp, src2); + avx_mov(*this, tmp, src2); src2 = tmp; } @@ -6026,7 +6258,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src if (!src2.is_reg()) { Vec tmp = new_similar_reg(dst, "@tmp"); - avx_mov(this, tmp, src2); + avx_mov(*this, tmp, src2); src2 = tmp; } @@ -6072,7 +6304,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src return; case SameVecOp::kSrc: - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); return; default: @@ -6086,11 +6318,11 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src if (!is_same_vec(dst, src1v)) { if (is_same_vec(dst, src2)) { Vec tmp = new_similar_reg(dst, "tmp"); - sse_mov(this, tmp, src2); + sse_mov(*this, tmp, src2); src2 = tmp; } - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); } if (op_info.use_imm) @@ -6112,15 +6344,37 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src if (is_same_vec(dst, src1v)) { Vec tmp = new_similar_reg(dst); - sse_mov(this, tmp, src1v); + sse_mov(*this, tmp, src1v); src1v = tmp; } - sse_mov(this, dst, src2); + sse_mov(*this, dst, src2); cc->emit(inst_id, dst, src1v); return; } + // dst = a - (floor(a / b) * b). + case UniOpVVV::kModF32S: + case UniOpVVV::kModF64S: + case UniOpVVV::kModF32: + case UniOpVVV::kModF64: { + FloatMode fm = FloatMode(op_info.float_mode); + UniOpVV trunc_op = translate_op(op, UniOpVVV::kModF32S, UniOpVV::kTruncF32); + const FloatInst& fi = sse_float_inst[fm]; + + x86::Vec tmp = new_similar_reg(dst, "@mod_tmp"); + + cc->emit(fi.fmova, tmp, src1v); + cc->emit(fi.fdiv, tmp, src2); + + emit_2v(trunc_op, tmp, tmp); + cc->emit(fi.fmul, tmp, src2); + + sse_fmov(*this, dst, src1v, fm); + cc->emit(fi.fsub, dst, tmp); + return; + } + case UniOpVVV::kMulU32: { // Native operation requires SSE4.1, which is not supported by the target. 
Vec tmp1 = new_similar_reg(dst, "tmp1"); @@ -6130,7 +6384,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->emit(Inst::kIdPshufd, tmp2, src2, x86::shuffle_imm(3, 3, 1, 1)); cc->emit(Inst::kIdPmuludq, tmp1, tmp2); - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPmuludq, dst, src2); cc->emit(Inst::kIdShufps, dst, tmp1, x86::shuffle_imm(2, 0, 2, 0)); cc->emit(Inst::kIdPshufd, dst, dst, x86::shuffle_imm(3, 1, 2, 0)); @@ -6149,7 +6403,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->emit(Inst::kIdPmuludq, ah_bl, src2); cc->emit(Inst::kIdPaddq, al_bh, ah_bl); - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPmuludq, dst, src2); cc->emit(Inst::kIdPsllq, al_bh, 32); cc->emit(Inst::kIdPaddq, dst, al_bh); @@ -6166,7 +6420,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->emit(Inst::kIdPmuludq, dst, src1v); } else { - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPmuludq, dst, src2); } cc->emit(Inst::kIdPsllq, tmp, 32); @@ -6179,8 +6433,8 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kMinI64: if (!has_sse4_2()) { Vec msk = new_vec128("@msk"); - sse_cmp_gt_i64(this, msk, src2, src1v); - sse_select(this, dst, src1v, src2, msk); + sse_cmp_gt_i64(*this, msk, src2, src1v); + sse_select(*this, dst, src1v, src2, msk); return; } [[fallthrough]]; @@ -6193,7 +6447,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src Vec msk = new_vec128("@msk"); cc->emit(Inst::kIdMovaps, msk, src2); cc->emit(cmp_inst_id, msk, src1v); - sse_select(this, dst, src1v, src2, msk); + sse_select(*this, dst, src1v, src2, msk); return; } @@ -6201,8 +6455,8 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src // Native operation requires AVX512, which is not supported by the target. if (!has_sse4_2()) { Vec msk = new_vec128("@msk"); - sse_cmp_gt_i64(this, msk, src1v, src2); - sse_select(this, dst, src1v, src2, msk); + sse_cmp_gt_i64(*this, msk, src1v, src2); + sse_select(*this, dst, src1v, src2, msk); return; } [[fallthrough]]; @@ -6215,7 +6469,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src Vec msk = new_vec128("@msk"); cc->emit(Inst::kIdMovaps, msk, src1v); cc->emit(cmp_inst_id, msk, src2); - sse_select(this, dst, src1v, src2, msk); + sse_select(*this, dst, src1v, src2, msk); return; } @@ -6224,14 +6478,14 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src Vec tmp = new_vec128("@tmp"); cc->emit(Inst::kIdMovaps, tmp, src1v); cc->emit(Inst::kIdPsubusw, tmp, src2); - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPsubw, dst, tmp); return; } case UniOpVVV::kMaxU16: { // Native operation requires SSE4.1, which is not supported by the target. - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPsubusw, dst, src2); cc->emit(Inst::kIdPaddw, dst, src2); return; @@ -6240,47 +6494,47 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kMinU32: case UniOpVVV::kMaxU32: { // Native operation requires SSE4.1, which is not supported by the target. 
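The kMulU64 fallback above assembles the low 64 bits of a 64x64 product from three PMULUDQ-style 32x32->64 multiplies: lo*lo + ((lo*hi + hi*lo) << 32). A scalar sketch of that decomposition (not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

static uint64_t mul64_from_32x32(uint64_t a, uint64_t b) {
  uint64_t al = a & 0xFFFFFFFFu, ah = a >> 32;
  uint64_t bl = b & 0xFFFFFFFFu, bh = b >> 32;
  uint64_t al_bh = al * bh;   // PMULUDQ with swapped halves
  uint64_t ah_bl = ah * bl;
  uint64_t al_bl = al * bl;
  return al_bl + ((al_bh + ah_bl) << 32);  // ah*bh*2^64 falls out of the low half
}

int main() {
  assert(mul64_from_32x32(0xDEADBEEFCAFEBABEull, 0x0123456789ABCDEFull)
         == 0xDEADBEEFCAFEBABEull * 0x0123456789ABCDEFull);
  return 0;
}
```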
- Operand flip_mask = simd_const(&ct.p_8000000080000000, Bcst::kNA, dst); + Operand flip_mask = simd_const(&ct().p_8000000080000000, Bcst::kNA, dst); Vec tmp1 = new_similar_reg(dst, "@tmp1"); Vec tmp2 = new_similar_reg(dst, "@tmp2"); if (op == UniOpVVV::kMinU32) { - sse_mov(this, tmp1, src2); - sse_mov(this, tmp2, src1v); + sse_mov(*this, tmp1, src2); + sse_mov(*this, tmp2, src1v); } else { - sse_mov(this, tmp1, src1v); - sse_mov(this, tmp2, src2); + sse_mov(*this, tmp1, src1v); + sse_mov(*this, tmp2, src2); } cc->emit(Inst::kIdPxor, tmp1, flip_mask); cc->emit(Inst::kIdPxor, tmp2, flip_mask); cc->emit(Inst::kIdPcmpgtd, tmp1, tmp2); - sse_select(this, dst, src1v, src2, tmp1); + sse_select(*this, dst, src1v, src2, tmp1); return; } case UniOpVVV::kMinU64: { // Native operation requires AVX512, which is not supported by the target. Vec msk = new_similar_reg(dst, "@tmp1"); - sse_cmp_gt_u64(this, msk, src2, src1v); - sse_select(this, dst, src1v, src2, msk); + sse_cmp_gt_u64(*this, msk, src2, src1v); + sse_select(*this, dst, src1v, src2, msk); return; } case UniOpVVV::kMaxU64: { // Native operation requires AVX512, which is not supported by the target. Vec msk = new_similar_reg(dst, "@tmp1"); - sse_cmp_gt_u64(this, msk, src1v, src2); - sse_select(this, dst, src1v, src2, msk); + sse_cmp_gt_u64(*this, msk, src1v, src2); + sse_select(*this, dst, src1v, src2, msk); return; } case UniOpVVV::kCmpEqU64: { // Native operation requires SSE4.1, which is not supported by the target. Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPcmpeqd, dst, src2); cc->emit(Inst::kIdPshufd, tmp, dst, x86::shuffle_imm(2, 3, 0, 1)); cc->emit(Inst::kIdPand, dst, tmp); @@ -6289,7 +6543,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpGtI64: { // Native operation requires SSE4.2, which is not supported by the target. - sse_cmp_gt_i64(this, dst, src1v, src2); + sse_cmp_gt_i64(*this, dst, src1v, src2); return; } @@ -6315,20 +6569,20 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->emit(inst.peq, dst, src1v); } - sse_bit_not(this, dst, dst); + sse_bit_not(*this, dst, dst); return; } Vec tmp = new_similar_reg(dst, "@tmp"); - sse_msb_flip(this, tmp, src2, ElementSize(op_info.element_size)); - sse_msb_flip(this, dst, src1v, ElementSize(op_info.element_size)); + sse_msb_flip(*this, tmp, src2, ElementSize(op_info.element_size)); + sse_msb_flip(*this, dst, src1v, ElementSize(op_info.element_size)); cc->emit(inst.pgt, dst, tmp); return; } case UniOpVVV::kCmpGtU64: { // Native operation requires AVX512, which is not supported by the target. - sse_cmp_gt_u64(this, dst, src1v, src2); + sse_cmp_gt_u64(*this, dst, src1v, src2); return; } @@ -6363,11 +6617,11 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src if (op == UniOpVVV::kCmpGeU16) { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src1v); + sse_mov(*this, tmp, src1v); cc->emit(Inst::kIdPsubusw, tmp, src2); cc->emit(Inst::kIdPaddw, tmp, src2); - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPcmpeqw, dst, tmp); return; } @@ -6378,7 +6632,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src // Native operation requires AVX512, which is not supported by the target. 
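The kCmpEqU64 fallback above uses only PCMPEQD: a 64-bit lane is equal iff both of its dwords are equal, so the dword masks are intersected after a PSHUFD(2,3,0,1) swap. A scalar sketch (not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

static uint64_t cmpeq64_via_cmpeq32(uint64_t a, uint64_t b) {
  uint32_t eq_lo = (uint32_t(a) == uint32_t(b)) ? 0xFFFFFFFFu : 0u;  // PCMPEQD
  uint32_t eq_hi = ((a >> 32) == (b >> 32))     ? 0xFFFFFFFFu : 0u;
  uint32_t both  = eq_lo & eq_hi;                                    // PSHUFD swap + PAND
  return (uint64_t(both) << 32) | both;
}

int main() {
  assert(cmpeq64_via_cmpeq32(42, 42) == ~uint64_t(0));
  assert(cmpeq64_via_cmpeq32(42, 43) == 0);
  assert(cmpeq64_via_cmpeq32(0x1234567800000000ull, 0x9999999900000000ull) == 0);
  return 0;
}
```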
if (src2.is_mem()) { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src2); + sse_mov(*this, tmp, src2); src2 = tmp; } @@ -6393,7 +6647,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src ASMJIT_NOT_REACHED(); } - sse_bit_not(this, dst, dst); + sse_bit_not(*this, dst, dst); return; case UniOpVVV::kCmpLtI8: @@ -6401,11 +6655,11 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpLtI32: { if (is_same_vec(dst, src1v)) { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src1v); + sse_mov(*this, tmp, src1v); src1v = tmp; } - sse_mov(this, dst, src2); + sse_mov(*this, dst, src2); cc->emit(inst_id, dst, src1v); return; } @@ -6414,36 +6668,36 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpLtU16: case UniOpVVV::kCmpLtU32: { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src1v); - sse_msb_flip(this, tmp, src1v, ElementSize(op_info.element_size)); - sse_msb_flip(this, dst, src2, ElementSize(op_info.element_size)); + sse_mov(*this, tmp, src1v); + sse_msb_flip(*this, tmp, src1v, ElementSize(op_info.element_size)); + sse_msb_flip(*this, dst, src2, ElementSize(op_info.element_size)); cc->emit(inst_id, dst, tmp); return; } case UniOpVVV::kCmpLtI64: { // Native operation requires AVX512, which is not supported by the target. - sse_cmp_gt_i64(this, dst, src2, src1v); + sse_cmp_gt_i64(*this, dst, src2, src1v); return; } case UniOpVVV::kCmpLtU64: { // Native operation requires AVX512, which is not supported by the target. - sse_cmp_gt_u64(this, dst, src2, src1v); + sse_cmp_gt_u64(*this, dst, src2, src1v); return; } case UniOpVVV::kCmpLeU8: { if (is_same_vec(dst, src2)) { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src2); + sse_mov(*this, tmp, src2); src2 = tmp; } - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPsubusb, dst, src2); - Vec zeros = simd_vec_const(&ct.p_0000000000000000, Bcst::k32, dst); + Vec zeros = simd_vec_const(&ct().p_0000000000000000, Bcst::k32, dst); cc->emit(Inst::kIdPcmpeqb, dst, zeros); return; } @@ -6489,7 +6743,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src ASMJIT_NOT_REACHED(); } - sse_bit_not(this, dst, dst); + sse_bit_not(*this, dst, dst); return; case UniOpVVV::kCmpLtF32S: @@ -6506,8 +6760,8 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src // Unfortunately we have to do two moves, because there are no predicates that // we could use in case of reversed operands (AVX is much better in this regard). 
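The unsigned-compare fallbacks above hinge on saturating subtraction: kCmpLeU8 tests PSUBUSB against zero (a <= b iff the saturated difference is zero), and kCmpGeU16 rebuilds max(a, b) via PSUBUSW + PADDW and compares it with a. A scalar sketch (not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

static bool le_u8(uint8_t a, uint8_t b) {
  uint8_t sat = uint8_t(a > b ? a - b : 0);  // PSUBUSB
  return sat == 0;                           // PCMPEQB against zero
}

static bool ge_u16(uint16_t a, uint16_t b) {
  uint16_t sat    = uint16_t(a > b ? a - b : 0);  // PSUBUSW
  uint16_t max_ab = uint16_t(sat + b);            // PADDW -> max(a, b)
  return max_ab == a;                             // PCMPEQW
}

int main() {
  for (unsigned a = 0; a < 256; a++)
    for (unsigned b = 0; b < 256; b++) {
      assert(le_u8(uint8_t(a), uint8_t(b)) == (a <= b));
      assert(ge_u16(uint16_t(a), uint16_t(b)) == (a >= b));
    }
  return 0;
}
```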
Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src2); - sse_mov(this, dst, src1v); + sse_mov(*this, tmp, src2); + sse_mov(*this, dst, src1v); cc->emit(inst_id, dst, tmp, pred); return; } @@ -6530,7 +6784,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCmpUnordF32: case UniOpVVV::kCmpUnordF64: { uint8_t pred = sse_fcmp_imm_table[(size_t(op) - size_t(UniOpVVV::kCmpEqF32S)) / 4u]; - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(inst_id, dst, src2, pred); return; } @@ -6547,14 +6801,14 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src uint8_t pred = sse_fcmp_imm_table[(size_t(op) - size_t(UniOpVVV::kCmpEqF32S)) / 4u]; if (dst.id() != src1v.id()) { - sse_mov(this, dst, src2); + sse_mov(*this, dst, src2); cc->emit(inst_id, dst, src1v, pred); } else { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src2); + sse_mov(*this, tmp, src2); cc->emit(inst_id, tmp, src1v, pred); - sse_mov(this, dst, tmp); + sse_mov(*this, dst, tmp); } return; } @@ -6579,7 +6833,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src if (src2.is_mem()) { Mem m(src2.as()); - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); v_swap_f64(tmp, dst); cc->movhpd(dst, m); @@ -6588,16 +6842,16 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->addpd(dst, tmp); } else if (is_same_vec(dst, src2)) { - sse_mov(this, tmp, src1v); + sse_mov(*this, tmp, src1v); cc->unpcklpd(tmp, src2.as()); cc->movhlps(dst, src1v); cc->addpd(dst, tmp); } else { - sse_mov(this, tmp, src1v); + sse_mov(*this, tmp, src1v); cc->unpckhpd(tmp, src2.as()); - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->unpcklpd(dst, src2.as()); cc->addpd(dst, tmp.as()); @@ -6642,7 +6896,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src case UniOpVVV::kCombineHiLoF64: { // Intrinsic - dst = {src1.u64[1], src2.64[0]} - combining high part of src1 and low part of src2. if (src2.is_mem()) { - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdMovlpd, dst, src2); } else if (is_same_vec(dst, src2)) { @@ -6651,7 +6905,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } else { // dst = {src1.u64[1], src2.u64[0]} - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdMovsd, dst, src2); } return; @@ -6669,15 +6923,15 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src // In general, if you hit this code-path (not having SSE4.1 and still needing exactly this instruction) I would // recommend using a different strategy in this case, completely avoiding this code path. Usually, inputs are not // arbitrary and knowing the range could help a lot to reduce the approach to use a native 'packssdw' instruction. 
- Operand bias = simd_const(&ct.p_0000800000008000, Bcst::kNA, dst); - Operand unbias = simd_const(&ct.p_8000800080008000, Bcst::kNA, dst); + Operand bias = simd_const(&ct().p_0000800000008000, Bcst::kNA, dst); + Operand unbias = simd_const(&ct().p_8000800080008000, Bcst::kNA, dst); if (is_same_vec(src1v, src2)) { Vec tmp = dst; if (is_same_vec(dst, src1v)) tmp = new_similar_reg(dst, "@tmp1"); - sse_mov(this, tmp, src1v); + sse_mov(*this, tmp, src1v); cc->emit(Inst::kIdPsrad, tmp, 31); cc->emit(Inst::kIdPandn, tmp, src1v); @@ -6685,14 +6939,14 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->emit(Inst::kIdPackssdw, tmp, tmp); cc->emit(Inst::kIdPaddw, tmp, unbias); - sse_mov(this, dst, tmp); + sse_mov(*this, dst, tmp); } else { Vec tmp1 = new_similar_reg(dst, "@tmp1"); Vec tmp2 = new_similar_reg(dst, "@tmp2"); - sse_mov(this, tmp1, src1v); - sse_mov(this, tmp2, src2); + sse_mov(*this, tmp1, src1v); + sse_mov(*this, tmp2, src2); cc->emit(Inst::kIdPsrad, tmp1, 31); cc->emit(Inst::kIdPsrad, tmp2, 31); @@ -6703,7 +6957,7 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src cc->emit(Inst::kIdPackssdw, tmp1, tmp2); cc->emit(Inst::kIdPaddw, tmp1, unbias); - sse_mov(this, dst, tmp1); + sse_mov(*this, dst, tmp1); } return; } @@ -6723,9 +6977,9 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src // The trick is to AND all indexes by 0x0F and then to do unsigned minimum so all indexes are in [0, 17) range, // where index 16 maps to zero. Vec tmp = new_similar_reg(dst, "@tmp"); - cc->vmovaps(tmp, simd_mem_const(&ct.p_0F0F0F0F0F0F0F0F, Bcst::kNA, tmp)); + cc->vmovaps(tmp, simd_mem_const(&ct().p_0F0F0F0F0F0F0F0F, Bcst::kNA, tmp)); cc->pand(tmp, src2.as()); - cc->pminub(tmp, simd_mem_const(&ct.p_1010101010101010, Bcst::kNA, tmp)); + cc->pminub(tmp, simd_mem_const(&ct().p_1010101010101010, Bcst::kNA, tmp)); cc->movaps(m_pred, tmp); cc->mov(m_data.clone_adjusted(16), 0); @@ -6758,12 +7012,12 @@ void UniCompiler::emit_3v(UniOpVVV op, const Operand_& dst_, const Operand_& src } } -void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_) noexcept { emit_3v_t(this, op, dst_, src1_, src2_); } -void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_) noexcept { emit_3v_t(this, op, dst_, src1_, src2_); } -void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_) noexcept { emit_3v_t(this, op, dst_, src1_, src2_); } +void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_) noexcept { emit_3v_t(*this, op, dst_, src1_, src2_); } +void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_) noexcept { emit_3v_t(*this, op, dst_, src1_, src2_); } +void UniCompiler::emit_3v(UniOpVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_) noexcept { emit_3v_t(*this, op, dst_, src1_, src2_); } -// ujit::UniCompiler - Vector Instructions - Emit 3VI -// =================================================== +// ujit::UniCompiler - Vector Instructions - Emit 3VI +// ================================================== void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& src1_, const Operand_& src2_, uint32_t imm) noexcept { ASMJIT_ASSERT(dst_.is_vec()); @@ -6791,7 +7045,7 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s // 
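The biased PACKSSDW sequence above maps unsigned saturation onto the signed saturation SSE2 actually provides: clamp negatives to zero (PSRAD 31 + PANDN), subtract 0x8000, pack with signed saturation, add 0x8000 back. A scalar sketch of one lane (not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

static uint16_t pack_u16_sat_via_packssdw(int32_t x) {
  int32_t non_neg = x < 0 ? 0 : x;                    // PSRAD 31 + PANDN
  int32_t biased  = non_neg - 0x8000;                 // PSUBD with p_0000800000008000
  int32_t packed  = biased < -32768 ? -32768
                  : biased >  32767 ?  32767 : biased; // PACKSSDW (signed clamp)
  return uint16_t(uint32_t(packed) + 0x8000u);        // PADDW with p_8000800080008000
}

int main() {
  assert(pack_u16_sat_via_packssdw(-5) == 0);
  assert(pack_u16_sat_via_packssdw(100) == 100);
  assert(pack_u16_sat_via_packssdw(65535) == 65535);
  assert(pack_u16_sat_via_packssdw(70000) == 65535);
  return 0;
}
```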
Intrin - short-circuit if possible based on the predicate. case UniOpVVVI::kAlignr_U128: { if (imm == 0) { - avx_mov(this, dst, src2); + avx_mov(*this, dst, src2); return; } @@ -6896,7 +7150,7 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s // Intrin - short-circuit if possible based on the predicate. case UniOpVVVI::kAlignr_U128: { if (imm == 0) { - sse_mov(this, dst, src2); + sse_mov(*this, dst, src2); return; } @@ -6913,11 +7167,11 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s if (has_ssse3()) { if (is_same_vec(dst, src2) && !is_same_vec(dst, src1v)) { Vec tmp = new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src2); + sse_mov(*this, tmp, src2); src2 = tmp; } - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(Inst::kIdPalignr, dst, src2, imm); return; } @@ -6927,13 +7181,13 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s uint32_t src2_shift = imm; if (is_same_vec(dst, src1v)) { - sse_mov(this, tmp, src2); + sse_mov(*this, tmp, src2); cc->emit(Inst::kIdPsrldq, tmp, src2_shift); cc->emit(Inst::kIdPslldq, dst, src1_shift); } else { - sse_mov(this, tmp, src1v); - sse_mov(this, dst, src2); + sse_mov(*this, tmp, src1v); + sse_mov(*this, dst, src2); cc->emit(Inst::kIdPslldq, tmp, src1_shift); cc->emit(Inst::kIdPsrldq, dst, src2_shift); } @@ -6956,7 +7210,7 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s shuf_imm = shuf_imm2_from_swizzle(Swizzle2{imm}); if (is_same_vec(src1v, src2)) { - UniOpVVI vvi_op = UniOpVVI(uint32_t(UniOpVVI::kSwizzleU32x4) + (uint32_t(op) - uint32_t(UniOpVVVI::kInterleaveShuffleU32x4))); + UniOpVVI vvi_op = translate_op(op, UniOpVVVI::kInterleaveShuffleU32x4, UniOpVVI::kSwizzleU32x4); emit_2vi(vvi_op, dst, src1v, imm); return; } @@ -6975,7 +7229,7 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s cc->emit(Inst::kIdPshufd, dst, dst, x86::shuffle_imm(1, 0, 3, 2)); } else { - sse_mov(this, dst, src1v); + sse_mov(*this, dst, src1v); cc->emit(inst_id, dst, src2, shuf_imm); } return; @@ -6998,12 +7252,12 @@ void UniCompiler::emit_3vi(UniOpVVVI op, const Operand_& dst_, const Operand_& s } } -void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, uint32_t imm) noexcept { emit_3vi_t(this, op, dst_, src1_, src2_, imm); } -void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, uint32_t imm) noexcept { emit_3vi_t(this, op, dst_, src1_, src2_, imm); } -void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, uint32_t imm) noexcept { emit_3vi_t(this, op, dst_, src1_, src2_, imm); } +void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, uint32_t imm) noexcept { emit_3vi_t(*this, op, dst_, src1_, src2_, imm); } +void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, uint32_t imm) noexcept { emit_3vi_t(*this, op, dst_, src1_, src2_, imm); } +void UniCompiler::emit_3vi(UniOpVVVI op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, uint32_t imm) noexcept { emit_3vi_t(*this, op, dst_, src1_, src2_, imm); } -// ujit::UniCompiler - Vector Instructions - Emit 4V -// ================================================== +// ujit::UniCompiler - Vector Instructions - Emit 4V +// ================================================= void 
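The kAlignr_U128 fallback above builds PALIGNR from byte shifts: dst = (src2 >> 8*imm) | (src1 << 8*(16-imm)), i.e. the PSRLDQ/PSLLDQ pair it emits. A scalar sketch, modeling a 128-bit lane as a byte array with index 0 as the lowest byte (not part of the patch):

```cpp
#include <array>
#include <cassert>
#include <cstdint>

using Bytes16 = std::array<uint8_t, 16>;

static Bytes16 alignr_emulated(const Bytes16& src1, const Bytes16& src2, unsigned imm) {
  Bytes16 out{};
  for (unsigned i = 0; i < 16; i++) {
    unsigned j = i + imm;                      // index into the src1:src2 concatenation
    out[i] = j < 16 ? src2[j] : src1[j - 16];  // PSRLDQ(src2, imm) | PSLLDQ(src1, 16-imm)
  }
  return out;
}

int main() {
  Bytes16 a{}, b{};
  for (unsigned i = 0; i < 16; i++) { a[i] = uint8_t(0xA0 + i); b[i] = uint8_t(i); }
  Bytes16 r = alignr_emulated(a, b, 4);
  assert(r[0] == 4 && r[11] == 15 && r[12] == 0xA0 && r[15] == 0xA3);
  return 0;
}
```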
UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& src1_, const Operand_& src2_, const Operand_& src3_) noexcept { ASMJIT_ASSERT(dst_.is_vec()); @@ -7035,7 +7289,7 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr switch (op) { case UniOpVVVV::kBlendV_U8: { // Blend(a, b, cond) == (a & ~cond) | (b & cond) - avx_make_vec(this, src3, dst, "msk"); + avx_make_vec(*this, src3, dst, "msk"); cc->emit(op_info.avx_inst_id, dst, src1, src2, src3); return; } @@ -7100,7 +7354,7 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr size_t fma_id = size_t(op) - size_t(UniOpVVVV::kMAddF32S); FloatMode fm = FloatMode(op_info.float_mode); - if (fm == FloatMode::kF32S || fm == FloatMode::kF64S) { + if (is_scalar_fp_op(fm)) { dst.set_signature(signature_of_xmm_ymm_zmm[0]); src1.set_signature(signature_of_xmm_ymm_zmm[0]); @@ -7150,7 +7404,7 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr cc->emit(fma_bc_add_a[fma_id], dst, src1, src2); } else { - avx_mov(this, dst, src1); + avx_mov(*this, dst, src1); if (!src2.is_reg()) cc->emit(fma_ac_add_b[fma_id], dst, src3, src2); else if (!src3.is_reg()) @@ -7183,7 +7437,7 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr else { // NMAdd or NMSub Operation. Vec tmp = new_similar_reg(dst, "@tmp"); - avx_fsign_flip(this, tmp, src1, fm); + avx_fsign_flip(*this, tmp, src1, fm); cc->emit(fi.fmul, tmp, tmp, src2); cc->emit(fi_facc, dst, tmp, src3); @@ -7205,8 +7459,8 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr // Blend(a, b, cond) == (a & ~cond) | (b & cond) if (has_sse4_1()) { if (is_same_vec(dst, src1) || (!is_same_vec(dst, src2) && !is_same_vec(dst, src3))) { - sse_make_vec(this, src3, "tmp"); - sse_mov(this, dst, src1); + sse_make_vec(*this, src3, "tmp"); + sse_mov(*this, dst, src1); cc->emit(op_info.sse_inst_id, dst, src2, src3); return; } @@ -7277,7 +7531,7 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr if (is_same_vec(dst, src2)) { // Unfortunately, to follow the FMA behavior in scalar case, we have to copy. if (fm <= FloatMode::kF64S) - src2 = sse_copy(this, src2.as(), "@copy_src2"); + src2 = sse_copy(*this, src2.as(), "@copy_src2"); else std::swap(src1, src2.as()); } @@ -7288,11 +7542,11 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr if (is_same_vec(dst, src3)) { if (fm <= FloatMode::kF64S || !mul_add) { // Copy if we couldn't avoid the extra move. - src3 = sse_copy(this, src3.as(), "@copy_src3"); + src3 = sse_copy(*this, src3.as(), "@copy_src3"); } else { Vec tmp = cc->new_similar_reg(dst, "@tmp"); - sse_mov(this, tmp, src1); + sse_mov(*this, tmp, src1); cc->emit(fi.fmul, tmp, src2); cc->emit(neg_mul ? 
fi.fsub : fi.fadd, dst, tmp); return; @@ -7300,9 +7554,9 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr } if (neg_mul) - sse_fsign_flip(this, dst, src1, fm); + sse_fsign_flip(*this, dst, src1, fm); else - sse_mov(this, dst, src1); + sse_mov(*this, dst, src1); cc->emit(fi.fmul, dst, src2); cc->emit(fi_facc, dst, src3); @@ -7315,13 +7569,13 @@ void UniCompiler::emit_4v(UniOpVVVV op, const Operand_& dst_, const Operand_& sr } } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const Operand_& src2_, const OpArray& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, const Operand& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, const OpArray& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, const Operand& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, const OpArray& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const Operand& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } -void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const OpArray& src3_) noexcept { emit_4v_t(this, op, dst_, src1_, src2_, src3_); } +void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const Operand_& src2_, const OpArray& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); } +void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, const Operand& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); } +void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const Operand_& src1_, const OpArray& src2_, const OpArray& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); } +void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, const Operand& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); } +void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const Operand_& src2_, const OpArray& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); } +void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const Operand& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); } +void UniCompiler::emit_4v(UniOpVVVV op, const OpArray& dst_, const OpArray& src1_, const OpArray& src2_, const OpArray& src3_) noexcept { emit_4v_t(*this, op, dst_, src1_, src2_, src3_); } ASMJIT_END_SUB_NAMESPACE diff --git a/src/asmjit/ujit/unicondition.h b/src/asmjit/ujit/unicondition.h new file mode 100644 index 0000000..3e6ee81 --- /dev/null +++ b/src/asmjit/ujit/unicondition.h @@ -0,0 +1,293 @@ +// This file is part of AsmJit project +// +// See or LICENSE.md for license and copyright information +// SPDX-License-Identifier: Zlib + +#ifndef ASMJIT_UJIT_UNICONDITION_H_INCLUDED +#define 
ASMJIT_UJIT_UNICONDITION_H_INCLUDED + +#include "ujitbase.h" +#include "uniop.h" + +#if !defined(ASMJIT_NO_UJIT) + +ASMJIT_BEGIN_SUB_NAMESPACE(ujit) + +//! \addtogroup asmjit_ujit +//! \{ + +//! Condition represents either a condition or an assignment operation that can be checked. +class UniCondition { +public: + //! \name Members + //! \{ + + UniOpCond op; + CondCode cond; + Operand a; + Operand b; + + //! \} + + //! \name Construction & Destruction + //! \{ + + ASMJIT_INLINE_NODEBUG UniCondition(UniOpCond op, CondCode cond, const Operand& a, const Operand& b) noexcept + : op(op), + cond(cond), + a(a), + b(b) {} + + ASMJIT_INLINE_NODEBUG UniCondition(const UniCondition& other) noexcept = default; + + //! \} + + //! \name Overloaded Operators + //! \{ + + ASMJIT_INLINE_NODEBUG UniCondition& operator=(const UniCondition& other) noexcept = default; + + //! \} +}; + +//! Constructs a condition that would be `true` when `a = (a & b)` becomes zero. +static ASMJIT_INLINE UniCondition and_z(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignAnd, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a & b)` becomes zero. +static ASMJIT_INLINE UniCondition and_z(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignAnd, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a & b)` becomes zero. +static ASMJIT_INLINE UniCondition and_z(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignAnd, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a & b)` becomes non-zero. +static ASMJIT_INLINE UniCondition and_nz(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignAnd, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a & b)` becomes non-zero. +static ASMJIT_INLINE UniCondition and_nz(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignAnd, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a & b)` becomes non-zero. +static ASMJIT_INLINE UniCondition and_nz(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignAnd, CondCode::kNotZero, a, b); } + +//! Constructs a condition that would be `true` when `a = (a | b)` becomes zero. +static ASMJIT_INLINE UniCondition or_z(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignOr, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a | b)` becomes zero. +static ASMJIT_INLINE UniCondition or_z(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignOr, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a | b)` becomes zero. +static ASMJIT_INLINE UniCondition or_z(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignOr, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a | b)` becomes non-zero. +static ASMJIT_INLINE UniCondition or_nz(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignOr, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a | b)` becomes non-zero. +static ASMJIT_INLINE UniCondition or_nz(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignOr, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a | b)` becomes non-zero. 
+static ASMJIT_INLINE UniCondition or_nz(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignOr, CondCode::kNotZero, a, b); } + +//! Constructs a condition that would be `true` when `a = (a ^ b)` becomes zero. +static ASMJIT_INLINE UniCondition xor_z(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignXor, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a ^ b)` becomes zero. +static ASMJIT_INLINE UniCondition xor_z(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignXor, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a ^ b)` becomes zero. +static ASMJIT_INLINE UniCondition xor_z(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignXor, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a ^ b)` becomes non-zero. +static ASMJIT_INLINE UniCondition xor_nz(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignXor, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a ^ b)` becomes non-zero. +static ASMJIT_INLINE UniCondition xor_nz(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignXor, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a ^ b)` becomes non-zero. +static ASMJIT_INLINE UniCondition xor_nz(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignXor, CondCode::kNotZero, a, b); } + +//! Constructs a condition that would be `true` when `a = (a + b)` becomes zero. +static ASMJIT_INLINE UniCondition add_z(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` becomes zero. +static ASMJIT_INLINE UniCondition add_z(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` becomes zero. +static ASMJIT_INLINE UniCondition add_z(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` becomes non-zero. +static ASMJIT_INLINE UniCondition add_nz(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` becomes non-zero. +static ASMJIT_INLINE UniCondition add_nz(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` becomes non-zero. +static ASMJIT_INLINE UniCondition add_nz(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotZero, a, b); } + +//! Constructs a condition that would be `true` when `a = (a + b)` wraps (sets carry flag). +static ASMJIT_INLINE UniCondition add_c(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kCarry, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` wraps (sets carry flag). +static ASMJIT_INLINE UniCondition add_c(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kCarry, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` wraps (sets carry flag). 
+static ASMJIT_INLINE UniCondition add_c(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kCarry, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` doesn't wrap (doesn't set carry flag). +static ASMJIT_INLINE UniCondition add_nc(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotCarry, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` doesn't wrap (doesn't set carry flag). +static ASMJIT_INLINE UniCondition add_nc(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotCarry, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` doesn't wrap (doesn't set carry flag). +static ASMJIT_INLINE UniCondition add_nc(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotCarry, a, b); } + +//! Constructs a condition that would be `true` when `a = (a + b)` ends with the msb/sign bit set. +static ASMJIT_INLINE UniCondition add_s(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` ends with the msb/sign bit set. +static ASMJIT_INLINE UniCondition add_s(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` ends with the msb/sign bit set. +static ASMJIT_INLINE UniCondition add_s(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` ends with the msb/sign bit unset. +static ASMJIT_INLINE UniCondition add_ns(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` ends with the msb/sign bit unset. +static ASMJIT_INLINE UniCondition add_ns(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a + b)` ends with the msb/sign bit unset. +static ASMJIT_INLINE UniCondition add_ns(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignAdd, CondCode::kNotSign, a, b); } + +//! Constructs a condition that would be `true` when `a = (a - b)` becomes zero. +static ASMJIT_INLINE UniCondition sub_z(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` becomes zero. +static ASMJIT_INLINE UniCondition sub_z(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` becomes zero. +static ASMJIT_INLINE UniCondition sub_z(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` becomes non-zero. +static ASMJIT_INLINE UniCondition sub_nz(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` becomes non-zero. 
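+//!
+//! A minimal sketch of what the constructor records (the `counter` register name is only for
+//! illustration; how the condition is later materialized into instructions is up to the compiler):
+//!
+//! \code
+//! UniCondition c = sub_nz(counter, Imm(1));   // decrement-and-test pattern
+//! // c.op   == UniOpCond::kAssignSub
+//! // c.cond == CondCode::kNotZero
+//! // c.a    == counter, c.b == Imm(1)
+//! \endcode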
+static ASMJIT_INLINE UniCondition sub_nz(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` becomes non-zero. +static ASMJIT_INLINE UniCondition sub_nz(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kNotZero, a, b); } + +//! Constructs a condition that would be `true` when `a = (a - b)` wraps. +static ASMJIT_INLINE UniCondition sub_c(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedLT, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` wraps. +static ASMJIT_INLINE UniCondition sub_c(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedLT, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` wraps. +static ASMJIT_INLINE UniCondition sub_c(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedLT, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` doesn't wrap. +static ASMJIT_INLINE UniCondition sub_nc(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedGE, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` doesn't wrap. +static ASMJIT_INLINE UniCondition sub_nc(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedGE, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` doesn't wrap. +static ASMJIT_INLINE UniCondition sub_nc(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedGE, a, b); } + +//! Constructs a condition that would be `true` when `a = (a - b)` ends with the msb/sign bit set. +static ASMJIT_INLINE UniCondition sub_s(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` ends with the msb/sign bit set. +static ASMJIT_INLINE UniCondition sub_s(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` ends with the msb/sign bit set. +static ASMJIT_INLINE UniCondition sub_s(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` ends with the msb/sign bit unset. +static ASMJIT_INLINE UniCondition sub_ns(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kNotSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` ends with the msb/sign bit unset. +static ASMJIT_INLINE UniCondition sub_ns(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kNotSign, a, b); } +//! Constructs a condition that would be `true` when `a = (a - b)` ends with the msb/sign bit unset. 
+static ASMJIT_INLINE UniCondition sub_ns(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kNotSign, a, b); }
+
+//! Constructs a condition that would be `true` when `a = (a - b)` is computed and the original `a` was greater than `b` (unsigned comparison).
+static ASMJIT_INLINE UniCondition sub_ugt(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedGT, a, b); }
+//! Constructs a condition that would be `true` when `a = (a - b)` is computed and the original `a` was greater than `b` (unsigned comparison).
+static ASMJIT_INLINE UniCondition sub_ugt(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedGT, a, b); }
+//! Constructs a condition that would be `true` when `a = (a - b)` is computed and the original `a` was greater than `b` (unsigned comparison).
+static ASMJIT_INLINE UniCondition sub_ugt(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignSub, CondCode::kUnsignedGT, a, b); }
+
+//! Constructs a condition that would be `true` when `a = (a >> b)` becomes zero.
+static ASMJIT_INLINE UniCondition shr_z(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignShr, CondCode::kZero, a, b); }
+//! Constructs a condition that would be `true` when `a = (a >> b)` becomes zero.
+static ASMJIT_INLINE UniCondition shr_z(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignShr, CondCode::kZero, a, b); }
+//! Constructs a condition that would be `true` when `a = (a >> b)` becomes zero.
+static ASMJIT_INLINE UniCondition shr_z(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignShr, CondCode::kZero, a, b); }
+//! Constructs a condition that would be `true` when `a = (a >> b)` becomes non-zero.
+static ASMJIT_INLINE UniCondition shr_nz(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kAssignShr, CondCode::kNotZero, a, b); }
+//! Constructs a condition that would be `true` when `a = (a >> b)` becomes non-zero.
+static ASMJIT_INLINE UniCondition shr_nz(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kAssignShr, CondCode::kNotZero, a, b); }
+//! Constructs a condition that would be `true` when `a = (a >> b)` becomes non-zero.
+static ASMJIT_INLINE UniCondition shr_nz(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kAssignShr, CondCode::kNotZero, a, b); }
+
+//! Constructs a condition that would be `true` when `a == b`.
+static ASMJIT_INLINE UniCondition cmp_eq(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kEqual, a, b); }
+//! Constructs a condition that would be `true` when `a == b`.
+static ASMJIT_INLINE UniCondition cmp_eq(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kEqual, a, b); }
+//! Constructs a condition that would be `true` when `a == b`.
+static ASMJIT_INLINE UniCondition cmp_eq(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kEqual, a, b); }
+
+//! Constructs a condition that would be `true` when `a != b`.
+static ASMJIT_INLINE UniCondition cmp_ne(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kNotEqual, a, b); }
+//! Constructs a condition that would be `true` when `a != b`.
+static ASMJIT_INLINE UniCondition cmp_ne(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kNotEqual, a, b); }
+//! Constructs a condition that would be `true` when `a != b`.
+static ASMJIT_INLINE UniCondition cmp_ne(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kNotEqual, a, b); }
+
+//! Constructs a condition that would be `true` when `a < b` (signed comparison).
+static ASMJIT_INLINE UniCondition scmp_lt(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedLT, a, b); } +//! Constructs a condition that would be `true` when `a < b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_lt(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedLT, a, b); } +//! Constructs a condition that would be `true` when `a < b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_lt(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedLT, a, b); } + +//! Constructs a condition that would be `true` when `a <= b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_le(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedLE, a, b); } +//! Constructs a condition that would be `true` when `a <= b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_le(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedLE, a, b); } +//! Constructs a condition that would be `true` when `a <= b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_le(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedLE, a, b); } + +//! Constructs a condition that would be `true` when `a > b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_gt(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedGT, a, b); } +//! Constructs a condition that would be `true` when `a > b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_gt(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedGT, a, b); } +//! Constructs a condition that would be `true` when `a > b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_gt(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedGT, a, b); } + +//! Constructs a condition that would be `true` when `a >= b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_ge(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedGE, a, b); } +//! Constructs a condition that would be `true` when `a >= b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_ge(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedGE, a, b); } +//! Constructs a condition that would be `true` when `a >= b` (signed comparison). +static ASMJIT_INLINE UniCondition scmp_ge(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kSignedGE, a, b); } + +//! Constructs a condition that would be `true` when `a < b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_lt(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedLT, a, b); } +//! Constructs a condition that would be `true` when `a < b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_lt(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedLT, a, b); } +//! Constructs a condition that would be `true` when `a < b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_lt(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedLT, a, b); } + +//! Constructs a condition that would be `true` when `a <= b` (unsigned comparison). 
+static ASMJIT_INLINE UniCondition ucmp_le(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedLE, a, b); } +//! Constructs a condition that would be `true` when `a <= b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_le(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedLE, a, b); } +//! Constructs a condition that would be `true` when `a <= b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_le(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedLE, a, b); } + +//! Constructs a condition that would be `true` when `a > b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_gt(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedGT, a, b); } +//! Constructs a condition that would be `true` when `a > b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_gt(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedGT, a, b); } +//! Constructs a condition that would be `true` when `a > b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_gt(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedGT, a, b); } + +//! Constructs a condition that would be `true` when `a >= b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_ge(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedGE, a, b); } +//! Constructs a condition that would be `true` when `a >= b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_ge(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedGE, a, b); } +//! Constructs a condition that would be `true` when `a >= b` (unsigned comparison). +static ASMJIT_INLINE UniCondition ucmp_ge(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kUnsignedGE, a, b); } + +//! Constructs a condition that would be `true` when `a` is zero. +static ASMJIT_INLINE UniCondition test_z(const Gp& a) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kEqual, a, Imm(0)); } +//! Constructs a condition that would be `true` when `a` is non-zero. +static ASMJIT_INLINE UniCondition test_nz(const Gp& a) noexcept { return UniCondition(UniOpCond::kCompare, CondCode::kNotEqual, a, Imm(0)); } + +//! Constructs a condition that would be `true` when `a & b` is zero. +static ASMJIT_INLINE UniCondition test_z(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kTest, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a & b` is zero. +static ASMJIT_INLINE UniCondition test_z(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kTest, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a & b` is zero. +static ASMJIT_INLINE UniCondition test_z(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kTest, CondCode::kZero, a, b); } +//! Constructs a condition that would be `true` when `a & b` is non-zero. +static ASMJIT_INLINE UniCondition test_nz(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kTest, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a & b` is non-zero. 
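+//!
+//! As a sketch (the `flags` register and the `kDirtyBit` constant are purely illustrative), the
+//! test flavor does not modify `a`; it only records a flag test for the compiler to emit later:
+//!
+//! \code
+//! UniCondition c = test_nz(flags, Imm(kDirtyBit));
+//! // c.op   == UniOpCond::kTest
+//! // c.cond == CondCode::kNotZero
+//! \endcode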
+static ASMJIT_INLINE UniCondition test_nz(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kTest, CondCode::kNotZero, a, b); } +//! Constructs a condition that would be `true` when `a & b` is non-zero. +static ASMJIT_INLINE UniCondition test_nz(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kTest, CondCode::kNotZero, a, b); } + +//! Constructs a condition that would be `true` when a bit in `a` at `b` is zero (`((a >> b) & 1) == 0`). +static ASMJIT_INLINE UniCondition bt_z(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kBitTest, CondCode::kBTZero, a, b); } +//! Constructs a condition that would be `true` when a bit in `a` at `b` is zero (`((a >> b) & 1) == 0`). +static ASMJIT_INLINE UniCondition bt_z(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kBitTest, CondCode::kBTZero, a, b); } +//! Constructs a condition that would be `true` when a bit in `a` at `b` is zero (`((a >> b) & 1) == 0`). +static ASMJIT_INLINE UniCondition bt_z(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kBitTest, CondCode::kBTZero, a, b); } +//! Constructs a condition that would be `true` when a bit in `a` at `b` is non-zero (`((a >> b) & 1) == 1`). +static ASMJIT_INLINE UniCondition bt_nz(const Gp& a, const Gp& b) noexcept { return UniCondition(UniOpCond::kBitTest, CondCode::kBTNotZero, a, b); } +//! Constructs a condition that would be `true` when a bit in `a` at `b` is non-zero (`((a >> b) & 1) == 1`). +static ASMJIT_INLINE UniCondition bt_nz(const Gp& a, const Mem& b) noexcept { return UniCondition(UniOpCond::kBitTest, CondCode::kBTNotZero, a, b); } +//! Constructs a condition that would be `true` when a bit in `a` at `b` is non-zero (`((a >> b) & 1) == 1`). +static ASMJIT_INLINE UniCondition bt_nz(const Gp& a, const Imm& b) noexcept { return UniCondition(UniOpCond::kBitTest, CondCode::kBTNotZero, a, b); } + +//! \} + +ASMJIT_END_SUB_NAMESPACE + +#endif // !ASMJIT_NO_UJIT +#endif // ASMJIT_UJIT_UNICONDITION_H_INCLUDED diff --git a/src/asmjit/ujit/uniop.h b/src/asmjit/ujit/uniop.h index d7a3697..8e18de9 100644 --- a/src/asmjit/ujit/uniop.h +++ b/src/asmjit/ujit/uniop.h @@ -15,241 +15,264 @@ ASMJIT_BEGIN_SUB_NAMESPACE(ujit) //! \addtogroup asmjit_ujit //! \{ +//! Instruction that can be used by \ref UniCondition. enum class UniOpCond : uint32_t { - kAssignAnd, - kAssignOr, - kAssignXor, - kAssignAdd, - kAssignSub, - kAssignShr, - kTest, - kBitTest, - kCompare, + kAssignAnd, //!< Assign-and `a &= b`. + kAssignOr, //!< Assign-or `a |= b`. + kAssignXor, //!< Assign-xor `a ^= b`. + kAssignAdd, //!< Assign-add `a += b`. + kAssignSub, //!< Assign-sub `a -= b`. + kAssignShr, //!< Assign-shr `a >>= b`. + kTest, //!< Test `a & b`. + kBitTest, //!< Bit-test `a & (1 << b)`. + kCompare, //!< Compare `a <=> b`. kMaxValue = kCompare }; +//! Instruction with a single memory operand. enum class UniOpM : uint32_t { - kStoreZeroReg, - kStoreZeroU8, - kStoreZeroU16, - kStoreZeroU32, - kStoreZeroU64 + kStoreZeroReg, //!< Store zero (data-width depends on register size). + kStoreZeroU8, //!< Store zero (8-bit). + kStoreZeroU16, //!< Store zero (16-bit). + kStoreZeroU32, //!< Store zero (32-bit). + kStoreZeroU64 //!< Store zero (64-bit). }; +//! Instruction with `[reg, mem]` operands. 
enum class UniOpRM : uint32_t {
- kLoadReg,
- kLoadI8,
- kLoadU8,
- kLoadI16,
- kLoadU16,
- kLoadI32,
- kLoadU32,
- kLoadI64,
- kLoadU64,
- kLoadMergeU8,
- kLoadShiftU8,
- kLoadMergeU16,
- kLoadShiftU16
+ kLoadReg, //!< N-bit load (the size depends on the register size).
+ kLoadI8, //!< 8-bit load, sign extended.
+ kLoadU8, //!< 8-bit load, zero extended.
+ kLoadI16, //!< 16-bit load, sign extended.
+ kLoadU16, //!< 16-bit load, zero extended.
+ kLoadI32, //!< 32-bit load, sign extended.
+ kLoadU32, //!< 32-bit load, zero extended.
+ kLoadI64, //!< 64-bit load.
+ kLoadU64, //!< 64-bit load.
+ kLoadMergeU8, //!< 8-bit load and merge.
+ kLoadShiftU8, //!< 8-bit load, shift, and merge.
+ kLoadMergeU16, //!< 16-bit load and merge.
+ kLoadShiftU16 //!< 16-bit load, shift, and merge.
};
+//! Instruction with `[mem, reg]` operands.
enum class UniOpMR : uint32_t {
- kStoreReg,
- kStoreU8,
- kStoreU16,
- kStoreU32,
- kStoreU64,
- kAddReg,
- kAddU8,
- kAddU16,
- kAddU32,
- kAddU64
+ kStoreReg, //!< N-bit store (the size depends on the register size).
+ kStoreU8, //!< 8-bit store.
+ kStoreU16, //!< 16-bit store.
+ kStoreU32, //!< 32-bit store.
+ kStoreU64, //!< 64-bit store.
+ kAddReg, //!< N-bit load+add+store (the size depends on the register size).
+ kAddU8, //!< 8-bit load+add+store.
+ kAddU16, //!< 16-bit load+add+store.
+ kAddU32, //!< 32-bit load+add+store.
+ kAddU64 //!< 64-bit load+add+store.
};
-//! Arithmetic operation having 2 operands (dst, src).
+//! Instruction with `[reg, reg]` operands.
+//!
+//! Arithmetic operations having 2 operands (dst, src).
+//!
+//! \note For convenience, the second operand can be register, memory, or immediate value.
enum class UniOpRR : uint32_t {
- kAbs,
- kNeg,
- kNot,
- kBSwap,
- kCLZ,
- kCTZ,
- kReflect,
+ kAbs, //!< Absolute value of a signed integer - `dst = abs(src)`.
+ kNeg, //!< Arithmetic negation - `dst = -src` (`dst = ~src + 1`).
+ kNot, //!< Bitwise-not - `dst = ~src`.
+ kBSwap, //!< Byteswap - `dst = bswap(src)`.
+ kCLZ, //!< Count leading zeros - `dst = clz(src)`.
+ kCTZ, //!< Count trailing zeros - `dst = ctz(src)`.
+ kReflect, //!< Integer reflection.
+
+ kMaxValue = kReflect
};
+//! Instruction with `[reg, reg, reg]` operands.
+//!
+//! Arithmetic operation having 3 operands (dst, src1, src2).
+//!
+//! \note For convenience, the third operand can be register, memory, or immediate value.
enum class UniOpRRR : uint32_t {
- kAnd,
- kOr,
- kXor,
- kBic,
- kAdd,
- kSub,
- kMul,
- kUDiv,
- kUMod,
- kSMin,
- kSMax,
- kUMin,
- kUMax,
- kSll,
- kSrl,
- kSra,
- kRol,
- kRor,
- kSBound,
+ kAnd, //!< Bitwise AND `dst = src1 & src2`.
+ kOr, //!< Bitwise OR `dst = src1 | src2`.
+ kXor, //!< Bitwise XOR `dst = src1 ^ src2`.
+ kBic, //!< Bitwise BIC `dst = src1 & ~src2`.
+ kAdd, //!< Add `dst = src1 + src2`.
+ kSub, //!< Subtract `dst = src1 - src2`.
+ kMul, //!< Multiply `dst = src1 * src2`.
+ kUDiv, //!< Unsigned divide `dst = src1 / src2`.
+ kUMod, //!< Unsigned modulo `dst = src1 % src2`.
+ kSMin, //!< Signed minimum `dst = smin(src1, src2)`.
+ kSMax, //!< Signed maximum `dst = smax(src1, src2)`.
+ kUMin, //!< Unsigned minimum `dst = umin(src1, src2)`.
+ kUMax, //!< Unsigned maximum `dst = umax(src1, src2)`.
+ kSll, //!< Shift left logical `dst = src1 << src2`.
+ kSrl, //!< Shift right logical `dst = src1 >> src2`.
+ kSra, //!< Shift right arithmetic `dst = sra(src1, src2)`.
+ kRol, //!< Rotate left `dst = (src1 << src2) | (src1 >> (N_BITS - src2))`.
+ kRor, //!< Rotate right `dst = (src1 >> src2) | (src1 << (N_BITS - src2))`.
+ kSBound, //!< Signed bounds.
kMaxValue = kSBound
};
+//! Instruction with `[vec, reg]` operands.
+//!
+//! Describes instructions where a general-purpose register is either moved, converted, or
+//! inserted into a vector register.
enum class UniOpVR : uint32_t {
- kMov,
- kMovU32,
- kMovU64,
- kInsertU8,
- kInsertU16,
- kInsertU32,
- kInsertU64,
- kExtractU8,
- kExtractU16,
- kExtractU32,
- kExtractU64,
- kCvtIntToF32,
- kCvtIntToF64,
- kCvtTruncF32ToInt,
- kCvtRoundF32ToInt,
- kCvtTruncF64ToInt,
- kCvtRoundF64ToInt,
+ kMov, //!< N-bit move into a vector register (the size depends on source register width).
+ kMovU32, //!< 32-bit move into a vector register.
+ kMovU64, //!< 64-bit move into a vector register.
+ kInsertU8, //!< 8-bit insertion into a vector register.
+ kInsertU16, //!< 16-bit insertion into a vector register.
+ kInsertU32, //!< 32-bit insertion into a vector register.
+ kInsertU64, //!< 64-bit insertion into a vector register.
+ kExtractU8, //!< 8-bit extraction from a vector register.
+ kExtractU16, //!< 16-bit extraction from a vector register.
+ kExtractU32, //!< 32-bit extraction from a vector register.
+ kExtractU64, //!< 64-bit extraction from a vector register.
+ kCvtIntToF32, //!< Int to float32 conversion.
+ kCvtIntToF64, //!< Int to float64 conversion.
+ kCvtTruncF32ToInt, //!< Float32 to int conversion with truncation semantics.
+ kCvtRoundF32ToInt, //!< Float32 to int conversion with round-to-even semantics.
+ kCvtTruncF64ToInt, //!< Float64 to int conversion with truncation semantics.
+ kCvtRoundF64ToInt, //!< Float64 to int conversion with round-to-even semantics.
kMaxValue = kCvtRoundF64ToInt
};
+//! Instruction with `[vec, mem]` operands.
+//!
+//! Describes load, convert, and insert instructions.
enum class UniOpVM : uint32_t {
- kLoad8,
- kLoad16_U16,
- kLoad32_U32,
- kLoad32_F32,
+ kLoad8, //!< 8-bit load into a vector register (the rest is cleared).
+ kLoad16_U16, //!< 16-bit load into a vector register (the rest is cleared).
+ kLoad32_U32, //!< 32-bit load (int) into a vector register (the rest is cleared).
+ kLoad32_F32, //!< 32-bit load (f32) into a vector register (the rest is cleared).
- kLoad64_U32,
- kLoad64_U64,
- kLoad64_F32,
- kLoad64_F64,
+ kLoad64_U32, //!< 64-bit load (int) into a vector register (the rest is cleared).
+ kLoad64_U64, //!< 64-bit load (int) into a vector register (the rest is cleared).
+ kLoad64_F32, //!< 64-bit load (f32) into a vector register (the rest is cleared).
+ kLoad64_F64, //!< 64-bit load (f64) into a vector register (the rest is cleared).
- kLoad128_U32,
- kLoad128_U64,
- kLoad128_F32,
- kLoad128_F64,
+ kLoad128_U32, //!< 128-bit load (int) into a vector register (the rest is cleared).
+ kLoad128_U64, //!< 128-bit load (int) into a vector register (the rest is cleared).
+ kLoad128_F32, //!< 128-bit load (f32) into a vector register (the rest is cleared).
+ kLoad128_F64, //!< 128-bit load (f64) into a vector register (the rest is cleared).
- kLoad256_U32,
- kLoad256_U64,
- kLoad256_F32,
- kLoad256_F64,
+ kLoad256_U32, //!< 256-bit load (int) into a vector register (the rest is cleared).
+ kLoad256_U64, //!< 256-bit load (int) into a vector register (the rest is cleared).
+ kLoad256_F32, //!< 256-bit load (f32) into a vector register (the rest is cleared).
+ kLoad256_F64, //!< 256-bit load (f64) into a vector register (the rest is cleared).
- kLoad512_U32,
- kLoad512_U64,
- kLoad512_F32,
- kLoad512_F64,
+ kLoad512_U32, //!< 512-bit load (int) into a vector register (the rest is cleared).
+ kLoad512_U64, //!< 512-bit load (int) into a vector register (the rest is cleared). + kLoad512_F32, //!< 512-bit load (f32) into a vector register (the rest is cleared). + kLoad512_F64, //!< 512-bit load (f64) into a vector register (the rest is cleared). - kLoadN_U32, - kLoadN_U64, - kLoadN_F32, - kLoadN_F64, + kLoadN_U32, //!< N-bit load (int) into a vector register (the size depends on the vector width). + kLoadN_U64, //!< N-bit load (int) into a vector register (the size depends on the vector width). + kLoadN_F32, //!< N-bit load (f32) into a vector register (the size depends on the vector width). + kLoadN_F64, //!< N-bit load (f64) into a vector register (the size depends on the vector width). - kLoadCvt16_U8ToU64, - kLoadCvt32_U8ToU64, - kLoadCvt64_U8ToU64, + kLoadCvt16_U8ToU64, //!< 16-bit load into a vector register with 8-bit to 64-bit zero extension (128-bit result). + kLoadCvt32_U8ToU64, //!< 32-bit load into a vector register with 8-bit to 64-bit zero extension (256-bit result). + kLoadCvt64_U8ToU64, //!< 64-bit load into a vector register with 8-bit to 64-bit zero extension (512-bit result). - kLoadCvt32_I8ToI16, - kLoadCvt32_U8ToU16, - kLoadCvt32_I8ToI32, - kLoadCvt32_U8ToU32, - kLoadCvt32_I16ToI32, - kLoadCvt32_U16ToU32, - kLoadCvt32_I32ToI64, - kLoadCvt32_U32ToU64, + kLoadCvt32_I8ToI16, //!< 32-bit load into a vector register with 8-bit to 16-bit sign extension (64-bit result). + kLoadCvt32_U8ToU16, //!< 32-bit load into a vector register with 8-bit to 16-bit zero extension (64-bit result). + kLoadCvt32_I8ToI32, //!< 32-bit load into a vector register with 8-bit to 32-bit sign extension (128-bit result). + kLoadCvt32_U8ToU32, //!< 32-bit load into a vector register with 8-bit to 32-bit zero extension (128-bit result). + kLoadCvt32_I16ToI32, //!< 32-bit load into a vector register with 16-bit to 32-bit sign extension (64-bit result). + kLoadCvt32_U16ToU32, //!< 32-bit load into a vector register with 16-bit to 32-bit zero extension (64-bit result). + kLoadCvt32_I32ToI64, //!< 32-bit load into a vector register with 32-bit to 64-bit sign extension (64-bit result). + kLoadCvt32_U32ToU64, //!< 32-bit load into a vector register with 32-bit to 64-bit zero extension (64-bit result). - kLoadCvt64_I8ToI16, - kLoadCvt64_U8ToU16, - kLoadCvt64_I8ToI32, - kLoadCvt64_U8ToU32, - kLoadCvt64_I16ToI32, - kLoadCvt64_U16ToU32, - kLoadCvt64_I32ToI64, - kLoadCvt64_U32ToU64, + kLoadCvt64_I8ToI16, //!< 64-bit load into a vector register with 8-bit to 16-bit sign extension (128-bit result). + kLoadCvt64_U8ToU16, //!< 64-bit load into a vector register with 8-bit to 16-bit zero extension (128-bit result). + kLoadCvt64_I8ToI32, //!< 64-bit load into a vector register with 8-bit to 32-bit sign extension (256-bit result). + kLoadCvt64_U8ToU32, //!< 64-bit load into a vector register with 8-bit to 32-bit zero extension (256-bit result). + kLoadCvt64_I16ToI32, //!< 64-bit load into a vector register with 16-bit to 32-bit sign extension (128-bit result). + kLoadCvt64_U16ToU32, //!< 64-bit load into a vector register with 16-bit to 32-bit zero extension (128-bit result). + kLoadCvt64_I32ToI64, //!< 64-bit load into a vector register with 32-bit to 64-bit sign extension (128-bit result). + kLoadCvt64_U32ToU64, //!< 64-bit load into a vector register with 32-bit to 64-bit zero extension (128-bit result). 
- kLoadCvt128_I8ToI16, - kLoadCvt128_U8ToU16, - kLoadCvt128_I8ToI32, - kLoadCvt128_U8ToU32, - kLoadCvt128_I16ToI32, - kLoadCvt128_U16ToU32, - kLoadCvt128_I32ToI64, - kLoadCvt128_U32ToU64, + kLoadCvt128_I8ToI16, //!< 128-bit load into a vector register with 8-bit to 16-bit sign extension (256-bit result). + kLoadCvt128_U8ToU16, //!< 128-bit load into a vector register with 8-bit to 16-bit zero extension (256-bit result). + kLoadCvt128_I8ToI32, //!< 128-bit load into a vector register with 8-bit to 32-bit sign extension (512-bit result). + kLoadCvt128_U8ToU32, //!< 128-bit load into a vector register with 8-bit to 32-bit zero extension (512-bit result). + kLoadCvt128_I16ToI32, //!< 128-bit load into a vector register with 16-bit to 32-bit sign extension (256-bit result). + kLoadCvt128_U16ToU32, //!< 128-bit load into a vector register with 16-bit to 32-bit zero extension (256-bit result). + kLoadCvt128_I32ToI64, //!< 128-bit load into a vector register with 32-bit to 64-bit sign extension (256-bit result). + kLoadCvt128_U32ToU64, //!< 128-bit load into a vector register with 32-bit to 64-bit zero extension (256-bit result). - kLoadCvt256_I8ToI16, - kLoadCvt256_U8ToU16, - kLoadCvt256_I16ToI32, - kLoadCvt256_U16ToU32, - kLoadCvt256_I32ToI64, - kLoadCvt256_U32ToU64, + kLoadCvt256_I8ToI16, //!< 256-bit load into a vector register with 8-bit to 16-bit sign extension (512-bit result). + kLoadCvt256_U8ToU16, //!< 256-bit load into a vector register with 8-bit to 16-bit zero extension (512-bit result). + kLoadCvt256_I16ToI32, //!< 256-bit load into a vector register with 16-bit to 32-bit sign extension (512-bit result). + kLoadCvt256_U16ToU32, //!< 256-bit load into a vector register with 16-bit to 32-bit zero extension (512-bit result). + kLoadCvt256_I32ToI64, //!< 256-bit load into a vector register with 32-bit to 64-bit sign extension (512-bit result). + kLoadCvt256_U32ToU64, //!< 256-bit load into a vector register with 32-bit to 64-bit zero extension (512-bit result). - kLoadCvtN_U8ToU64, + kLoadCvtN_U8ToU64, //!< N-bit load with 8-bit to 64-bit zero extension (the size depends on the vector width). - kLoadCvtN_I8ToI16, - kLoadCvtN_U8ToU16, - kLoadCvtN_I8ToI32, - kLoadCvtN_U8ToU32, - kLoadCvtN_I16ToI32, - kLoadCvtN_U16ToU32, - kLoadCvtN_I32ToI64, - kLoadCvtN_U32ToU64, + kLoadCvtN_I8ToI16, //!< N-bit load with 8-bit to 16-bit sign extension (the size depends on the vector width). + kLoadCvtN_U8ToU16, //!< N-bit load with 8-bit to 16-bit zero extension (the size depends on the vector width). + kLoadCvtN_I8ToI32, //!< N-bit load with 8-bit to 32-bit sign extension (the size depends on the vector width). + kLoadCvtN_U8ToU32, //!< N-bit load with 8-bit to 32-bit zero extension (the size depends on the vector width). + kLoadCvtN_I16ToI32, //!< N-bit load with 16-bit to 32-bit sign extension (the size depends on the vector width). + kLoadCvtN_U16ToU32, //!< N-bit load with 16-bit to 32-bit zero extension (the size depends on the vector width). + kLoadCvtN_I32ToI64, //!< N-bit load with 32-bit to 64-bit sign extension (the size depends on the vector width). + kLoadCvtN_U32ToU64, //!< N-bit load with 32-bit to 64-bit zero extension (the size depends on the vector width). - kLoadInsertU8, - kLoadInsertU16, - kLoadInsertU32, - kLoadInsertU64, - kLoadInsertF32, - kLoadInsertF32x2, - kLoadInsertF64, + kLoadInsertU8, //!< 8-bit insert (int) into a vector register from memory. + kLoadInsertU16, //!< 16-bit insert (int) into a vector register from memory. 
+ kLoadInsertU32, //!< 32-bit insert (int) into a vector register from memory.
+ kLoadInsertU64, //!< 64-bit insert (int) into a vector register from memory.
+ kLoadInsertF32, //!< 32-bit insert (f32) into a vector register from memory.
+ kLoadInsertF32x2, //!< 64-bit insert (f32x2) into a vector register from memory.
+ kLoadInsertF64, //!< 64-bit insert (f64) into a vector register from memory.
kMaxValue = kLoadInsertF64
};
+//! Instruction with `[mem, vec]` operands.
+//!
+//! Describes store and extract instructions.
enum class UniOpMV : uint32_t {
- kStore8,
- kStore16_U16,
- kStore32_U32,
- kStore32_F32,
+ kStore8, //!< 8-bit store (int) of a vector register.
+ kStore16_U16, //!< 16-bit store (int) of a vector register.
+ kStore32_U32, //!< 32-bit store (int) of a vector register.
+ kStore32_F32, //!< 32-bit store (f32) of a vector register.
- kStore64_U32,
- kStore64_U64,
- kStore64_F32,
- kStore64_F64,
+ kStore64_U32, //!< 64-bit store (int) of a vector register.
+ kStore64_U64, //!< 64-bit store (int) of a vector register.
+ kStore64_F32, //!< 64-bit store (f32) of a vector register.
+ kStore64_F64, //!< 64-bit store (f64) of a vector register.
- kStore128_U32,
- kStore128_U64,
- kStore128_F32,
- kStore128_F64,
+ kStore128_U32, //!< 128-bit store (int) of a vector register.
+ kStore128_U64, //!< 128-bit store (int) of a vector register.
+ kStore128_F32, //!< 128-bit store (f32) of a vector register.
+ kStore128_F64, //!< 128-bit store (f64) of a vector register.
- kStore256_U32,
- kStore256_U64,
- kStore256_F32,
- kStore256_F64,
+ kStore256_U32, //!< 256-bit store (int) of a vector register.
+ kStore256_U64, //!< 256-bit store (int) of a vector register.
+ kStore256_F32, //!< 256-bit store (f32) of a vector register.
+ kStore256_F64, //!< 256-bit store (f64) of a vector register.
- kStore512_U32,
- kStore512_U64,
- kStore512_F32,
- kStore512_F64,
+ kStore512_U32, //!< 512-bit store (int) of a vector register.
+ kStore512_U64, //!< 512-bit store (int) of a vector register.
+ kStore512_F32, //!< 512-bit store (f32) of a vector register.
+ kStore512_F64, //!< 512-bit store (f64) of a vector register.
- kStoreN_U32,
- kStoreN_U64,
- kStoreN_F32,
- kStoreN_F64,
+ kStoreN_U32, //!< N-bit store (int) of a vector register (the size depends on the vector width).
+ kStoreN_U64, //!< N-bit store (int) of a vector register (the size depends on the vector width).
+ kStoreN_F32, //!< N-bit store (f32) of a vector register (the size depends on the vector width).
+ kStoreN_F64, //!< N-bit store (f64) of a vector register (the size depends on the vector width).
- kStoreExtractU16,
- kStoreExtractU32,
- kStoreExtractU64,
+ kStoreExtractU16, //!< 16-bit extract from lane and store.
+ kStoreExtractU32, //!< 32-bit extract from lane and store.
+ kStoreExtractU64, //!< 64-bit extract from lane and store.
/*
kStoreCvtz64_U16ToU8,
@@ -300,86 +323,105 @@ enum class UniOpMV : uint32_t {
kMaxValue = kStoreExtractU64
};
+//! Instruction with `[vec, vec]` operands.
+//!
+//! Describes vector arithmetic that has one destination and one source.
+//!
+//! \note For convenience, the second operand can be register, memory, or immediate value.
enum class UniOpVV : uint32_t {
- kMov,
- kMovU64,
+ kMov, //!< Vector move.
+ kMovU64, //!< Vector move of the low 64-bit data, the rest is set to zero.
- kBroadcastU8Z, - kBroadcastU16Z, - kBroadcastU8, - kBroadcastU16, - kBroadcastU32, - kBroadcastU64, - kBroadcastF32, - kBroadcastF64, - kBroadcastV128_U32, - kBroadcastV128_U64, - kBroadcastV128_F32, - kBroadcastV128_F64, - kBroadcastV256_U32, - kBroadcastV256_U64, - kBroadcastV256_F32, - kBroadcastV256_F64, + kBroadcastU8Z, //!< Vector u8 broadcast with an assumption that the rest of the source vector is zero. + kBroadcastU16Z, //!< Vector u16 broadcast with an assumption that the rest of the source vector is zero. + kBroadcastU8, //!< Vector u8 broadcast to all lanes. + kBroadcastU16, //!< Vector u16 broadcast to all lanes. + kBroadcastU32, //!< Vector u32 broadcast to all lanes. + kBroadcastU64, //!< Vector u64 broadcast to all lanes. + kBroadcastF32, //!< Vector f32 broadcast to all lanes. + kBroadcastF64, //!< Vector f64 broadcast to all lanes. + kBroadcastV128_U32, //!< Vector broadcast of 128-bit lanes. + kBroadcastV128_U64, //!< Vector broadcast of 128-bit lanes. + kBroadcastV128_F32, //!< Vector broadcast of 128-bit lanes. + kBroadcastV128_F64, //!< Vector broadcast of 128-bit lanes. + kBroadcastV256_U32, //!< Vector broadcast of 256-bit lanes. + kBroadcastV256_U64, //!< Vector broadcast of 256-bit lanes. + kBroadcastV256_F32, //!< Vector broadcast of 256-bit lanes. + kBroadcastV256_F64, //!< Vector broadcast of 256-bit lanes. - kAbsI8, - kAbsI16, - kAbsI32, - kAbsI64, + kAbsI8, //!< Vector i8 absolute value - `dst = abs(src)`. + kAbsI16, //!< Vector i16 absolute value - `dst = abs(src)`. + kAbsI32, //!< Vector i32 absolute value - `dst = abs(src)`. + kAbsI64, //!< Vector i64 absolute value - `dst = abs(src)`. - kNotU32, - kNotU64, + kNotU32, //!< Vector u32 bitwise NOT - `dst = ~src`. + kNotU64, //!< Vector u64 bitwise NOT - `dst = ~src`. - kCvtI8LoToI16, - kCvtI8HiToI16, - kCvtU8LoToU16, - kCvtU8HiToU16, - kCvtI8ToI32, - kCvtU8ToU32, - kCvtI16LoToI32, - kCvtI16HiToI32, - kCvtU16LoToU32, - kCvtU16HiToU32, - kCvtI32LoToI64, - kCvtI32HiToI64, - kCvtU32LoToU64, - kCvtU32HiToU64, + kCvtI8LoToI16, //!< Vector sign extend low i8 to i16. + kCvtI8HiToI16, //!< Vector sign extend high i8 to i16. + kCvtU8LoToU16, //!< Vector zero extend low u8 to u16. + kCvtU8HiToU16, //!< Vector zero extend high u8 to u16. + kCvtI8ToI32, //!< Vector zero extend low i8 to i32. + kCvtU8ToU32, //!< Vector zero extend high u8 to u32. + kCvtI16LoToI32, //!< Vector sign extend low i16 to i32. + kCvtI16HiToI32, //!< Vector sign extend high i16 to i32. + kCvtU16LoToU32, //!< Vector zero extend low u16 to u32. + kCvtU16HiToU32, //!< Vector zero extend high u16 to u32. + kCvtI32LoToI64, //!< Vector sign extend low i32 to i64. + kCvtI32HiToI64, //!< Vector sign extend high i32 to i64. + kCvtU32LoToU64, //!< Vector zero extend low u32 to u64. + kCvtU32HiToU64, //!< Vector zero extend high u32 to u64. - kAbsF32, - kAbsF64, + kAbsF32S, //!< Scalar f32 absolute value. + kAbsF64S, //!< Scalar f64 absolute value. + kAbsF32, //!< Vector f32 absolute value. + kAbsF64, //!< Vector f64 absolute value. - kNegF32, - kNegF64, + kNegF32S, //!< Scalar f32 negate. + kNegF64S, //!< Scalar f64 negate. + kNegF32, //!< Vector f32 negate. + kNegF64, //!< Vector f64 negate. - kNotF32, - kNotF64, + kNotF32, //!< Vector f32 bitwise NOT. + kNotF64, //!< Vector f64 bitwise NOT. - kTruncF32S, - kTruncF64S, - kTruncF32, - kTruncF64, + kTruncF32S, //!< Scalar f32 truncate. + kTruncF64S, //!< Scalar f64 truncate. + kTruncF32, //!< Vector f32 truncate. + kTruncF64, //!< Vector f64 truncate. 
- kFloorF32S, - kFloorF64S, - kFloorF32, - kFloorF64, + kFloorF32S, //!< Scalar f32 floor. + kFloorF64S, //!< Scalar f64 floor. + kFloorF32, //!< Vector f32 floor. + kFloorF64, //!< Vector f64 floor. - kCeilF32S, - kCeilF64S, - kCeilF32, - kCeilF64, + kCeilF32S, //!< Scalar f32 ceil. + kCeilF64S, //!< Scalar f64 ceil. + kCeilF32, //!< Vector f32 ceil. + kCeilF64, //!< Vector f64 ceil. - kRoundF32S, - kRoundF64S, - kRoundF32, - kRoundF64, + kRoundEvenF32S, //!< Scalar f32 round-even. + kRoundEvenF64S, //!< Scalar f64 round-even. + kRoundEvenF32, //!< Vector f32 round-even. + kRoundEvenF64, //!< Vector f64 round-even. - kRcpF32, - kRcpF64, + kRoundHalfAwayF32S, //!< Scalar f32 round-half-away (0.5 and greater fraction rounds away from zero). + kRoundHalfAwayF64S, //!< Scalar f64 round-half-away (0.5 and greater fraction rounds away from zero). + kRoundHalfAwayF32, //!< Vector f32 round-half-away (0.5 and greater fraction rounds away from zero). + kRoundHalfAwayF64, //!< Vector f64 round-half-away (0.5 and greater fraction rounds away from zero). - kSqrtF32S, - kSqrtF64S, - kSqrtF32, - kSqrtF64, + kRoundHalfUpF32S, //!< Scalar f32 round-half-up (0.5 and greater fraction rounds up). + kRoundHalfUpF64S, //!< Scalar f64 round-half-up (0.5 and greater fraction rounds up). + kRoundHalfUpF32, //!< Vector f32 round-half-up (0.5 and greater fraction rounds up). + kRoundHalfUpF64, //!< Vector f64 round-half-up (0.5 and greater fraction rounds up). + + kRcpF32, //!< Vector f32 reciprocal - `dst = 1.0 / src`. + kRcpF64, //!< Vector f64 reciprocal - `dst = 1.0 / src`. + + kSqrtF32S, //!< Scalar f32 square root. + kSqrtF64S, //!< Scalar f64 square root. + kSqrtF32, //!< Vector f32 square root. + kSqrtF64, //!< Vector f64 square root. kCvtF32ToF64S, kCvtF64ToF32S, @@ -400,35 +442,40 @@ enum class UniOpVV : uint32_t { kMaxValue = kCvtRoundF64ToI32Hi }; +//! Instruction with `[vec, vec, imm]` operands. +//! +//! Describes vector arithmetic that has one destination, one source, and one immediate. +//! +//! \note For convenience, the second operand can be register, memory, or immediate value. enum class UniOpVVI : uint32_t { - kSllU16, - kSllU32, - kSllU64, - kSrlU16, - kSrlU32, - kSrlU64, - kSraI16, - kSraI32, - kSraI64, - kSllbU128, - kSrlbU128, - kSwizzleU16x4, - kSwizzleLoU16x4, - kSwizzleHiU16x4, - kSwizzleU32x4, - kSwizzleU64x2, - kSwizzleF32x4, - kSwizzleF64x2, - kSwizzleU64x4, - kSwizzleF64x4, - kExtractV128_I32, - kExtractV128_I64, - kExtractV128_F32, - kExtractV128_F64, - kExtractV256_I32, - kExtractV256_I64, - kExtractV256_F32, - kExtractV256_F64, + kSllU16, //!< Vector u16 shift left logical. + kSllU32, //!< Vector u32 shift left logical. + kSllU64, //!< Vector u64 shift left logical. + kSrlU16, //!< Vector u16 shift right logical. + kSrlU32, //!< Vector u32 shift right logical. + kSrlU64, //!< Vector u64 shift right logical. + kSraI16, //!< Vector u16 shift right arithmetic. + kSraI32, //!< Vector u32 shift right arithmetic. + kSraI64, //!< Vector u64 shift right arithmetic. + kSllbU128, //!< Vector shift bytes (128-bit lanes). + kSrlbU128, //!< Vector shift bytes (128-bit lanes). + kSwizzleU16x4, //!< Vector swizzle u16x4 (128-bit lanes). + kSwizzleLoU16x4, //!< Vector swizzle u16x4 (low 64-bit lanes). + kSwizzleHiU16x4, //!< Vector swizzle u16x4 (high 64-bit lanes) + kSwizzleU32x4, //!< Vector swizzle u32x4 (128-bit lanes). + kSwizzleU64x2, //!< Vector swizzle u64x2 (128-bit lanes). + kSwizzleF32x4, //!< Vector swizzle f32x4 (128-bit lanes). 
+ kSwizzleF64x2, //!< Vector swizzle f64x2 (128-bit lanes). + kSwizzleU64x4, //!< Vector swizzle u64x4 (256-bit lanes). + kSwizzleF64x4, //!< Vector swizzle f64x4 (256-bit lanes). + kExtractV128_I32, //!< Vector extract 128-bit lane from 256-bit or 512-bit vector. + kExtractV128_I64, //!< Vector extract 128-bit lane from 256-bit or 512-bit vector. + kExtractV128_F32, //!< Vector extract 128-bit lane from 256-bit or 512-bit vector. + kExtractV128_F64, //!< Vector extract 128-bit lane from 256-bit or 512-bit vector. + kExtractV256_I32, //!< Vector extract 256-bit lane from 512-bit vector. + kExtractV256_I64, //!< Vector extract 256-bit lane from 512-bit vector. + kExtractV256_F32, //!< Vector extract 256-bit lane from 512-bit vector. + kExtractV256_F64, //!< Vector extract 256-bit lane from 512-bit vector. #if defined(ASMJIT_UJIT_AARCH64) kSrlRndU16, @@ -466,188 +513,197 @@ enum class UniOpVVI : uint32_t { #endif // ASMJIT_UJIT_AARCH64 }; +//! Instruction with `[vec, vec, vec]` operands. +//! +//! Describes vector arithmetic that has one destination and two sources. +//! +//! \note For convenience, the third operand can be register, memory, or immediate value. enum class UniOpVVV : uint32_t { - kAndU32, - kAndU64, - kOrU32, - kOrU64, - kXorU32, - kXorU64, - kAndnU32, - kAndnU64, - kBicU32, - kBicU64, - kAvgrU8, - kAvgrU16, - kAddU8, - kAddU16, - kAddU32, - kAddU64, - kSubU8, - kSubU16, - kSubU32, - kSubU64, - kAddsI8, - kAddsU8, - kAddsI16, - kAddsU16, - kSubsI8, - kSubsU8, - kSubsI16, - kSubsU16, - kMulU16, - kMulU32, - kMulU64, - kMulhI16, - kMulhU16, - kMulU64_LoU32, - kMHAddI16_I32, - kMinI8, - kMinU8, - kMinI16, - kMinU16, - kMinI32, - kMinU32, - kMinI64, - kMinU64, - kMaxI8, - kMaxU8, - kMaxI16, - kMaxU16, - kMaxI32, - kMaxU32, - kMaxI64, - kMaxU64, - kCmpEqU8, - kCmpEqU16, - kCmpEqU32, - kCmpEqU64, - kCmpGtI8, - kCmpGtU8, - kCmpGtI16, - kCmpGtU16, - kCmpGtI32, - kCmpGtU32, - kCmpGtI64, - kCmpGtU64, - kCmpGeI8, - kCmpGeU8, - kCmpGeI16, - kCmpGeU16, - kCmpGeI32, - kCmpGeU32, - kCmpGeI64, - kCmpGeU64, - kCmpLtI8, - kCmpLtU8, - kCmpLtI16, - kCmpLtU16, - kCmpLtI32, - kCmpLtU32, - kCmpLtI64, - kCmpLtU64, - kCmpLeI8, - kCmpLeU8, - kCmpLeI16, - kCmpLeU16, - kCmpLeI32, - kCmpLeU32, - kCmpLeI64, - kCmpLeU64, + kAndU32, //!< Vector u32 bitwise AND - `dst = src1 & src2`. + kAndU64, //!< Vector u64 bitwise AND - `dst = src1 & src2`. + kOrU32, //!< Vector u32 bitwise OR - `dst = src1 | src2`. + kOrU64, //!< Vector u64 bitwise OR - `dst = src1 | src2`. + kXorU32, //!< Vector u32 bitwise XOR - `dst = src1 ^ src2`. + kXorU64, //!< Vector u64 bitwise XOR - `dst = src1 ^ src2`. + kAndnU32, //!< Vector u32 bitwise ANDN - `dst = ~src1 & src2`. + kAndnU64, //!< Vector u64 bitwise ANDN - `dst = ~src1 & src2`. + kBicU32, //!< Vector u32 bitwise BIC - `dst = src1 & ~src2`. + kBicU64, //!< Vector u64 bitwise BIC - `dst = src1 & ~src2`. + kAvgrU8, //!< Vector u8 average rounded half up `dst = (src1 + src2 + 1) >> 1`. + kAvgrU16, //!< Vector u16 average rounded half up `dst = (src1 + src2 + 1) >> 1`. + kAddU8, //!< Vector u8 add. + kAddU16, //!< Vector u16 add. + kAddU32, //!< Vector u32 add. + kAddU64, //!< Vector u64 add. + kSubU8, //!< Vector u8 sub. + kSubU16, //!< Vector u16 sub. + kSubU32, //!< Vector u32 sub. + kSubU64, //!< Vector u64 sub. + kAddsI8, //!< Vector i8 add with saturation (signed). + kAddsU8, //!< Vector u8 add with saturation (unsigned). + kAddsI16, //!< Vector i16 add with saturation (signed). + kAddsU16, //!< Vector u16 add with saturation (unsigned). 
+ kSubsI8, //!< Vector i8 sub with saturation (signed). + kSubsU8, //!< Vector u8 sub with saturation (unsigned). + kSubsI16, //!< Vector i16 sub with saturation (signed). + kSubsU16, //!< Vector u16 sub with saturation (unsigned). + kMulU16, //!< Vector u16 multiply. + kMulU32, //!< Vector u32 multiply. + kMulU64, //!< Vector u64 multiply. + kMulhI16, //!< Vector i16 multiply high - `dst = (src1 * src2) >> 16`. + kMulhU16, //!< Vector u16 multiply high - `dst = (src1 * src2) >> 16`. + kMulU64_LoU32, //!< Vector u64xu32 multiply. + kMHAddI16_I32, //!< Vector i16 multiply with horizontal widening add to form a 32-bit result. + kMinI8, //!< Vector i8 minimum. + kMinU8, //!< Vector u8 minimum. + kMinI16, //!< Vector i16 minimum. + kMinU16, //!< Vector u16 minimum. + kMinI32, //!< Vector i32 minimum. + kMinU32, //!< Vector u32 minimum. + kMinI64, //!< Vector i64 minimum. + kMinU64, //!< Vector u64 minimum. + kMaxI8, //!< Vector i8 maximum. + kMaxU8, //!< Vector u8 maximum. + kMaxI16, //!< Vector i16 maximum. + kMaxU16, //!< Vector u16 maximum. + kMaxI32, //!< Vector i32 maximum. + kMaxU32, //!< Vector u32 maximum. + kMaxI64, //!< Vector i64 maximum. + kMaxU64, //!< Vector u64 maximum. + kCmpEqU8, //!< Vector u8 compare equal. + kCmpEqU16, //!< Vector u16 compare equal. + kCmpEqU32, //!< Vector u32 compare equal. + kCmpEqU64, //!< Vector u64 compare equal. + kCmpGtI8, //!< Vector i8 compare greater-than. + kCmpGtU8, //!< Vector u8 compare greater-than. + kCmpGtI16, //!< Vector i16 compare greater-than. + kCmpGtU16, //!< Vector u16 compare greater-than. + kCmpGtI32, //!< Vector i32 compare greater-than. + kCmpGtU32, //!< Vector u32 compare greater-than. + kCmpGtI64, //!< Vector i64 compare greater-than. + kCmpGtU64, //!< Vector u64 compare greater-than. + kCmpGeI8, //!< Vector i8 compare greater-or-equal. + kCmpGeU8, //!< Vector u8 compare greater-or-equal. + kCmpGeI16, //!< Vector i16 compare greater-or-equal. + kCmpGeU16, //!< Vector u16 compare greater-or-equal. + kCmpGeI32, //!< Vector i32 compare greater-or-equal. + kCmpGeU32, //!< Vector u32 compare greater-or-equal. + kCmpGeI64, //!< Vector i64 compare greater-or-equal. + kCmpGeU64, //!< Vector u64 compare greater-or-equal. + kCmpLtI8, //!< Vector i8 compare lesser-than. + kCmpLtU8, //!< Vector u8 compare lesser-than. + kCmpLtI16, //!< Vector i16 compare lesser-than. + kCmpLtU16, //!< Vector u16 compare lesser-than. + kCmpLtI32, //!< Vector i32 compare lesser-than. + kCmpLtU32, //!< Vector u32 compare lesser-than. + kCmpLtI64, //!< Vector i64 compare lesser-than. + kCmpLtU64, //!< Vector u64 compare lesser-than. + kCmpLeI8, //!< Vector i8 compare lesser-or-equal. + kCmpLeU8, //!< Vector u8 compare lesser-or-equal. + kCmpLeI16, //!< Vector i16 compare lesser-or-equal. + kCmpLeU16, //!< Vector u16 compare lesser-or-equal. + kCmpLeI32, //!< Vector i32 compare lesser-or-equal. + kCmpLeU32, //!< Vector u32 compare lesser-or-equal. + kCmpLeI64, //!< Vector i64 compare lesser-or-equal. + kCmpLeU64, //!< Vector u64 compare lesser-or-equal. 
- kAndF32, - kAndF64, - kOrF32, - kOrF64, - kXorF32, - kXorF64, - kAndnF32, - kAndnF64, - kBicF32, - kBicF64, - kAddF32S, - kAddF64S, - kAddF32, - kAddF64, - kSubF32S, - kSubF64S, - kSubF32, - kSubF64, - kMulF32S, - kMulF64S, - kMulF32, - kMulF64, - kDivF32S, - kDivF64S, - kDivF32, - kDivF64, - kMinF32S, - kMinF64S, - kMinF32, - kMinF64, - kMaxF32S, - kMaxF64S, - kMaxF32, - kMaxF64, - kCmpEqF32S, - kCmpEqF64S, - kCmpEqF32, - kCmpEqF64, - kCmpNeF32S, - kCmpNeF64S, - kCmpNeF32, - kCmpNeF64, - kCmpGtF32S, - kCmpGtF64S, - kCmpGtF32, - kCmpGtF64, - kCmpGeF32S, - kCmpGeF64S, - kCmpGeF32, - kCmpGeF64, - kCmpLtF32S, - kCmpLtF64S, - kCmpLtF32, - kCmpLtF64, - kCmpLeF32S, - kCmpLeF64S, - kCmpLeF32, - kCmpLeF64, - kCmpOrdF32S, - kCmpOrdF64S, - kCmpOrdF32, - kCmpOrdF64, - kCmpUnordF32S, - kCmpUnordF64S, - kCmpUnordF32, - kCmpUnordF64, + kAndF32, //!< Vector f32 bitwise AND - `dst = src1 & src2`. + kAndF64, //!< Vector f64 bitwise AND - `dst = src1 & src2`. + kOrF32, //!< Vector f32 bitwise OR - `dst = src1 | src2`. + kOrF64, //!< Vector f64 bitwise OR - `dst = src1 | src2`. + kXorF32, //!< Vector f32 bitwise XOR - `dst = src1 ^ src2`. + kXorF64, //!< Vector f64 bitwise XOR - `dst = src1 ^ src2`. + kAndnF32, //!< Vector f32 bitwise ANDN - `dst = ~src1 & src2`. + kAndnF64, //!< Vector f64 bitwise ANDN - `dst = ~src1 & src2`. + kBicF32, //!< Vector f32 bitwise BIC - `dst = src1 & ~src2`. + kBicF64, //!< Vector f64 bitwise BIC - `dst = src1 & ~src2`. + kAddF32S, //!< Scalar f32 add. + kAddF64S, //!< Scalar f64 add. + kAddF32, //!< Vector f32 add. + kAddF64, //!< Vector f64 add. + kSubF32S, //!< Scalar f32 sub. + kSubF64S, //!< Scalar f64 sub. + kSubF32, //!< Vector f32 sub. + kSubF64, //!< Vector f64 sub. + kMulF32S, //!< Scalar f32 mul. + kMulF64S, //!< Scalar f64 mul. + kMulF32, //!< Vector f32 mul. + kMulF64, //!< Vector f64 mul. + kDivF32S, //!< Scalar f32 div. + kDivF64S, //!< Scalar f64 div. + kDivF32, //!< Vector f32 div. + kDivF64, //!< Vector f64 div. + kModF32S, //!< Scalar f32 modulo. + kModF64S, //!< Scalar f64 modulo. + kModF32, //!< Vector f32 modulo. + kModF64, //!< Vector f64 modulo. + kMinF32S, //!< Scalar f32 minimum. + kMinF64S, //!< Scalar f64 minimum. + kMinF32, //!< Vector f32 minimum. + kMinF64, //!< Vector f64 minimum. + kMaxF32S, //!< Scalar f32 maximum. + kMaxF64S, //!< Scalar f64 maximum. + kMaxF32, //!< Vector f32 maximum. + kMaxF64, //!< Vector f64 maximum. + kCmpEqF32S, //!< Scalar f32 compare equal (ordered). + kCmpEqF64S, //!< Scalar f64 compare equal (ordered). + kCmpEqF32, //!< Vector f32 compare equal (ordered). + kCmpEqF64, //!< Vector f64 compare equal (ordered). 
+ kCmpNeF32S, //!< Scalar f32 compare not-equal (ordered).
+ kCmpNeF64S, //!< Scalar f64 compare not-equal (ordered).
+ kCmpNeF32, //!< Vector f32 compare not-equal (ordered).
+ kCmpNeF64, //!< Vector f64 compare not-equal (ordered).
+ kCmpGtF32S, //!< Scalar f32 compare greater-than (ordered).
+ kCmpGtF64S, //!< Scalar f64 compare greater-than (ordered).
+ kCmpGtF32, //!< Vector f32 compare greater-than (ordered).
+ kCmpGtF64, //!< Vector f64 compare greater-than (ordered).
+ kCmpGeF32S, //!< Scalar f32 compare greater-or-equal (ordered).
+ kCmpGeF64S, //!< Scalar f64 compare greater-or-equal (ordered).
+ kCmpGeF32, //!< Vector f32 compare greater-or-equal (ordered).
+ kCmpGeF64, //!< Vector f64 compare greater-or-equal (ordered).
+ kCmpLtF32S, //!< Scalar f32 compare lesser-than (ordered).
+ kCmpLtF64S, //!< Scalar f64 compare lesser-than (ordered).
+ kCmpLtF32, //!< Vector f32 compare lesser-than (ordered).
+ kCmpLtF64, //!< Vector f64 compare lesser-than (ordered).
+ kCmpLeF32S, //!< Scalar f32 compare lesser-or-equal (ordered).
+ kCmpLeF64S, //!< Scalar f64 compare lesser-or-equal (ordered).
+ kCmpLeF32, //!< Vector f32 compare lesser-or-equal (ordered).
+ kCmpLeF64, //!< Vector f64 compare lesser-or-equal (ordered).
+ kCmpOrdF32S, //!< Scalar f32 compare ordered.
+ kCmpOrdF64S, //!< Scalar f64 compare ordered.
+ kCmpOrdF32, //!< Vector f32 compare ordered.
+ kCmpOrdF64, //!< Vector f64 compare ordered.
+ kCmpUnordF32S, //!< Scalar f32 compare unordered.
+ kCmpUnordF64S, //!< Scalar f64 compare unordered.
+ kCmpUnordF32, //!< Vector f32 compare unordered.
+ kCmpUnordF64, //!< Vector f64 compare unordered.

- kHAddF64,
+ kHAddF64, //!< Vector f64 horizontal-add.

- kCombineLoHiU64,
- kCombineLoHiF64,
- kCombineHiLoU64,
- kCombineHiLoF64,
+ kCombineLoHiU64, //!< Combine low and high u64 lanes.
+ kCombineLoHiF64, //!< Combine low and high f64 lanes.
+ kCombineHiLoU64, //!< Combine high and low u64 lanes.
+ kCombineHiLoF64, //!< Combine high and low f64 lanes.

- kInterleaveLoU8,
- kInterleaveHiU8,
- kInterleaveLoU16,
- kInterleaveHiU16,
- kInterleaveLoU32,
- kInterleaveHiU32,
- kInterleaveLoU64,
- kInterleaveHiU64,
- kInterleaveLoF32,
- kInterleaveHiF32,
- kInterleaveLoF64,
- kInterleaveHiF64,
+ kInterleaveLoU8, //!< Interleave low u8 lanes.
+ kInterleaveHiU8, //!< Interleave high u8 lanes.
+ kInterleaveLoU16, //!< Interleave low u16 lanes.
+ kInterleaveHiU16, //!< Interleave high u16 lanes.
+ kInterleaveLoU32, //!< Interleave low u32 lanes.
+ kInterleaveHiU32, //!< Interleave high u32 lanes.
+ kInterleaveLoU64, //!< Interleave low u64 lanes.
+ kInterleaveHiU64, //!< Interleave high u64 lanes.
+ kInterleaveLoF32, //!< Interleave low f32 lanes.
+ kInterleaveHiF32, //!< Interleave high f32 lanes.
+ kInterleaveLoF64, //!< Interleave low f64 lanes.
+ kInterleaveHiF64, //!< Interleave high f64 lanes.

- kPacksI16_I8,
- kPacksI16_U8,
- kPacksI32_I16,
- kPacksI32_U16,
+ kPacksI16_I8, //!< Pack i16 to i8 with saturation.
+ kPacksI16_U8, //!< Pack i16 to u8 with saturation.
+ kPacksI32_I16, //!< Pack i32 to i16 with saturation.
+ kPacksI32_U16, //!< Pack i32 to u16 with saturation.

- kSwizzlev_U8,
+ kSwizzlev_U8, //!< Swizzle 16xu8 elements in each 128-bit lane.

 #if defined(ASMJIT_UJIT_AARCH64)
@@ -681,10 +737,10 @@ enum class UniOpVVV : uint32_t {

 #elif defined(ASMJIT_UJIT_X86)

- kPermuteU8,
- kPermuteU16,
- kPermuteU32,
- kPermuteU64,
+ kPermuteU8, //!< Permute u8 elements across the vector.
+ kPermuteU16, //!< Permute u16 elements across the vector.
+ kPermuteU32, //!< Permute u32 elements across the vector.
+ kPermuteU64, //!< Permute u64 elements across the vector. kMaxValue = kPermuteU64 @@ -695,86 +751,65 @@ enum class UniOpVVV : uint32_t { #endif // ASMJIT_UJIT_AARCH64 }; +//! Instruction with `[vec, vec, vec, imm]` operands. +//! +//! Describes vector arithmetic that has one destination, two sources, and immediate. +//! +//! \note For convenience, the third operand can be register, memory, or immediate value. enum class UniOpVVVI : uint32_t { - kAlignr_U128, - kInterleaveShuffleU32x4, - kInterleaveShuffleU64x2, - kInterleaveShuffleF32x4, - kInterleaveShuffleF64x2, - kInsertV128_U32, - kInsertV128_F32, - kInsertV128_U64, - kInsertV128_F64, - kInsertV256_U32, - kInsertV256_F32, - kInsertV256_U64, - kInsertV256_F64, + kAlignr_U128, //!< Align-right 8-bit elements in 128-bit. + kInterleaveShuffleU32x4, //!< Interleaved u32x4 shuffle. + kInterleaveShuffleU64x2, //!< Interleaved u64x2 shuffle. + kInterleaveShuffleF32x4, //!< Interleaved f32x4 shuffle. + kInterleaveShuffleF64x2, //!< Interleaved f64x2 shuffle. + kInsertV128_U32, //!< Insert a 128-bit lane (u32) into 256-bit or 512-bit vector. + kInsertV128_F32, //!< Insert a 128-bit lane (f32) into 256-bit or 512-bit vector. + kInsertV128_U64, //!< Insert a 128-bit lane (u64) into 256-bit or 512-bit vector. + kInsertV128_F64, //!< Insert a 128-bit lane (f64) into 256-bit or 512-bit vector. + kInsertV256_U32, //!< Insert a 256-bit lane (u32) into 512-bit vector. + kInsertV256_F32, //!< Insert a 256-bit lane (f32) into 512-bit vector. + kInsertV256_U64, //!< Insert a 256-bit lane (u64) into 512-bit vector. + kInsertV256_F64, //!< Insert a 256-bit lane (f64) into 512-bit vector. kMaxValue = kInsertV256_F64 }; +//! Instruction with `[vec, vec, vec, vec]` operands. +//! +//! Describes vector arithmetic that has one destination and three sources. +//! +//! \note For convenience, the fourth operand can be register, memory, or immediate value. +//! +//! \remarks For FMA functionality, check also \ref FMAddOpBehavior. enum class UniOpVVVV : uint32_t { kBlendV_U8, - kMAddU16, - kMAddU32, + kMAddU16, //!< Vector u16 multiply-add. + kMAddU32, //!< Vector u32 multiply-add. - kMAddF32S, - kMAddF64S, - kMAddF32, - kMAddF64, + kMAddF32S, //!< Scalar f32 multiply-add (FMA if available, or separate MUL+ADD if not). + kMAddF64S, //!< Scalar f64 multiply-add (FMA if available, or separate MUL+ADD if not). + kMAddF32, //!< Vector f32 multiply-add (FMA if available, or separate MUL+ADD if not). + kMAddF64, //!< Vector f64 multiply-add (FMA if available, or separate MUL+ADD if not). - kMSubF32S, - kMSubF64S, - kMSubF32, - kMSubF64, + kMSubF32S, //!< Scalar f32 multiply-sub (FMA if available, or separate MUL+ADD if not). + kMSubF64S, //!< Scalar f64 multiply-sub (FMA if available, or separate MUL+ADD if not). + kMSubF32, //!< Vector f32 multiply-sub (FMA if available, or separate MUL+ADD if not). + kMSubF64, //!< Vector f64 multiply-sub (FMA if available, or separate MUL+ADD if not). - kNMAddF32S, - kNMAddF64S, - kNMAddF32, - kNMAddF64, + kNMAddF32S, //!< Scalar f32 negated-multiply-add (FMA if available, or separate MUL+ADD if not) + kNMAddF64S, //!< Scalar f64 negated-multiply-add (FMA if available, or separate MUL+ADD if not) + kNMAddF32, //!< Vector f32 negated-multiply-add (FMA if available, or separate MUL+ADD if not). + kNMAddF64, //!< Vector f64 negated-multiply-add (FMA if available, or separate MUL+ADD if not). 
- kNMSubF32S, - kNMSubF64S, - kNMSubF32, - kNMSubF64, + kNMSubF32S, //!< Scalar f32 negated-multiply-sub (FMA if available, or separate MUL+ADD if not). + kNMSubF64S, //!< Scalar f64 negated-multiply-sub (FMA if available, or separate MUL+ADD if not). + kNMSubF32, //!< Vector f32 negated-multiply-sub (FMA if available, or separate MUL+ADD if not). + kNMSubF64, //!< Vector f64 negated-multiply-sub (FMA if available, or separate MUL+ADD if not). kMaxValue = kNMSubF64 }; -//! Pipeline optimization flags used by \ref UniCompiler. -enum class UniOptFlags : uint32_t { - //! No flags. - kNone = 0x0u, - - //! CPU has instructions that can perform 8-bit masked loads and stores. - kMaskOps8Bit = 0x00000001u, - - //! CPU has instructions that can perform 16-bit masked loads and stores. - kMaskOps16Bit = 0x00000002u, - - //! CPU has instructions that can perform 32-bit masked loads and stores. - kMaskOps32Bit = 0x00000004u, - - //! CPU has instructions that can perform 64-bit masked loads and stores. - kMaskOps64Bit = 0x00000008u, - - //! CPU provides low-latency 32-bit multiplication (AMD CPUs). - kFastVpmulld = 0x00000010u, - - //! CPU provides low-latency 64-bit multiplication (AMD CPUs). - kFastVpmullq = 0x00000020u, - - //! CPU performs hardware gathers faster than a sequence of loads and packing. - kFastGather = 0x00000040u, - - //! CPU has fast stores with mask. - //! - //! \note This is a hint to the compiler to emit a masked store instead of a sequence having branches. - kFastStoreWithMask = 0x00000080u -}; -ASMJIT_DEFINE_ENUM_FLAGS(UniOptFlags) - //! \} ASMJIT_END_SUB_NAMESPACE diff --git a/src/asmjit/ujit/vecconsttable.h b/src/asmjit/ujit/vecconsttable.h index 7444aa6..b0779ff 100644 --- a/src/asmjit/ujit/vecconsttable.h +++ b/src/asmjit/ujit/vecconsttable.h @@ -6,7 +6,7 @@ #ifndef ASMJIT_UJIT_VECCONSTTABLE_H_INCLUDED #define ASMJIT_UJIT_VECCONSTTABLE_H_INCLUDED -#include "ujitbase.h" +#include "../core/globals.h" #if !defined(ASMJIT_NO_UJIT) @@ -18,6 +18,8 @@ ASMJIT_BEGIN_SUB_NAMESPACE(ujit) template struct VecConst; +//! \cond + //! A 64-bit vector constant of type `T` aligned to 64 bits. template struct ASMJIT_MAY_ALIAS ASMJIT_ALIGNAS(8) VecConst { @@ -30,6 +32,12 @@ struct ASMJIT_MAY_ALIAS ASMJIT_ALIGNAS(8) VecConst { static_assert(kElementCount > 0u, "Vector constant must have at least one element"); ElementType data[kElementCount]; + + template + ASMJIT_INLINE_NODEBUG const DstT& as() const noexcept { + static_assert(sizeof(DstT) <= sizeof(*this), "Size of the destination type DstT must be <= 8"); + return *static_cast(static_cast(this)); + } }; //! A 128-bit vector constant of type `T` aligned to 128 bits. @@ -44,6 +52,12 @@ struct ASMJIT_MAY_ALIAS ASMJIT_ALIGNAS(16) VecConst { static_assert(kElementCount > 0u, "Vector constant must have at least one element"); ElementType data[kElementCount]; + + template + ASMJIT_INLINE_NODEBUG const DstT& as() const noexcept { + static_assert(sizeof(DstT) <= sizeof(*this), "Size of the destination type DstT must be <= 16"); + return *static_cast(static_cast(this)); + } }; //! A 256-bit vector constant of type `T` aligned to 256 bits. @@ -58,6 +72,12 @@ struct ASMJIT_MAY_ALIAS ASMJIT_ALIGNAS(32) VecConst { static_assert(kElementCount > 0u, "Vector constant must have at least one element"); ElementType data[kElementCount]; + + template + ASMJIT_INLINE_NODEBUG const DstT& as() const noexcept { + static_assert(sizeof(DstT) <= sizeof(*this), "Size of the destination type DstT must be <= 32"); + return *static_cast(static_cast(this)); + } }; //! 
A 512-bit vector constant of type `T` aligned to 512 bits. @@ -72,8 +92,16 @@ struct ASMJIT_MAY_ALIAS ASMJIT_ALIGNAS(64) VecConst { static_assert(kElementCount > 0u, "Vector constant must have at least one element"); ElementType data[kElementCount]; + + template + ASMJIT_INLINE_NODEBUG const DstT& as() const noexcept { + static_assert(sizeof(DstT) <= sizeof(*this), "Size of the destination type DstT must be <= 64"); + return *static_cast(static_cast(this)); + } }; +//! \endcond + template using VecConst64 = VecConst; template using VecConst128 = VecConst; template using VecConst256 = VecConst; @@ -397,18 +425,27 @@ struct VecConstTable { VecConstNative p_0000800000008000 = make_const>(uint64_t(0x0000800000008000u)); - VecConst128 sign32_scalar = make_const>(0u, 0u, 0u, uint32_t(0x80000000u)); - VecConst128 sign64_scalar = make_const>(uint64_t(0u), uint64_t(0x8000000000000000u)); + VecConst128 sign32_scalar = make_const>(0u, 0u, 0u, uint32_t(0x80000000u)); + VecConst128 sign64_scalar = make_const>(uint64_t(0u), uint64_t(0x8000000000000000u)); - VecConstNative f32_1 = make_const>(1.0f); - VecConstNative f32_round_max = make_const>(8388608.0f); + VecConstNative f32_0_5_minus_1ulp = make_const>(0x3EFFFFFF3EFFFFFFu); // 0.49999997 (0.5f - 1ulp) + VecConstNative f32_0_5 = make_const>(0.5f); + VecConstNative f32_1 = make_const>(1.0f); + VecConstNative f32_round_magic = make_const>(8388608.0f); - VecConstNative f64_1 = make_const>(1.0); - VecConstNative f64_round_max = make_const>(4503599627370496.0); + VecConstNative f64_0_5_minus_1ulp = make_const>(0x3FDFFFFFFFFFFFFFu); // 0.49999999999999994 (0.5 - 1ulp). + VecConstNative f64_0_5 = make_const>(0.5); + VecConstNative f64_1 = make_const>(1.0); + VecConstNative f64_round_magic = make_const>(4503599627370496.0); }; ASMJIT_VARAPI const VecConstTable vec_const_table; +struct VecConstTableRef { + const VecConstTable& table; + size_t size; +}; + //! \} ASMJIT_END_SUB_NAMESPACE diff --git a/src/asmjit/x86/x86compiler.h b/src/asmjit/x86/x86compiler.h index 219a418..ce4eabe 100644 --- a/src/asmjit/x86/x86compiler.h +++ b/src/asmjit/x86/x86compiler.h @@ -511,99 +511,182 @@ public: //! \name Virtual Registers //! \{ -#ifndef ASMJIT_NO_LOGGING -# define ASMJIT_NEW_REG_FMT(OUT, PARAM, FORMAT, ARGS) \ - _new_reg_fmt(Out{OUT}, PARAM, FORMAT, ARGS) -#else -# define ASMJIT_NEW_REG_FMT(OUT, PARAM, FORMAT, ARGS) \ - Support::maybe_unused(FORMAT); \ - Support::maybe_unused(std::forward(args)...); \ - _new_reg(Out{OUT}, PARAM) -#endif + //! Creates a new general-purpose register with `type_id` type and optional name passed via `args`. + //! + //! \note Using \ref TypeId is too generic. In general it's recommended to use \ref new_gp8(), + //! \ref new_gp16(), \ref new_gp32(), \ref new_gp64(), and \ref new_gpz() or \ref new_gp_ptr(). + template + ASMJIT_INLINE_NODEBUG Gp new_gp(TypeId type_id, Args&&... args) { return new_reg(type_id, std::forward(args)...); } -#define ASMJIT_NEW_REG_CUSTOM(FUNC, REG) \ - ASMJIT_INLINE_NODEBUG REG FUNC(TypeId type_id) { \ - REG reg(Globals::NoInit); \ - _new_reg(Out{reg}, type_id); \ - return reg; \ - } \ - \ - template \ - ASMJIT_INLINE_NODEBUG REG FUNC(TypeId type_id, const char* fmt, Args&&... args) {\ - REG reg(Globals::NoInit); \ - ASMJIT_NEW_REG_FMT(reg, type_id, fmt, std::forward(args)...); \ - return reg; \ - } + //! Creates a new vector register with `type_id` type and optional name passed via `args`. + //! + //! \note Using \ref TypeId is too generic. In general it's recommended to use \ref new_vec128(), + //! 
\ref new_vec256(), \ref new_vec512(), or alternatively \ref new_xmm(), \ref new_ymm(), and \ref new_zmm(). + template + ASMJIT_INLINE_NODEBUG Vec new_vec(TypeId type_id, Args&&... args) { return new_reg(type_id, std::forward(args)...); } -#define ASMJIT_NEW_REG_TYPED(FUNC, REG, TYPE_ID) \ - ASMJIT_INLINE_NODEBUG REG FUNC() { \ - REG reg(Globals::NoInit); \ - _new_reg(Out{reg}, TYPE_ID); \ - return reg; \ - } \ - \ - template \ - ASMJIT_INLINE_NODEBUG REG FUNC(const char* fmt, Args&&... args) { \ - REG reg(Globals::NoInit); \ - ASMJIT_NEW_REG_FMT(reg, TYPE_ID, fmt, std::forward(args)...); \ - return reg; \ - } + //! Creates a new mask register with `type_id` type and optional name passed via `args`. + template + ASMJIT_INLINE_NODEBUG KReg new_k(TypeId type_id, Args&&... args) { return new_reg(type_id, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG RegT new_similar_reg(const RegT& ref) { - RegT reg(Globals::NoInit); - _new_reg(Out(reg), ref); - return reg; - } + //! Creates a new 8-bit general purpose register mapped to low 8 bits of a full register. + //! + //! \note Using 8-bit registers is not recommended, use at least 32-bit registers in portable code. + template + ASMJIT_INLINE_NODEBUG Gp new_gp8(Args&&... args) { return new_reg(TypeId::kUInt8, std::forward(args)...); } - template - ASMJIT_INLINE_NODEBUG RegT new_similar_reg(const RegT& ref, const char* fmt, Args&&... args) { - RegT reg(Globals::NoInit); - ASMJIT_NEW_REG_FMT(reg, ref, fmt, std::forward(args)...); - return reg; - } + //! Creates a new 16-bit general purpose register mapped to low 16 bits of a full register. + //! + //! \note Using 16-bit registers is not recommended, use at least 32-bit registers in portable code. + template + ASMJIT_INLINE_NODEBUG Gp new_gp16(Args&&... args) { return new_reg(TypeId::kUInt16, std::forward(args)...); } - ASMJIT_NEW_REG_CUSTOM(new_reg , Reg ) - ASMJIT_NEW_REG_CUSTOM(new_gp , Gp ) - ASMJIT_NEW_REG_CUSTOM(new_vec , Vec ) - ASMJIT_NEW_REG_CUSTOM(new_kreg , KReg) + //! Creates a new 32-bit general purpose register mapped to low 32 bits of a full register (on 64-bit targets). + template + ASMJIT_INLINE_NODEBUG Gp new_gp32(Args&&... args) { return new_reg(TypeId::kUInt32, std::forward(args)...); } - ASMJIT_NEW_REG_TYPED(new_gp8 , Gp , TypeId::kUInt8) - ASMJIT_NEW_REG_TYPED(new_gp16 , Gp , TypeId::kUInt16) - ASMJIT_NEW_REG_TYPED(new_gp32 , Gp , TypeId::kUInt32) - ASMJIT_NEW_REG_TYPED(new_gp64 , Gp , TypeId::kUInt64) + //! Creates a new 64-bit general purpose register. + //! + //! \warning The target must be 64-bit in order to create 64-bit registers. + template + ASMJIT_INLINE_NODEBUG Gp new_gp64(Args&&... args) { return new_reg(TypeId::kUInt64, std::forward(args)...); } - ASMJIT_NEW_REG_TYPED(new_gpb , Gp , TypeId::kUInt8) - ASMJIT_NEW_REG_TYPED(new_gpw , Gp , TypeId::kUInt16) - ASMJIT_NEW_REG_TYPED(new_gpd , Gp , TypeId::kUInt32) - ASMJIT_NEW_REG_TYPED(new_gpq , Gp , TypeId::kUInt64) - ASMJIT_NEW_REG_TYPED(new_gpz , Gp , TypeId::kUIntPtr) - ASMJIT_NEW_REG_TYPED(new_gp_ptr, Gp , TypeId::kUIntPtr) + //! Creates a new 32-bit or 64-bit general purpose register depending on the target register width. + template + ASMJIT_INLINE_NODEBUG Gp new_gpz(Args&&... 
args) { return new_reg(TypeId::kUIntPtr, std::forward(args)...); } - ASMJIT_NEW_REG_TYPED(new_xmm , Vec , TypeId::kInt32x4) - ASMJIT_NEW_REG_TYPED(new_xmm_ss, Vec , TypeId::kFloat32x1) - ASMJIT_NEW_REG_TYPED(new_xmm_sd, Vec , TypeId::kFloat64x1) - ASMJIT_NEW_REG_TYPED(new_xmm_ps, Vec , TypeId::kFloat32x4) - ASMJIT_NEW_REG_TYPED(new_xmm_pd, Vec , TypeId::kFloat64x2) + //! Creates a new 32-bit or 64-bit general purpose register depending on the target register width. + //! + //! \note This is just an alternative name that maps more closely to C's `uintptr_t`, it's the same function as + //! \ref new_gpz(). + template + ASMJIT_INLINE_NODEBUG Gp new_gp_ptr(Args&&... args) { return new_reg(TypeId::kUIntPtr, std::forward(args)...); } - ASMJIT_NEW_REG_TYPED(new_ymm , Vec , TypeId::kInt32x8) - ASMJIT_NEW_REG_TYPED(new_ymm_ps, Vec , TypeId::kFloat32x8) - ASMJIT_NEW_REG_TYPED(new_ymm_pd, Vec , TypeId::kFloat64x4) + //! Creates a new 128-bit vector register (XMM). + template + ASMJIT_INLINE_NODEBUG Vec new_vec128(Args&&... args) { return new_reg(TypeId::kInt32x4, std::forward(args)...); } - ASMJIT_NEW_REG_TYPED(new_zmm , Vec , TypeId::kInt32x16) - ASMJIT_NEW_REG_TYPED(new_zmm_ps, Vec , TypeId::kFloat32x16) - ASMJIT_NEW_REG_TYPED(new_zmm_pd, Vec , TypeId::kFloat64x8) + //! Creates a new 128-bit vector register (XMM) that will be used for scalar 32-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec128_f32x1(Args&&... args) { return new_reg(TypeId::kFloat32x1, std::forward(args)...); } - ASMJIT_NEW_REG_TYPED(new_mm , Mm , TypeId::kMmx64) + //! Creates a new 128-bit vector register (XMM) that will be used for scalar 64-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec128_f64x1(Args&&... args) { return new_reg(TypeId::kFloat64x1, std::forward(args)...); } - ASMJIT_NEW_REG_TYPED(new_kb , KReg, TypeId::kMask8) - ASMJIT_NEW_REG_TYPED(new_kw , KReg, TypeId::kMask16) - ASMJIT_NEW_REG_TYPED(new_kd , KReg, TypeId::kMask32) - ASMJIT_NEW_REG_TYPED(new_kq , KReg, TypeId::kMask64) + //! Creates a new 128-bit vector register (XMM) that will be used for packed 32-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec128_f32x4(Args&&... args) { return new_reg(TypeId::kFloat32x4, std::forward(args)...); } -#undef ASMJIT_NEW_REG_TYPED -#undef ASMJIT_NEW_REG_CUSTOM -#undef ASMJIT_NEW_REG_FMT + //! Creates a new 128-bit vector register (XMM) that will be used for packed 64-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec128_f64x2(Args&&... args) { return new_reg(TypeId::kFloat64x2, std::forward(args)...); } + + //! Creates a new 256-bit vector register (YMM). + template + ASMJIT_INLINE_NODEBUG Vec new_vec256(Args&&... args) { return new_reg(TypeId::kInt32x8, std::forward(args)...); } + + //! Creates a new 256-bit vector register (YMM) that will be used for packed 32-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec256_f32x8(Args&&... args) { return new_reg(TypeId::kFloat32x8, std::forward(args)...); } + + //! Creates a new 256-bit vector register (YMM) that will be used for packed 64-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec256_f64x4(Args&&... args) { return new_reg(TypeId::kFloat64x4, std::forward(args)...); } + + //! Creates a new 512-bit vector register (ZMM). + template + ASMJIT_INLINE_NODEBUG Vec new_vec512(Args&&... args) { return new_reg(TypeId::kInt32x16, std::forward(args)...); } + + //! 
Creates a new 512-bit vector register (ZMM) that will be used for packed 32-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec512_f32x16(Args&&... args) { return new_reg(TypeId::kFloat32x16, std::forward(args)...); } + + //! Creates a new 512-bit vector register (ZMM) that will be used for packed 64-bit floating point operation. + template + ASMJIT_INLINE_NODEBUG Vec new_vec512_f64x8(Args&&... args) { return new_reg(TypeId::kFloat64x8, std::forward(args)...); } + + //! Alias of \ref new_vec128() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_xmm(Args&&... args) { return new_reg(TypeId::kInt32x4, std::forward(args)...); } + + //! Alias of \ref new_vec128_f32x1() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_xmm_ss(Args&&... args) { return new_reg(TypeId::kFloat32x1, std::forward(args)...); } + + //! Alias of \ref new_vec128_f64x1() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_xmm_sd(Args&&... args) { return new_reg(TypeId::kFloat64x1, std::forward(args)...); } + + //! Alias of \ref new_vec128_f32x4() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_xmm_ps(Args&&... args) { return new_reg(TypeId::kFloat32x4, std::forward(args)...); } + + //! Alias of \ref new_vec128_f64x2() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_xmm_pd(Args&&... args) { return new_reg(TypeId::kFloat64x2, std::forward(args)...); } + + //! Alias of \ref new_vec256() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_ymm(Args&&... args) { return new_reg(TypeId::kInt32x8, std::forward(args)...); } + + //! Alias of \ref new_vec256_f32x8() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_ymm_ps(Args&&... args) { return new_reg(TypeId::kFloat32x8, std::forward(args)...); } + + //! Alias of \ref new_vec256_f64x4() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_ymm_pd(Args&&... args) { return new_reg(TypeId::kFloat64x4, std::forward(args)...); } + + //! Alias of \ref new_vec512() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_zmm(Args&&... args) { return new_reg(TypeId::kInt32x16, std::forward(args)...); } + + //! Alias of \ref new_vec512_f32x16() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_zmm_ps(Args&&... args) { return new_reg(TypeId::kFloat32x16, std::forward(args)...); } + + //! Alias of \ref new_vec512_f64x8() that matches x86 architecture terminology. + template + ASMJIT_INLINE_NODEBUG Vec new_zmm_pd(Args&&... args) { return new_reg(TypeId::kFloat64x8, std::forward(args)...); } + + //! Creates a new 64-bit MMX register. + //! + //! \note MMX ISA is generally deprecated by the X86 architecture. + template + ASMJIT_INLINE_NODEBUG Mm new_mm(Args&&... args) { return new_reg(TypeId::kMmx64, std::forward(args)...); } + + //! Creates a new 8-bit mask (K) register. + template + ASMJIT_INLINE_NODEBUG KReg new_k8(Args&&... args) { return new_reg(TypeId::kMask8, std::forward(args)...); } + + //! Creates a new 16-bit mask (K) register. + template + ASMJIT_INLINE_NODEBUG KReg new_k16(Args&&... args) { return new_reg(TypeId::kMask16, std::forward(args)...); } + + //! Creates a new 32-bit mask (K) register. + template + ASMJIT_INLINE_NODEBUG KReg new_k32(Args&&... 
args) { return new_reg(TypeId::kMask32, std::forward(args)...); } + + //! Creates a new 64-bit mask (K) register. + template + ASMJIT_INLINE_NODEBUG KReg new_k64(Args&&... args) { return new_reg(TypeId::kMask64, std::forward(args)...); } + + //! Creates a new 8-bit mask (K) register, alias of \ref new_k8(). + template + ASMJIT_INLINE_NODEBUG KReg new_kb(Args&&... args) { return new_reg(TypeId::kMask8, std::forward(args)...); } + + //! Creates a new 16-bit mask (K) register, alias of \ref new_k16(). + template + ASMJIT_INLINE_NODEBUG KReg new_kw(Args&&... args) { return new_reg(TypeId::kMask16, std::forward(args)...); } + + //! Creates a new 32-bit mask (K) register, alias of \ref new_k32(). + template + ASMJIT_INLINE_NODEBUG KReg new_kd(Args&&... args) { return new_reg(TypeId::kMask32, std::forward(args)...); } + + //! Creates a new 64-bit mask (K) register, alias of \ref new_k64(). + template + ASMJIT_INLINE_NODEBUG KReg new_kq(Args&&... args) { return new_reg(TypeId::kMask64, std::forward(args)...); } //! \} diff --git a/src/asmjit/x86/x86formatter.cpp b/src/asmjit/x86/x86formatter.cpp index 1235a82..39c554c 100644 --- a/src/asmjit/x86/x86formatter.cpp +++ b/src/asmjit/x86/x86formatter.cpp @@ -561,7 +561,7 @@ struct ImmBits { char text[48 - 3]; }; -ASMJIT_FAVOR_SIZE static Error FormatterInternal_formatImmShuf(String& sb, uint32_t imm8, uint32_t bits, uint32_t count) noexcept { +ASMJIT_FAVOR_SIZE static Error FormatterInternal_format_imm_shuf(String& sb, uint32_t imm8, uint32_t bits, uint32_t count) noexcept { uint32_t mask = (1 << bits) - 1; uint32_t last_predicate_shift = bits * (count - 1u); @@ -576,7 +576,7 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_formatImmShuf(String& sb, uint3 return Error::kOk; } -ASMJIT_FAVOR_SIZE static Error FormatterInternal_formatImmBits(String& sb, uint32_t imm8, const ImmBits* bits, uint32_t count) noexcept { +ASMJIT_FAVOR_SIZE static Error FormatterInternal_format_imm_bits(String& sb, uint32_t imm8, const ImmBits* bits, uint32_t count) noexcept { uint32_t n = 0; char buf[64]; @@ -615,7 +615,7 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_formatImmBits(String& sb, uint3 return Error::kOk; } -ASMJIT_FAVOR_SIZE static Error FormatterInternal_formatImmText(String& sb, uint32_t imm8, uint32_t bits, uint32_t advance, const char* text, uint32_t count = 1) noexcept { +ASMJIT_FAVOR_SIZE static Error FormatterInternal_format_imm_text(String& sb, uint32_t imm8, uint32_t bits, uint32_t advance, const char* text, uint32_t count = 1) noexcept { uint32_t mask = (1u << bits) - 1; uint32_t pos = 0; @@ -628,7 +628,7 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_formatImmText(String& sb, uint3 return sb.append(kImmCharEnd); } -ASMJIT_FAVOR_SIZE static Error FormatterInternal_explainConst( +ASMJIT_FAVOR_SIZE static Error FormatterInternal_explain_const( String& sb, FormatFlags format_flags, InstId inst_id, @@ -700,55 +700,55 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_explainConst( }; static const ImmBits vroundxx[] = { - { 0x07u, 0, ImmBits::kModeLookup, "ROUND\0" "FLOOR\0" "CEIL\0" "TRUNC\0" "\0" "\0" "\0" "\0" }, - { 0x08u, 3, ImmBits::kModeLookup, "\0" "INEXACT\0" } + { 0x07u, 0, ImmBits::kModeLookup, "ROUND\0" "FLOOR\0" "CEIL\0" "TRUNC\0" "CURRENT\0" "\0" "\0" "\0" }, + { 0x08u, 3, ImmBits::kModeLookup, "\0" "SUPPRESS\0" } }; uint32_t u8 = imm.value_as(); switch (inst_id) { case Inst::kIdVblendpd: case Inst::kIdBlendpd: - return FormatterInternal_formatImmShuf(sb, u8, 1, vec_size / 8); + return FormatterInternal_format_imm_shuf(sb, u8, 1, 
vec_size / 8); case Inst::kIdVblendps: case Inst::kIdBlendps: - return FormatterInternal_formatImmShuf(sb, u8, 1, vec_size / 4); + return FormatterInternal_format_imm_shuf(sb, u8, 1, vec_size / 4); case Inst::kIdVcmppd: case Inst::kIdVcmpps: case Inst::kIdVcmpsd: case Inst::kIdVcmpss: - return FormatterInternal_formatImmText(sb, u8, 5, 0, vcmpx); + return FormatterInternal_format_imm_text(sb, u8, 5, 0, vcmpx); case Inst::kIdCmppd: case Inst::kIdCmpps: case Inst::kIdCmpsd: case Inst::kIdCmpss: - return FormatterInternal_formatImmText(sb, u8, 3, 0, vcmpx); + return FormatterInternal_format_imm_text(sb, u8, 3, 0, vcmpx); case Inst::kIdVdbpsadbw: - return FormatterInternal_formatImmShuf(sb, u8, 2, 4); + return FormatterInternal_format_imm_shuf(sb, u8, 2, 4); case Inst::kIdVdppd: case Inst::kIdVdpps: case Inst::kIdDppd: case Inst::kIdDpps: - return FormatterInternal_formatImmShuf(sb, u8, 1, 8); + return FormatterInternal_format_imm_shuf(sb, u8, 1, 8); case Inst::kIdVmpsadbw: case Inst::kIdMpsadbw: - return FormatterInternal_formatImmBits(sb, u8, vmpsadbw, Support::min(vec_size / 8, 4)); + return FormatterInternal_format_imm_bits(sb, u8, vmpsadbw, Support::min(vec_size / 8, 4)); case Inst::kIdVpblendw: case Inst::kIdPblendw: - return FormatterInternal_formatImmShuf(sb, u8, 1, 8); + return FormatterInternal_format_imm_shuf(sb, u8, 1, 8); case Inst::kIdVpblendd: - return FormatterInternal_formatImmShuf(sb, u8, 1, Support::min(vec_size / 4, 8)); + return FormatterInternal_format_imm_shuf(sb, u8, 1, Support::min(vec_size / 4, 8)); case Inst::kIdVpclmulqdq: case Inst::kIdPclmulqdq: - return FormatterInternal_formatImmBits(sb, u8, vpclmulqdq, ASMJIT_ARRAY_SIZE(vpclmulqdq)); + return FormatterInternal_format_imm_bits(sb, u8, vpclmulqdq, ASMJIT_ARRAY_SIZE(vpclmulqdq)); case Inst::kIdVroundpd: case Inst::kIdVroundps: @@ -758,57 +758,57 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_explainConst( case Inst::kIdRoundps: case Inst::kIdRoundsd: case Inst::kIdRoundss: - return FormatterInternal_formatImmBits(sb, u8, vroundxx, ASMJIT_ARRAY_SIZE(vroundxx)); + return FormatterInternal_format_imm_bits(sb, u8, vroundxx, ASMJIT_ARRAY_SIZE(vroundxx)); case Inst::kIdVshufpd: case Inst::kIdShufpd: - return FormatterInternal_formatImmText(sb, u8, 1, 2, vshufpd, Support::min(vec_size / 8, 8)); + return FormatterInternal_format_imm_text(sb, u8, 1, 2, vshufpd, Support::min(vec_size / 8, 8)); case Inst::kIdVshufps: case Inst::kIdShufps: - return FormatterInternal_formatImmText(sb, u8, 2, 4, vshufps, 4); + return FormatterInternal_format_imm_text(sb, u8, 2, 4, vshufps, 4); case Inst::kIdVcvtps2ph: - return FormatterInternal_formatImmBits(sb, u8, vroundxx, 1); + return FormatterInternal_format_imm_bits(sb, u8, vroundxx, 1); case Inst::kIdVperm2f128: case Inst::kIdVperm2i128: - return FormatterInternal_formatImmBits(sb, u8, vperm2x128, ASMJIT_ARRAY_SIZE(vperm2x128)); + return FormatterInternal_format_imm_bits(sb, u8, vperm2x128, ASMJIT_ARRAY_SIZE(vperm2x128)); case Inst::kIdVpermilpd: - return FormatterInternal_formatImmShuf(sb, u8, 1, vec_size / 8); + return FormatterInternal_format_imm_shuf(sb, u8, 1, vec_size / 8); case Inst::kIdVpermilps: - return FormatterInternal_formatImmShuf(sb, u8, 2, 4); + return FormatterInternal_format_imm_shuf(sb, u8, 2, 4); case Inst::kIdVpshufd: case Inst::kIdPshufd: - return FormatterInternal_formatImmShuf(sb, u8, 2, 4); + return FormatterInternal_format_imm_shuf(sb, u8, 2, 4); case Inst::kIdVpshufhw: case Inst::kIdVpshuflw: case Inst::kIdPshufhw: case Inst::kIdPshuflw: case 
Inst::kIdPshufw: - return FormatterInternal_formatImmShuf(sb, u8, 2, 4); + return FormatterInternal_format_imm_shuf(sb, u8, 2, 4); case Inst::kIdVfixupimmpd: case Inst::kIdVfixupimmps: case Inst::kIdVfixupimmsd: case Inst::kIdVfixupimmss: - return FormatterInternal_formatImmBits(sb, u8, vfixupimmxx, ASMJIT_ARRAY_SIZE(vfixupimmxx)); + return FormatterInternal_format_imm_bits(sb, u8, vfixupimmxx, ASMJIT_ARRAY_SIZE(vfixupimmxx)); case Inst::kIdVfpclasspd: case Inst::kIdVfpclassps: case Inst::kIdVfpclasssd: case Inst::kIdVfpclassss: - return FormatterInternal_formatImmBits(sb, u8, vfpclassxx, ASMJIT_ARRAY_SIZE(vfpclassxx)); + return FormatterInternal_format_imm_bits(sb, u8, vfpclassxx, ASMJIT_ARRAY_SIZE(vfpclassxx)); case Inst::kIdVgetmantpd: case Inst::kIdVgetmantps: case Inst::kIdVgetmantsd: case Inst::kIdVgetmantss: - return FormatterInternal_formatImmBits(sb, u8, vgetmantxx, ASMJIT_ARRAY_SIZE(vgetmantxx)); + return FormatterInternal_format_imm_bits(sb, u8, vgetmantxx, ASMJIT_ARRAY_SIZE(vgetmantxx)); case Inst::kIdVpcmpb: case Inst::kIdVpcmpd: @@ -818,7 +818,7 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_explainConst( case Inst::kIdVpcmpud: case Inst::kIdVpcmpuq: case Inst::kIdVpcmpuw: - return FormatterInternal_formatImmText(sb, u8, 3, 0, vpcmpx); + return FormatterInternal_format_imm_text(sb, u8, 3, 0, vpcmpx); case Inst::kIdVpcomb: case Inst::kIdVpcomd: @@ -828,21 +828,21 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_explainConst( case Inst::kIdVpcomud: case Inst::kIdVpcomuq: case Inst::kIdVpcomuw: - return FormatterInternal_formatImmText(sb, u8, 3, 0, vpcomx); + return FormatterInternal_format_imm_text(sb, u8, 3, 0, vpcomx); case Inst::kIdVpermq: case Inst::kIdVpermpd: - return FormatterInternal_formatImmShuf(sb, u8, 2, 4); + return FormatterInternal_format_imm_shuf(sb, u8, 2, 4); case Inst::kIdVpternlogd: case Inst::kIdVpternlogq: - return FormatterInternal_formatImmShuf(sb, u8, 1, 8); + return FormatterInternal_format_imm_shuf(sb, u8, 1, 8); case Inst::kIdVrangepd: case Inst::kIdVrangeps: case Inst::kIdVrangesd: case Inst::kIdVrangess: - return FormatterInternal_formatImmBits(sb, u8, vrangexx, ASMJIT_ARRAY_SIZE(vrangexx)); + return FormatterInternal_format_imm_bits(sb, u8, vrangexx, ASMJIT_ARRAY_SIZE(vrangexx)); case Inst::kIdVreducepd: case Inst::kIdVreduceps: @@ -852,7 +852,7 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_explainConst( case Inst::kIdVrndscaleps: case Inst::kIdVrndscalesd: case Inst::kIdVrndscaless: - return FormatterInternal_formatImmBits(sb, u8, vreducexx_vrndscalexx, ASMJIT_ARRAY_SIZE(vreducexx_vrndscalexx)); + return FormatterInternal_format_imm_bits(sb, u8, vreducexx_vrndscalexx, ASMJIT_ARRAY_SIZE(vreducexx_vrndscalexx)); case Inst::kIdVshuff32x4: case Inst::kIdVshuff64x2: @@ -860,7 +860,7 @@ ASMJIT_FAVOR_SIZE static Error FormatterInternal_explainConst( case Inst::kIdVshufi64x2: { uint32_t count = Support::max(vec_size / 16, 2u); uint32_t bits = count <= 2 ? 1u : 2u; - return FormatterInternal_formatImmShuf(sb, u8, bits, count); + return FormatterInternal_format_imm_shuf(sb, u8, bits, count); } default: @@ -969,7 +969,7 @@ ASMJIT_FAVOR_SIZE Error FormatterInternal::format_instruction( vec_size = Support::max(vec_size, operands[j].as().size()); } } - ASMJIT_PROPAGATE(FormatterInternal_explainConst(sb, format_flags, inst_id, vec_size, op.as())); + ASMJIT_PROPAGATE(FormatterInternal_explain_const(sb, format_flags, inst_id, vec_size, op.as())); } // Support AVX-512 masking - {k}{z}. 
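
Before the test-suite reorganization below, a short usage sketch of the x86::Compiler register factories introduced in the x86compiler.h hunk above (new_gp32(), new_gp_ptr(), new_vec128_f32x4(), new_k32(), and friends). Only the factory names come from this patch; the surrounding function, register names, and the emitted instruction sequence are illustrative assumptions, not part of the change:

  #include <asmjit/x86.h>

  using namespace asmjit;

  // Minimal sketch: create typed virtual registers using the new width-based factories.
  static void emit_example(x86::Compiler& cc) {
    x86::Gp acc  = cc.new_gp32("acc");         // 32-bit general purpose register.
    x86::Gp ptr  = cc.new_gp_ptr("ptr");       // Pointer-sized (32-bit or 64-bit) register.
    x86::Vec v0  = cc.new_vec128_f32x4("v0");  // 128-bit XMM register typed as packed f32.
    x86::KReg k0 = cc.new_k32("k0");           // 32-bit AVX-512 mask register.

    cc.xor_(acc, acc);                         // acc = 0.
    cc.mov(ptr, 0);                            // ptr = nullptr (placeholder value).
    cc.xorps(v0, v0);                          // v0 = {0.0f, 0.0f, 0.0f, 0.0f}.
    Support::maybe_unused(k0);                 // The mask register is left unused in this sketch.
  }

The apparent benefit of replacing the ASMJIT_NEW_REG_* macros with plain documented templates is that the factories remain behaviorally identical while becoming visible to documentation generators and IDE completion.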
diff --git a/test/asmjit_bench_codegen.cpp b/testing/bench/asmjit_bench_codegen.cpp similarity index 97% rename from test/asmjit_bench_codegen.cpp rename to testing/bench/asmjit_bench_codegen.cpp index b11503b..3f4933a 100644 --- a/test/asmjit_bench_codegen.cpp +++ b/testing/bench/asmjit_bench_codegen.cpp @@ -8,8 +8,8 @@ #include #include -#include "asmjitutils.h" -#include "cmdline.h" +#include "../commons/asmjitutils.h" +#include "../commons/cmdline.h" using namespace asmjit; diff --git a/test/asmjit_bench_codegen.h b/testing/bench/asmjit_bench_codegen.h similarity index 97% rename from test/asmjit_bench_codegen.h rename to testing/bench/asmjit_bench_codegen.h index cecbdc4..9df289c 100644 --- a/test/asmjit_bench_codegen.h +++ b/testing/bench/asmjit_bench_codegen.h @@ -7,8 +7,9 @@ #define ASMJIT_TEST_PERF_H_INCLUDED #include -#include "asmjitutils.h" -#include "performancetimer.h" + +#include "../commons/asmjitutils.h" +#include "../commons/performancetimer.h" namespace asmjit_perf_utils { diff --git a/test/asmjit_bench_codegen_a64.cpp b/testing/bench/asmjit_bench_codegen_a64.cpp similarity index 100% rename from test/asmjit_bench_codegen_a64.cpp rename to testing/bench/asmjit_bench_codegen_a64.cpp diff --git a/test/asmjit_bench_codegen_x86.cpp b/testing/bench/asmjit_bench_codegen_x86.cpp similarity index 99% rename from test/asmjit_bench_codegen_x86.cpp rename to testing/bench/asmjit_bench_codegen_x86.cpp index 775be71..0620931 100644 --- a/test/asmjit_bench_codegen_x86.cpp +++ b/testing/bench/asmjit_bench_codegen_x86.cpp @@ -13,7 +13,7 @@ #include #include "asmjit_bench_codegen.h" -#include "asmjit_test_misc.h" +#include "../tests/asmjit_test_misc.h" using namespace asmjit; diff --git a/test/asmjit_bench_overhead.cpp b/testing/bench/asmjit_bench_overhead.cpp similarity index 99% rename from test/asmjit_bench_overhead.cpp rename to testing/bench/asmjit_bench_overhead.cpp index 6b7d0ca..a964c30 100644 --- a/test/asmjit_bench_overhead.cpp +++ b/testing/bench/asmjit_bench_overhead.cpp @@ -1,9 +1,9 @@ #include #include -#include "asmjitutils.h" -#include "cmdline.h" -#include "performancetimer.h" +#include "../commons/asmjitutils.h" +#include "../commons/cmdline.h" +#include "../commons/performancetimer.h" using namespace asmjit; diff --git a/test/asmjit_bench_regalloc.cpp b/testing/bench/asmjit_bench_regalloc.cpp similarity index 98% rename from test/asmjit_bench_regalloc.cpp rename to testing/bench/asmjit_bench_regalloc.cpp index f3ac0bd..32379cd 100644 --- a/test/asmjit_bench_regalloc.cpp +++ b/testing/bench/asmjit_bench_regalloc.cpp @@ -20,13 +20,12 @@ #include #include -#include "asmjitutils.h" +#include "../commons/asmjitutils.h" #if !defined(ASMJIT_NO_COMPILER) - #include "cmdline.h" - #include "performancetimer.h" - #include "asmjit_test_compiler.h" - #include "asmjit_test_random.h" + #include "../commons/cmdline.h" + #include "../commons/performancetimer.h" + #include "../commons/random.h" #endif using namespace asmjit; diff --git a/test/asmjitutils.h b/testing/commons/asmjitutils.h similarity index 89% rename from test/asmjitutils.h rename to testing/commons/asmjitutils.h index 791faef..87c05c1 100644 --- a/test/asmjitutils.h +++ b/testing/commons/asmjitutils.h @@ -88,6 +88,8 @@ static void print_cpu_info() noexcept { // CPU Features // ------------ + using asmjit::CpuHints; + #ifndef ASMJIT_NO_LOGGING printf("CPU Features:\n"); asmjit::CpuFeatures::Iterator it(cpu.features().iterator()); @@ -99,6 +101,27 @@ static void print_cpu_info() noexcept { }; printf("\n"); #endif // 
!ASMJIT_NO_LOGGING + + // CPU Hints + // --------- + + printf("CPU Hints:\n"); + auto print_hint = [&](CpuHints hint, const char* name) { + if ((cpu.hints() & hint) != CpuHints::kNone) { + printf(" %s\n", name); + } + }; + + print_hint(CpuHints::kVecMaskedOps8 , "VecMaskedOps8" ); + print_hint(CpuHints::kVecMaskedOps16 , "VecMaskedOps16" ); + print_hint(CpuHints::kVecMaskedOps32 , "VecMaskedOps32" ); + print_hint(CpuHints::kVecMaskedOps64 , "VecMaskedOps64" ); + print_hint(CpuHints::kVecFastIntMul32, "VecFastIntMul32"); + print_hint(CpuHints::kVecFastIntMul64, "VecFastIntMul64"); + print_hint(CpuHints::kVecFastGather , "VecFastGather" ); + print_hint(CpuHints::kVecMaskedStore , "VecMaskedStore" ); + + printf("\n"); } [[maybe_unused]] diff --git a/test/cmdline.h b/testing/commons/cmdline.h similarity index 100% rename from test/cmdline.h rename to testing/commons/cmdline.h diff --git a/test/performancetimer.h b/testing/commons/performancetimer.h similarity index 100% rename from test/performancetimer.h rename to testing/commons/performancetimer.h diff --git a/test/asmjit_test_random.h b/testing/commons/random.h similarity index 93% rename from test/asmjit_test_random.h rename to testing/commons/random.h index d085ab3..27f99f6 100644 --- a/test/asmjit_test_random.h +++ b/testing/commons/random.h @@ -3,8 +3,8 @@ // See or LICENSE.md for license and copyright information // SPDX-License-Identifier: Zlib -#ifndef ASMJIT_TEST_RANDOM_H_INCLUDED -#define ASMJIT_TEST_RANDOM_H_INCLUDED +#ifndef TESTING_COMMONS_RANDOM_H_INCLUDED +#define TESTING_COMMONS_RANDOM_H_INCLUDED #include #include @@ -74,4 +74,4 @@ public: } // {anonymous} } // {TestUtils} -#endif // ASMJIT_TEST_RANDOM_H_INCLUDED +#endif // TESTING_COMMONS_RANDOM_H_INCLUDED diff --git a/test/asmjit_test_assembler.cpp b/testing/tests/asmjit_test_assembler.cpp similarity index 97% rename from test/asmjit_test_assembler.cpp rename to testing/tests/asmjit_test_assembler.cpp index d92a2b8..32dfd1e 100644 --- a/test/asmjit_test_assembler.cpp +++ b/testing/tests/asmjit_test_assembler.cpp @@ -8,11 +8,11 @@ #include #include -#include "asmjitutils.h" -#include "cmdline.h" - #include "asmjit_test_assembler.h" +#include "../commons/asmjitutils.h" +#include "../commons/cmdline.h" + using namespace asmjit; #if !defined(ASMJIT_NO_X86) diff --git a/test/asmjit_test_assembler.h b/testing/tests/asmjit_test_assembler.h similarity index 100% rename from test/asmjit_test_assembler.h rename to testing/tests/asmjit_test_assembler.h diff --git a/test/asmjit_test_assembler_a64.cpp b/testing/tests/asmjit_test_assembler_a64.cpp similarity index 99% rename from test/asmjit_test_assembler_a64.cpp rename to testing/tests/asmjit_test_assembler_a64.cpp index 566f927..7354291 100644 --- a/test/asmjit_test_assembler_a64.cpp +++ b/testing/tests/asmjit_test_assembler_a64.cpp @@ -12,7 +12,6 @@ #include #include "asmjit_test_assembler.h" -#include "cmdline.h" using namespace asmjit; diff --git a/test/asmjit_test_assembler_x64.cpp b/testing/tests/asmjit_test_assembler_x64.cpp similarity index 99% rename from test/asmjit_test_assembler_x64.cpp rename to testing/tests/asmjit_test_assembler_x64.cpp index 7aaae91..7de00e2 100644 --- a/test/asmjit_test_assembler_x64.cpp +++ b/testing/tests/asmjit_test_assembler_x64.cpp @@ -12,7 +12,6 @@ #include #include "asmjit_test_assembler.h" -#include "cmdline.h" using namespace asmjit; diff --git a/test/asmjit_test_assembler_x86.cpp b/testing/tests/asmjit_test_assembler_x86.cpp similarity index 99% rename from test/asmjit_test_assembler_x86.cpp 
rename to testing/tests/asmjit_test_assembler_x86.cpp index b855d99..0362060 100644 --- a/test/asmjit_test_assembler_x86.cpp +++ b/testing/tests/asmjit_test_assembler_x86.cpp @@ -12,7 +12,6 @@ #include #include "asmjit_test_assembler.h" -#include "cmdline.h" using namespace asmjit; diff --git a/test/asmjit_test_compiler.cpp b/testing/tests/asmjit_test_compiler.cpp similarity index 99% rename from test/asmjit_test_compiler.cpp rename to testing/tests/asmjit_test_compiler.cpp index be4cbbb..b5940b0 100644 --- a/test/asmjit_test_compiler.cpp +++ b/testing/tests/asmjit_test_compiler.cpp @@ -15,10 +15,10 @@ #if !defined(ASMJIT_NO_COMPILER) -#include "cmdline.h" -#include "performancetimer.h" +#include "../commons/asmjitutils.h" +#include "../commons/cmdline.h" +#include "../commons/performancetimer.h" -#include "asmjitutils.h" #include "asmjit_test_compiler.h" #if !defined(ASMJIT_NO_X86) diff --git a/test/asmjit_test_compiler.h b/testing/tests/asmjit_test_compiler.h similarity index 100% rename from test/asmjit_test_compiler.h rename to testing/tests/asmjit_test_compiler.h diff --git a/test/asmjit_test_compiler_a64.cpp b/testing/tests/asmjit_test_compiler_a64.cpp similarity index 99% rename from test/asmjit_test_compiler_a64.cpp rename to testing/tests/asmjit_test_compiler_a64.cpp index 188e857..460999f 100644 --- a/test/asmjit_test_compiler_a64.cpp +++ b/testing/tests/asmjit_test_compiler_a64.cpp @@ -11,7 +11,7 @@ #include #include -#include "./asmjit_test_compiler.h" +#include "asmjit_test_compiler.h" using namespace asmjit; diff --git a/test/asmjit_test_compiler_x86.cpp b/testing/tests/asmjit_test_compiler_x86.cpp similarity index 100% rename from test/asmjit_test_compiler_x86.cpp rename to testing/tests/asmjit_test_compiler_x86.cpp diff --git a/test/asmjit_test_emitters.cpp b/testing/tests/asmjit_test_emitters.cpp similarity index 99% rename from test/asmjit_test_emitters.cpp rename to testing/tests/asmjit_test_emitters.cpp index ed3e27e..897436b 100644 --- a/test/asmjit_test_emitters.cpp +++ b/testing/tests/asmjit_test_emitters.cpp @@ -8,7 +8,7 @@ #include #include -#include "asmjitutils.h" +#include "../commons/asmjitutils.h" #if ASMJIT_ARCH_X86 != 0 #include diff --git a/test/asmjit_test_environment.cpp b/testing/tests/asmjit_test_environment.cpp similarity index 99% rename from test/asmjit_test_environment.cpp rename to testing/tests/asmjit_test_environment.cpp index 963d391..a3b86e1 100644 --- a/test/asmjit_test_environment.cpp +++ b/testing/tests/asmjit_test_environment.cpp @@ -13,7 +13,7 @@ #include #endif -#include "asmjitutils.h" +#include "../commons/asmjitutils.h" using namespace asmjit; diff --git a/test/asmjit_test_instinfo.cpp b/testing/tests/asmjit_test_instinfo.cpp similarity index 99% rename from test/asmjit_test_instinfo.cpp rename to testing/tests/asmjit_test_instinfo.cpp index c69664d..8c794c8 100644 --- a/test/asmjit_test_instinfo.cpp +++ b/testing/tests/asmjit_test_instinfo.cpp @@ -10,7 +10,8 @@ #endif #include -#include "asmjitutils.h" + +#include "../commons/asmjitutils.h" using namespace asmjit; diff --git a/test/asmjit_test_misc.h b/testing/tests/asmjit_test_misc.h similarity index 100% rename from test/asmjit_test_misc.h rename to testing/tests/asmjit_test_misc.h diff --git a/test/asmjit_test_unit.cpp b/testing/tests/asmjit_test_runner.cpp similarity index 97% rename from test/asmjit_test_unit.cpp rename to testing/tests/asmjit_test_runner.cpp index 0903947..ed831b4 100644 --- a/test/asmjit_test_unit.cpp +++ b/testing/tests/asmjit_test_runner.cpp @@ -6,15 +6,15 @@ 
#include #if !defined(ASMJIT_NO_X86) -#include + #include #endif #if !defined(ASMJIT_NO_AARCH64) -#include + #include #endif -#include "asmjitutils.h" #include "broken.h" +#include "../commons/asmjitutils.h" #if !defined(ASMJIT_NO_COMPILER) #include diff --git a/test/asmjit_test_unicompiler.cpp b/testing/tests/asmjit_test_unicompiler.cpp similarity index 92% rename from test/asmjit_test_unicompiler.cpp rename to testing/tests/asmjit_test_unicompiler.cpp index ba45f3e..501332f 100644 --- a/test/asmjit_test_unicompiler.cpp +++ b/testing/tests/asmjit_test_unicompiler.cpp @@ -9,8 +9,8 @@ #include #include -#include "asmjitutils.h" -#include "asmjit_test_random.h" +#include "../commons/asmjitutils.h" +#include "../commons/random.h" static void print_app_info() noexcept { printf("AsmJit UniCompiler Test Suite v%u.%u.%u [Arch=%s] [Mode=%s]\n\n", @@ -50,6 +50,7 @@ float fadd(float a, float b) noexcept; float fsub(float a, float b) noexcept; float fmul(float a, float b) noexcept; float fdiv(float a, float b) noexcept; +float fsqrt(float a) noexcept; float fmadd_nofma_ref(float a, float b, float c) noexcept; float fmadd_fma_ref(float a, float b, float c) noexcept; @@ -57,6 +58,7 @@ double fadd(double a, double b) noexcept; double fsub(double a, double b) noexcept; double fmul(double a, double b) noexcept; double fdiv(double a, double b) noexcept; +double fsqrt(double a) noexcept; double fmadd_nofma_ref(double a, double b, double c) noexcept; double fmadd_fma_ref(double a, double b, double c) noexcept; @@ -68,6 +70,7 @@ static ASMJIT_NOINLINE float fadd(float a, float b) noexcept { return a + b; } static ASMJIT_NOINLINE float fsub(float a, float b) noexcept { return a - b; } static ASMJIT_NOINLINE float fmul(float a, float b) noexcept { return a * b; } static ASMJIT_NOINLINE float fdiv(float a, float b) noexcept { return a / b; } +static ASMJIT_NOINLINE float fsqrt(float a) noexcept { return std::sqrt(a); } static ASMJIT_NOINLINE float fmadd_nofma_ref(float a, float b, float c) noexcept { return a * b + c; } static ASMJIT_NOINLINE float fmadd_fma_ref(float a, float b, float c) noexcept { return std::fma(a, b, c); } @@ -75,11 +78,18 @@ static ASMJIT_NOINLINE double fadd(double a, double b) noexcept { return a + b; static ASMJIT_NOINLINE double fsub(double a, double b) noexcept { return a - b; } static ASMJIT_NOINLINE double fmul(double a, double b) noexcept { return a * b; } static ASMJIT_NOINLINE double fdiv(double a, double b) noexcept { return a / b; } +static ASMJIT_NOINLINE double fsqrt(double a) noexcept { return std::sqrt(a); } static ASMJIT_NOINLINE double fmadd_nofma_ref(double a, double b, double c) noexcept { return fadd(fmul(a, b), c); } static ASMJIT_NOINLINE double fmadd_fma_ref(double a, double b, double c) noexcept { return std::fma(a, b, c); } #endif +static ASMJIT_INLINE float fsign(float a) noexcept { return Support::bit_cast(Support::bit_cast(a) & (uint32_t(1) << 31)); } +static ASMJIT_INLINE double fsign(double a) noexcept { return Support::bit_cast(Support::bit_cast(a) & (uint64_t(1) << 63)); } + +static ASMJIT_INLINE float fxor(float a, float b) noexcept { return Support::bit_cast(Support::bit_cast(a) ^ Support::bit_cast(b)); } +static ASMJIT_INLINE double fxor(double a, double b) noexcept { return Support::bit_cast(Support::bit_cast(a) ^ Support::bit_cast(b)); } + // ujit::UniCompiler - Tests - Types // ================================= @@ -115,14 +125,14 @@ typedef void (*TestVVVVFunc)(void* dst, const void* src1, const void* src2, cons // ujit::UniCompiler - Tests - JIT 
Context Error Handler // ===================================================== -class TestErrorHandler : public asmjit::ErrorHandler { +class TestErrorHandler : public ErrorHandler { public: TestErrorHandler() noexcept {} ~TestErrorHandler() noexcept override {} - void handle_error(asmjit::Error err, const char* message, asmjit::BaseEmitter* origin) override { + void handle_error(Error err, const char* message, BaseEmitter* origin) override { Support::maybe_unused(origin); - EXPECT_EQ(err, asmjit::Error::kOk) + EXPECT_EQ(err, Error::kOk) .message("AsmJit Error: %s", message); } }; @@ -131,16 +141,16 @@ public: class JitContext { public: - asmjit::JitRuntime rt; - asmjit::CpuFeatures features {}; - UniOptFlags opt_flags {}; + JitRuntime rt; + CpuFeatures features {}; + CpuHints cpu_hints {}; #if !defined(ASMJIT_NO_LOGGING) - asmjit::StringLogger logger; + StringLogger logger; #endif // !ASMJIT_NO_LOGGING TestErrorHandler eh; - asmjit::CodeHolder code; + CodeHolder code; BackendCompiler cc; void prepare() noexcept { @@ -154,16 +164,16 @@ public: #endif // !ASMJIT_NO_LOGGING code.attach(&cc); - cc.add_diagnostic_options(asmjit::DiagnosticOptions::kRAAnnotate); - cc.add_diagnostic_options(asmjit::DiagnosticOptions::kValidateAssembler); - cc.add_diagnostic_options(asmjit::DiagnosticOptions::kValidateIntermediate); + cc.add_diagnostic_options(DiagnosticOptions::kRAAnnotate); + cc.add_diagnostic_options(DiagnosticOptions::kValidateAssembler); + cc.add_diagnostic_options(DiagnosticOptions::kValidateIntermediate); } template Fn finish() noexcept { Fn fn; - EXPECT_EQ(cc.finalize(), asmjit::Error::kOk); - EXPECT_EQ(rt.add(&fn, &code), asmjit::Error::kOk); + EXPECT_EQ(cc.finalize(), Error::kOk); + EXPECT_EQ(rt.add(&fn, &code), Error::kOk); code.reset(); return fn; } @@ -180,9 +190,9 @@ public: static TestCondRRFunc create_func_cond_rr(JitContext& ctx, UniOpCond op, CondCode cond_code, uint32_t variation) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(VecWidth::k128); @@ -200,7 +210,7 @@ static TestCondRRFunc create_func_cond_rr(JitContext& ctx, UniOpCond op, CondCod // Test a conditional branch based on the given condition. 
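The JitContext refactor above drops the explicit asmjit:: qualification and swaps UniOptFlags for CpuHints, but the lifecycle stays the same: attach a Compiler to a CodeHolder, install an ErrorHandler that turns any emitted error into a test failure, then finalize and hand the code to the JitRuntime. A hedged sketch of that lifecycle, using only calls visible in this hunk (handle_error, finalize, rt.add, code.reset) and assuming the snake_case asmjit API used throughout this diff:

// Sketch of the error-handling + finish() pattern used by JitContext above.
// Assumes asmjit's snake_case API as shown in the diff; not a drop-in copy.
#include <asmjit/core.h>
#include <cstdio>
#include <cstdlib>

using namespace asmjit;

class FailFastErrorHandler : public ErrorHandler {
public:
  // Any error reported by an emitter aborts the run immediately.
  void handle_error(Error err, const char* message, BaseEmitter* origin) override {
    (void)err;
    (void)origin;
    std::fprintf(stderr, "AsmJit error: %s\n", message);
    std::abort();
  }
};

// Fn is expected to be a plain function-pointer type; CompilerT is the
// backend compiler (x86/a64), kept generic so the sketch stays portable.
template<typename Fn, typename CompilerT>
Fn finish(JitRuntime& rt, CodeHolder& code, CompilerT& cc) noexcept {
  Fn fn = nullptr;
  // finalize() serializes the compiled function into the CodeHolder,
  // rt.add() copies it into executable memory and returns a callable pointer.
  if (cc.finalize() != Error::kOk) return nullptr;
  if (rt.add(&fn, &code) != Error::kOk) return nullptr;
  code.reset(); // The CodeHolder can be reused for the next test function.
  return fn;
}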
Label done = pc.new_label(); pc.mov(result, 1); - pc.j(done, Condition(op, cond_code, a, b)); + pc.j(done, UniCondition(op, cond_code, a, b)); pc.mov(result, 0); pc.bind(done); break; @@ -211,7 +221,7 @@ static TestCondRRFunc create_func_cond_rr(JitContext& ctx, UniOpCond op, CondCod Gp true_value = pc.new_gp32("true_value"); pc.mov(result, 0); pc.mov(true_value, 1); - pc.cmov(result, true_value, Condition(op, cond_code, a, b)); + pc.cmov(result, true_value, UniCondition(op, cond_code, a, b)); break; } @@ -221,7 +231,7 @@ static TestCondRRFunc create_func_cond_rr(JitContext& ctx, UniOpCond op, CondCod Gp true_value = pc.new_gp32("true_value"); pc.mov(false_value, 0); pc.mov(true_value, 1); - pc.select(result, true_value, false_value, Condition(op, cond_code, a, b)); + pc.select(result, true_value, false_value, UniCondition(op, cond_code, a, b)); break; } } @@ -234,9 +244,9 @@ static TestCondRRFunc create_func_cond_rr(JitContext& ctx, UniOpCond op, CondCod static TestCondRIFunc create_func_cond_ri(JitContext& ctx, UniOpCond op, CondCode cond_code, Imm bImm) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(VecWidth::k128); @@ -248,7 +258,7 @@ static TestCondRIFunc create_func_cond_ri(JitContext& ctx, UniOpCond op, CondCod node->set_arg(0, a); pc.mov(result, 1); - pc.j(done, Condition(op, cond_code, a, bImm)); + pc.j(done, UniCondition(op, cond_code, a, bImm)); pc.mov(result, 0); pc.bind(done); ctx.cc.ret(result); @@ -552,9 +562,9 @@ static ASMJIT_NOINLINE void test_cond_ops(JitContext& ctx) noexcept { static TestMFunc create_func_m(JitContext& ctx, UniOpM op) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(VecWidth::k128); @@ -619,9 +629,9 @@ static ASMJIT_NOINLINE void test_m_ops(JitContext& ctx) noexcept { static TestRMFunc create_func_rm(JitContext& ctx, UniOpRM op) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(VecWidth::k128); @@ -735,9 +745,9 @@ static ASMJIT_NOINLINE void test_rm_ops(JitContext& ctx) noexcept { static TestMRFunc create_func_mr(JitContext& ctx, UniOpMR op) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(VecWidth::k128); @@ -849,9 +859,9 @@ static ASMJIT_NOINLINE void test_mr_ops(JitContext& ctx) noexcept { static TestRRFunc create_func_rr(JitContext& ctx, UniOpRR op) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = 
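create_func_cond_rr() exercises the same UniCondition through three materialization strategies: a conditional branch, a conditional move, and a select. In plain C++ the three variations compute the identical result; a small reference sketch with a hypothetical "a < b" condition (names are illustrative, not part of the suite):

// Reference semantics of the three variations tested above.
#include <cstdint>
#include <cstdio>

static uint32_t via_branch(uint32_t a, uint32_t b) {
  uint32_t result = 1;            // pc.mov(result, 1)
  if (a < b) goto done;           // pc.j(done, UniCondition(...))
  result = 0;                     // pc.mov(result, 0)
done:
  return result;                  // pc.bind(done); ret(result)
}

static uint32_t via_cmov(uint32_t a, uint32_t b) {
  uint32_t result = 0;            // pc.mov(result, 0)
  uint32_t true_value = 1;        // pc.mov(true_value, 1)
  if (a < b) result = true_value; // pc.cmov(result, true_value, UniCondition(...))
  return result;
}

static uint32_t via_select(uint32_t a, uint32_t b) {
  uint32_t false_value = 0;
  uint32_t true_value = 1;
  return (a < b) ? true_value : false_value; // pc.select(result, true_value, false_value, ...)
}

int main() {
  std::printf("%u %u %u\n", via_branch(1, 2), via_cmov(1, 2), via_select(2, 1)); // prints: 1 1 0
  return 0;
}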
ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(VecWidth::k128); @@ -929,9 +939,9 @@ static ASMJIT_NOINLINE void test_rr_ops(JitContext& ctx) noexcept { static TestRRRFunc create_func_rrr(JitContext& ctx, UniOpRRR op) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(VecWidth::k128); @@ -953,9 +963,9 @@ static TestRRRFunc create_func_rrr(JitContext& ctx, UniOpRRR op) noexcept { static TestRRIFunc create_func_rri(JitContext& ctx, UniOpRRR op, Imm bImm) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(VecWidth::k128); @@ -1161,9 +1171,9 @@ static constexpr uint32_t kNumVariationsVV_Broadcast = 4; static TestVVFunc create_func_vv(JitContext& ctx, VecWidth vw, UniOpVV op, Variation variation = Variation{0}) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(vw); @@ -1175,7 +1185,7 @@ static TestVVFunc create_func_vv(JitContext& ctx, VecWidth vw, UniOpVV op, Varia node->set_arg(0, dst_ptr); node->set_arg(1, src_ptr); - Vec dst_vec = pc.new_vec(vw, "dst_vec"); + Vec dst_vec = pc.new_vec_with_width(vw, "dst_vec"); // There are some instructions that fill the high part of the register, so just zero the destination to make // sure that we can test this function (that the low part is actually zeroed and doesn't contain garbage). @@ -1233,7 +1243,7 @@ static TestVVFunc create_func_vv(JitContext& ctx, VecWidth vw, UniOpVV op, Varia pc.emit_2v(op, dst_vec, dst_vec); } else { - Vec src_vec = pc.new_vec(vw, "src_vec"); + Vec src_vec = pc.new_vec_with_width(vw, "src_vec"); pc.v_loaduvec(src_vec, mem_ptr(src_ptr)); pc.emit_2v(op, dst_vec, src_vec); } @@ -1252,9 +1262,9 @@ static constexpr uint32_t kNumVariationsVVI = 3; static TestVVFunc create_func_vvi(JitContext& ctx, VecWidth vw, UniOpVVI op, uint32_t imm, Variation variation = Variation{0}) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(vw); @@ -1266,14 +1276,14 @@ static TestVVFunc create_func_vvi(JitContext& ctx, VecWidth vw, UniOpVVI op, uin node->set_arg(0, dst_ptr); node->set_arg(1, src_ptr); - Vec src_vec = pc.new_vec(vw, "src_vec"); + Vec src_vec = pc.new_vec_with_width(vw, "src_vec"); switch (variation.value) { default: case 0: { // There are some instructions that fill the high part of the register, so just zero the destination to make // sure that we can test this function (that the low part is actually zeroed and doesn't contain garbage). 
- Vec dst_vec = pc.new_vec(vw, "dst_vec"); + Vec dst_vec = pc.new_vec_with_width(vw, "dst_vec"); pc.v_zero_i(dst_vec); pc.v_loaduvec(src_vec, mem_ptr(src_ptr)); @@ -1291,7 +1301,7 @@ static TestVVFunc create_func_vvi(JitContext& ctx, VecWidth vw, UniOpVVI op, uin } case 2: { - Vec dst_vec = pc.new_vec(vw, "dst_vec"); + Vec dst_vec = pc.new_vec_with_width(vw, "dst_vec"); pc.emit_2vi(op, dst_vec, mem_ptr(src_ptr), imm); pc.v_storeuvec(mem_ptr(dst_ptr), dst_vec); break; @@ -1312,9 +1322,9 @@ static constexpr uint32_t kNumVariationsVVV = 5; static TestVVVFunc create_func_vvv(JitContext& ctx, VecWidth vw, UniOpVVV op, Variation variation = Variation{0}) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(vw); @@ -1328,15 +1338,15 @@ static TestVVVFunc create_func_vvv(JitContext& ctx, VecWidth vw, UniOpVVV op, Va node->set_arg(1, src1_ptr); node->set_arg(2, src2_ptr); - Vec src1_vec = pc.new_vec(vw, "src1_vec"); - Vec src2_vec = pc.new_vec(vw, "src2_vec"); + Vec src1_vec = pc.new_vec_with_width(vw, "src1_vec"); + Vec src2_vec = pc.new_vec_with_width(vw, "src2_vec"); switch (variation.value) { default: case 0: { // There are some instructions that fill the high part of the register, so just zero the destination to make // sure that we can test this function (that the low part is actually zeroed and doesn't contain garbage). - Vec dst_vec = pc.new_vec(vw, "dst_vec"); + Vec dst_vec = pc.new_vec_with_width(vw, "dst_vec"); pc.v_zero_i(dst_vec); pc.v_loaduvec(src1_vec, mem_ptr(src1_ptr)); @@ -1366,7 +1376,7 @@ static TestVVVFunc create_func_vvv(JitContext& ctx, VecWidth vw, UniOpVVV op, Va } case 3: { - Vec dst_vec = pc.new_vec(vw, "dst_vec"); + Vec dst_vec = pc.new_vec_with_width(vw, "dst_vec"); pc.v_zero_i(dst_vec); pc.v_loaduvec(src1_vec, mem_ptr(src1_ptr)); @@ -1393,9 +1403,9 @@ static constexpr uint32_t kNumVariationsVVVI = 5; static TestVVVFunc create_func_vvvi(JitContext& ctx, VecWidth vw, UniOpVVVI op, uint32_t imm, Variation variation = Variation{0}) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(vw); @@ -1409,15 +1419,15 @@ static TestVVVFunc create_func_vvvi(JitContext& ctx, VecWidth vw, UniOpVVVI op, node->set_arg(1, src1_ptr); node->set_arg(2, src2_ptr); - Vec src1_vec = pc.new_vec(vw, "src1_vec"); - Vec src2_vec = pc.new_vec(vw, "src2_vec"); + Vec src1_vec = pc.new_vec_with_width(vw, "src1_vec"); + Vec src2_vec = pc.new_vec_with_width(vw, "src2_vec"); switch (variation.value) { default: case 0: { // There are some instructions that fill the high part of the register, so just zero the destination to make // sure that we can test this function (that the low part is actually zeroed and doesn't contain garbage). 
- Vec dst_vec = pc.new_vec(vw, "dst_vec"); + Vec dst_vec = pc.new_vec_with_width(vw, "dst_vec"); pc.v_zero_i(dst_vec); pc.v_loaduvec(src1_vec, mem_ptr(src1_ptr)); @@ -1447,7 +1457,7 @@ static TestVVVFunc create_func_vvvi(JitContext& ctx, VecWidth vw, UniOpVVVI op, } case 3: { - Vec dst_vec = pc.new_vec(vw, "dst_vec"); + Vec dst_vec = pc.new_vec_with_width(vw, "dst_vec"); pc.v_zero_i(dst_vec); pc.v_loaduvec(src1_vec, mem_ptr(src1_ptr)); @@ -1479,9 +1489,9 @@ static constexpr uint32_t kNumVariationsVVVV = 4; static TestVVVVFunc create_func_vvvv(JitContext& ctx, VecWidth vw, UniOpVVVV op, Variation variation = Variation{0}) noexcept { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); - asmjit::FuncNode* node = ctx.cc.new_func(asmjit::FuncSignature::build()); + FuncNode* node = ctx.cc.new_func(FuncSignature::build()); EXPECT_NOT_NULL(node); pc.init_vec_width(vw); @@ -1497,9 +1507,9 @@ static TestVVVVFunc create_func_vvvv(JitContext& ctx, VecWidth vw, UniOpVVVV op, node->set_arg(2, src2_ptr); node->set_arg(3, src3_ptr); - Vec src1_vec = pc.new_vec(vw, "src1_vec"); - Vec src2_vec = pc.new_vec(vw, "src2_vec"); - Vec src3_vec = pc.new_vec(vw, "src3_vec"); + Vec src1_vec = pc.new_vec_with_width(vw, "src1_vec"); + Vec src2_vec = pc.new_vec_with_width(vw, "src2_vec"); + Vec src3_vec = pc.new_vec_with_width(vw, "src3_vec"); pc.v_loaduvec(src1_vec, mem_ptr(src1_ptr)); pc.v_loaduvec(src2_vec, mem_ptr(src2_ptr)); @@ -1510,7 +1520,7 @@ static TestVVVVFunc create_func_vvvv(JitContext& ctx, VecWidth vw, UniOpVVVV op, case 0: { // There are some instructions that fill the high part of the register, so just zero the destination to make // sure that we can test this function (that the low part is actually zeroed and doesn't contain garbage). 
- Vec dst_vec = pc.new_vec(vw, "dst_vec"); + Vec dst_vec = pc.new_vec_with_width(vw, "dst_vec"); pc.v_zero_i(dst_vec); pc.emit_4v(op, dst_vec, src1_vec, src2_vec, src3_vec); @@ -1847,8 +1857,14 @@ static const char* vec_op_name_vv(UniOpVV op) noexcept { case UniOpVV::kCvtI32HiToI64 : return "v_cvt_i32_hi_to_i64"; case UniOpVV::kCvtU32LoToU64 : return "v_cvt_u32_lo_to_u64"; case UniOpVV::kCvtU32HiToU64 : return "v_cvt_u32_hi_to_u64"; + case UniOpVV::kAbsF32S : return "s_abs_f32"; + case UniOpVV::kAbsF64S : return "s_abs_f64"; case UniOpVV::kAbsF32 : return "v_abs_f32"; case UniOpVV::kAbsF64 : return "v_abs_f64"; + case UniOpVV::kNegF32S : return "s_neg_f32"; + case UniOpVV::kNegF64S : return "s_neg_f64"; + case UniOpVV::kNegF32 : return "v_neg_f32"; + case UniOpVV::kNegF64 : return "v_neg_f64"; case UniOpVV::kNotF32 : return "v_not_f32"; case UniOpVV::kNotF64 : return "v_not_f64"; case UniOpVV::kTruncF32S : return "v_trunc_f32s"; @@ -1863,10 +1879,18 @@ static const char* vec_op_name_vv(UniOpVV op) noexcept { case UniOpVV::kCeilF64S : return "v_ceil_f64s"; case UniOpVV::kCeilF32 : return "v_ceil_f32"; case UniOpVV::kCeilF64 : return "v_ceil_f64"; - case UniOpVV::kRoundF32S : return "v_round_f32s"; - case UniOpVV::kRoundF64S : return "v_round_f64s"; - case UniOpVV::kRoundF32 : return "v_round_f32"; - case UniOpVV::kRoundF64 : return "v_round_f64"; + case UniOpVV::kRoundEvenF32S : return "v_round_even_f32s"; + case UniOpVV::kRoundEvenF64S : return "v_round_even_f64s"; + case UniOpVV::kRoundEvenF32 : return "v_round_even_f32"; + case UniOpVV::kRoundEvenF64 : return "v_round_even_f64"; + case UniOpVV::kRoundHalfAwayF32S : return "v_round_half_away_f32s"; + case UniOpVV::kRoundHalfAwayF64S : return "v_round_half_away_f64s"; + case UniOpVV::kRoundHalfAwayF32 : return "v_round_half_away_f32"; + case UniOpVV::kRoundHalfAwayF64 : return "v_round_half_away_f64"; + case UniOpVV::kRoundHalfUpF32S : return "v_round_half_up_f32s"; + case UniOpVV::kRoundHalfUpF64S : return "v_round_half_up_f64s"; + case UniOpVV::kRoundHalfUpF32 : return "v_round_half_up_f32"; + case UniOpVV::kRoundHalfUpF64 : return "v_round_half_up_f64"; case UniOpVV::kRcpF32 : return "v_rcp_f32"; case UniOpVV::kRcpF64 : return "v_rcp_f64"; case UniOpVV::kSqrtF32S : return "v_sqrt_f32s"; @@ -1888,10 +1912,9 @@ static const char* vec_op_name_vv(UniOpVV op) noexcept { case UniOpVV::kCvtRoundF32ToI32 : return "v_cvt_round_f32_to_i32"; case UniOpVV::kCvtRoundF64ToI32Lo: return "v_cvt_round_f64_to_i32_lo"; case UniOpVV::kCvtRoundF64ToI32Hi: return "v_cvt_round_f64_to_i32_hi"; - - default: - ASMJIT_NOT_REACHED(); } + + ASMJIT_NOT_REACHED(); } static VecOpInfo vec_op_info_vv(UniOpVV op) noexcept { @@ -1936,8 +1959,14 @@ static VecOpInfo vec_op_info_vv(UniOpVV op) noexcept { case UniOpVV::kCvtI32HiToI64 : return VecOpInfo::make(VE::kInt64, VE::kInt32); case UniOpVV::kCvtU32LoToU64 : return VecOpInfo::make(VE::kUInt64, VE::kUInt32); case UniOpVV::kCvtU32HiToU64 : return VecOpInfo::make(VE::kUInt64, VE::kUInt32); + case UniOpVV::kAbsF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); + case UniOpVV::kAbsF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); case UniOpVV::kAbsF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); case UniOpVV::kAbsF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); + case UniOpVV::kNegF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); + case UniOpVV::kNegF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); + case UniOpVV::kNegF32 : return VecOpInfo::make(VE::kFloat32, 
VE::kFloat32); + case UniOpVV::kNegF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); case UniOpVV::kNotF32 : return VecOpInfo::make(VE::kUInt32, VE::kUInt32); case UniOpVV::kNotF64 : return VecOpInfo::make(VE::kUInt64, VE::kUInt64); case UniOpVV::kTruncF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); @@ -1952,10 +1981,18 @@ static VecOpInfo vec_op_info_vv(UniOpVV op) noexcept { case UniOpVV::kCeilF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); case UniOpVV::kCeilF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); case UniOpVV::kCeilF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); - case UniOpVV::kRoundF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); - case UniOpVV::kRoundF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); - case UniOpVV::kRoundF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); - case UniOpVV::kRoundF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); + case UniOpVV::kRoundEvenF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); + case UniOpVV::kRoundEvenF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); + case UniOpVV::kRoundEvenF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); + case UniOpVV::kRoundEvenF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); + case UniOpVV::kRoundHalfAwayF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); + case UniOpVV::kRoundHalfAwayF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); + case UniOpVV::kRoundHalfAwayF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); + case UniOpVV::kRoundHalfAwayF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); + case UniOpVV::kRoundHalfUpF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); + case UniOpVV::kRoundHalfUpF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); + case UniOpVV::kRoundHalfUpF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); + case UniOpVV::kRoundHalfUpF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); case UniOpVV::kRcpF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); case UniOpVV::kRcpF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); case UniOpVV::kSqrtF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); @@ -1977,10 +2014,9 @@ static VecOpInfo vec_op_info_vv(UniOpVV op) noexcept { case UniOpVV::kCvtRoundF32ToI32 : return VecOpInfo::make(VE::kInt32, VE::kFloat32); case UniOpVV::kCvtRoundF64ToI32Lo: return VecOpInfo::make(VE::kInt32, VE::kFloat64); case UniOpVV::kCvtRoundF64ToI32Hi: return VecOpInfo::make(VE::kInt32, VE::kFloat64); - - default: - ASMJIT_NOT_REACHED(); } + + ASMJIT_NOT_REACHED(); } static const char* vec_op_name_vvi(UniOpVVI op) noexcept { @@ -2030,43 +2066,48 @@ static const char* vec_op_name_vvi(UniOpVVI op) noexcept { case UniOpVVI::kSrlnHiU32 : return "v_srln_hi_u32"; case UniOpVVI::kSrlnLoU64 : return "v_srln_lo_u64"; case UniOpVVI::kSrlnHiU64 : return "v_srln_hi_u64"; + case UniOpVVI::kSrlnRndLoU16 : return "v_srln_rnd_lo_u16"; + case UniOpVVI::kSrlnRndHiU16 : return "v_srln_rnd_hi_u16"; + case UniOpVVI::kSrlnRndLoU32 : return "v_srln_rnd_lo_u32"; + case UniOpVVI::kSrlnRndHiU32 : return "v_srln_rnd_hi_u32"; + case UniOpVVI::kSrlnRndLoU64 : return "v_srln_rnd_lo_u64"; + case UniOpVVI::kSrlnRndHiU64 : return "v_srln_rnd_hi_u64"; #endif // ASMJIT_UJIT_AARCH64 - - default: - ASMJIT_NOT_REACHED(); } + + ASMJIT_NOT_REACHED(); } static VecOpInfo vec_op_info_vvi(UniOpVVI op) noexcept { using VE = VecElementType; switch (op) { - case UniOpVVI::kSllU16 : return VecOpInfo::make(VE::kUInt16, 
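A pattern change worth noting in these lookup tables: the "default: ASMJIT_NOT_REACHED();" arm is removed and the unreachable marker now sits after the switch. With no default and a case for every enumerator, the compiler can warn when a newly added UniOpVV/UniOpVVI value is missing a case, which is presumably the motivation. A minimal illustration of the shape (generic placeholder names, not the real enums):

// Illustration of the "no default, unreachable after the switch" shape used above.
// MyOp, op_name() and not_reached() are placeholders, not AsmJit symbols.
#include <cstdlib>

enum class MyOp { kAdd, kSub };

[[noreturn]] static void not_reached() { std::abort(); } // stand-in for ASMJIT_NOT_REACHED()

static const char* op_name(MyOp op) {
  switch (op) {
    case MyOp::kAdd: return "add";
    case MyOp::kSub: return "sub";
    // No default: -Wswitch (GCC/Clang) now flags any enumerator added later
    // that is not handled here.
  }
  not_reached(); // Only reachable if 'op' holds an out-of-range value.
}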
VE::kUInt16); - case UniOpVVI::kSllU32 : return VecOpInfo::make(VE::kUInt32, VE::kUInt32); - case UniOpVVI::kSllU64 : return VecOpInfo::make(VE::kUInt64, VE::kUInt64); - case UniOpVVI::kSrlU16 : return VecOpInfo::make(VE::kUInt16, VE::kUInt16); - case UniOpVVI::kSrlU32 : return VecOpInfo::make(VE::kUInt32, VE::kUInt32); - case UniOpVVI::kSrlU64 : return VecOpInfo::make(VE::kUInt64, VE::kUInt64); - case UniOpVVI::kSraI16 : return VecOpInfo::make(VE::kInt16, VE::kInt16); - case UniOpVVI::kSraI32 : return VecOpInfo::make(VE::kInt32, VE::kInt32); - case UniOpVVI::kSraI64 : return VecOpInfo::make(VE::kInt64, VE::kInt64); - case UniOpVVI::kSllbU128 : return VecOpInfo::make(VE::kUInt8, VE::kUInt8); - case UniOpVVI::kSrlbU128 : return VecOpInfo::make(VE::kUInt8, VE::kUInt8); - case UniOpVVI::kSwizzleU16x4 : return VecOpInfo::make(VE::kUInt16, VE::kUInt16); - case UniOpVVI::kSwizzleLoU16x4 : return VecOpInfo::make(VE::kUInt16, VE::kUInt16); - case UniOpVVI::kSwizzleHiU16x4 : return VecOpInfo::make(VE::kUInt16, VE::kUInt16); - case UniOpVVI::kSwizzleU32x4 : return VecOpInfo::make(VE::kUInt32, VE::kUInt32); - case UniOpVVI::kSwizzleU64x2 : return VecOpInfo::make(VE::kUInt64, VE::kUInt64); + case UniOpVVI::kSllU16 : return VecOpInfo::make(VE::kUInt16 , VE::kUInt16); + case UniOpVVI::kSllU32 : return VecOpInfo::make(VE::kUInt32 , VE::kUInt32); + case UniOpVVI::kSllU64 : return VecOpInfo::make(VE::kUInt64 , VE::kUInt64); + case UniOpVVI::kSrlU16 : return VecOpInfo::make(VE::kUInt16 , VE::kUInt16); + case UniOpVVI::kSrlU32 : return VecOpInfo::make(VE::kUInt32 , VE::kUInt32); + case UniOpVVI::kSrlU64 : return VecOpInfo::make(VE::kUInt64 , VE::kUInt64); + case UniOpVVI::kSraI16 : return VecOpInfo::make(VE::kInt16 , VE::kInt16); + case UniOpVVI::kSraI32 : return VecOpInfo::make(VE::kInt32 , VE::kInt32); + case UniOpVVI::kSraI64 : return VecOpInfo::make(VE::kInt64 , VE::kInt64); + case UniOpVVI::kSllbU128 : return VecOpInfo::make(VE::kUInt8 , VE::kUInt8); + case UniOpVVI::kSrlbU128 : return VecOpInfo::make(VE::kUInt8 , VE::kUInt8); + case UniOpVVI::kSwizzleU16x4 : return VecOpInfo::make(VE::kUInt16 , VE::kUInt16); + case UniOpVVI::kSwizzleLoU16x4 : return VecOpInfo::make(VE::kUInt16 , VE::kUInt16); + case UniOpVVI::kSwizzleHiU16x4 : return VecOpInfo::make(VE::kUInt16 , VE::kUInt16); + case UniOpVVI::kSwizzleU32x4 : return VecOpInfo::make(VE::kUInt32 , VE::kUInt32); + case UniOpVVI::kSwizzleU64x2 : return VecOpInfo::make(VE::kUInt64 , VE::kUInt64); case UniOpVVI::kSwizzleF32x4 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32); case UniOpVVI::kSwizzleF64x2 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); - case UniOpVVI::kSwizzleU64x4 : return VecOpInfo::make(VE::kUInt64, VE::kUInt64); + case UniOpVVI::kSwizzleU64x4 : return VecOpInfo::make(VE::kUInt64 , VE::kUInt64); case UniOpVVI::kSwizzleF64x4 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64); - case UniOpVVI::kExtractV128_I32: return VecOpInfo::make(VE::kInt32, VE::kInt32); - case UniOpVVI::kExtractV128_I64: return VecOpInfo::make(VE::kInt64, VE::kInt64); + case UniOpVVI::kExtractV128_I32: return VecOpInfo::make(VE::kInt32 , VE::kInt32); + case UniOpVVI::kExtractV128_I64: return VecOpInfo::make(VE::kInt64 , VE::kInt64); case UniOpVVI::kExtractV128_F32: return VecOpInfo::make(VE::kFloat32, VE::kFloat32); case UniOpVVI::kExtractV128_F64: return VecOpInfo::make(VE::kFloat64, VE::kFloat64); - case UniOpVVI::kExtractV256_I32: return VecOpInfo::make(VE::kUInt32, VE::kUInt32); - case UniOpVVI::kExtractV256_I64: return VecOpInfo::make(VE::kUInt64, 
VE::kUInt64); + case UniOpVVI::kExtractV256_I32: return VecOpInfo::make(VE::kUInt32 , VE::kUInt32); + case UniOpVVI::kExtractV256_I64: return VecOpInfo::make(VE::kUInt64 , VE::kUInt64); case UniOpVVI::kExtractV256_F32: return VecOpInfo::make(VE::kFloat32, VE::kFloat32); case UniOpVVI::kExtractV256_F64: return VecOpInfo::make(VE::kFloat64, VE::kFloat64); @@ -2086,11 +2127,16 @@ static VecOpInfo vec_op_info_vvi(UniOpVVI op) noexcept { case UniOpVVI::kSrlnHiU32 : return VecOpInfo::make(VE::kUInt16, VE::kUInt16); case UniOpVVI::kSrlnLoU64 : return VecOpInfo::make(VE::kUInt32, VE::kUInt32); case UniOpVVI::kSrlnHiU64 : return VecOpInfo::make(VE::kUInt64, VE::kUInt64); + case UniOpVVI::kSrlnRndLoU16 : return VecOpInfo::make(VE::kUInt8 , VE::kUInt16); + case UniOpVVI::kSrlnRndHiU16 : return VecOpInfo::make(VE::kUInt8 , VE::kUInt16); + case UniOpVVI::kSrlnRndLoU32 : return VecOpInfo::make(VE::kUInt16, VE::kUInt32); + case UniOpVVI::kSrlnRndHiU32 : return VecOpInfo::make(VE::kUInt16, VE::kUInt32); + case UniOpVVI::kSrlnRndLoU64 : return VecOpInfo::make(VE::kUInt32, VE::kUInt64); + case UniOpVVI::kSrlnRndHiU64 : return VecOpInfo::make(VE::kUInt32, VE::kUInt64); #endif // ASMJIT_UJIT_AARCH64 - - default: - ASMJIT_NOT_REACHED(); } + + ASMJIT_NOT_REACHED(); } static const char* vec_op_name_vvv(UniOpVVV op) noexcept { @@ -2208,6 +2254,10 @@ static const char* vec_op_name_vvv(UniOpVVV op) noexcept { case UniOpVVV::kDivF64S : return "v_div_f64s"; case UniOpVVV::kDivF32 : return "v_div_f32"; case UniOpVVV::kDivF64 : return "v_div_f64"; + case UniOpVVV::kModF32S : return "v_mod_f32s"; + case UniOpVVV::kModF64S : return "v_mod_f64s"; + case UniOpVVV::kModF32 : return "v_mod_f32"; + case UniOpVVV::kModF64 : return "v_mod_f64"; case UniOpVVV::kMinF32S : return "v_min_f32s"; case UniOpVVV::kMinF64S : return "v_min_f64s"; case UniOpVVV::kMinF32 : return "v_min_f32"; @@ -2298,9 +2348,15 @@ static const char* vec_op_name_vvv(UniOpVVV op) noexcept { case UniOpVVV::kMAddwHiU32 : return "v_maddw_hi_u32"; #endif // ASMJIT_UJIT_AARCH64 - default: - ASMJIT_NOT_REACHED(); +#if defined(ASMJIT_UJIT_X86) + case UniOpVVV::kPermuteU8 : return "v_permute_u8"; + case UniOpVVV::kPermuteU16 : return "v_permute_u16"; + case UniOpVVV::kPermuteU32 : return "v_permute_u32"; + case UniOpVVV::kPermuteU64 : return "v_permute_u64"; +#endif // ASMJIT_UJIT_X86 } + + ASMJIT_NOT_REACHED(); } static VecOpInfo vec_op_info_vvv(UniOpVVV op) noexcept { @@ -2420,6 +2476,10 @@ static VecOpInfo vec_op_info_vvv(UniOpVVV op) noexcept { case UniOpVVV::kDivF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64, VE::kFloat64); case UniOpVVV::kDivF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32, VE::kFloat32); case UniOpVVV::kDivF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64, VE::kFloat64); + case UniOpVVV::kModF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32, VE::kFloat32); + case UniOpVVV::kModF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64, VE::kFloat64); + case UniOpVVV::kModF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32, VE::kFloat32); + case UniOpVVV::kModF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64, VE::kFloat64); case UniOpVVV::kMinF32S : return VecOpInfo::make(VE::kFloat32, VE::kFloat32, VE::kFloat32); case UniOpVVV::kMinF64S : return VecOpInfo::make(VE::kFloat64, VE::kFloat64, VE::kFloat64); case UniOpVVV::kMinF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32, VE::kFloat32); @@ -2510,9 +2570,15 @@ static VecOpInfo vec_op_info_vvv(UniOpVVV op) noexcept { case UniOpVVV::kMAddwHiU32 : 
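The new kSrlnRnd* entries narrow the element type (for example u16 to u8), which suggests a rounding shift-right-narrow in the AArch64 RSHRN style: add half of the discarded range before shifting so the result rounds to nearest instead of truncating. The scalar sketch below is an assumption about the intended semantics, not code taken from the suite:

// Scalar reference of a rounding shift-right-narrow (assumed semantics of the
// new kSrlnRnd* ops, based on their element types). Requires 1 <= n <= 8 here.
#include <cstdint>
#include <cstdio>

static uint8_t srln_rnd_u16(uint16_t x, unsigned n) {
  // Adding 2^(n-1) rounds the shifted-out bits to nearest; then shift and narrow.
  uint32_t rounded = (uint32_t(x) + (1u << (n - 1))) >> n;
  return uint8_t(rounded);
}

int main() {
  std::printf("%u %u\n", srln_rnd_u16(0x00FF, 4), srln_rnd_u16(0x00F7, 4)); // prints: 16 15
  return 0;
}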
return VecOpInfo::make(VE::kUInt64, VE::kUInt32, VE::kUInt32); #endif // ASMJIT_UJIT_AARCH64 - default: - ASMJIT_NOT_REACHED(); +#if defined(ASMJIT_UJIT_X86) + case UniOpVVV::kPermuteU8 : return VecOpInfo::make(VE::kUInt8, VE::kUInt8, VE::kUInt8); + case UniOpVVV::kPermuteU16 : return VecOpInfo::make(VE::kUInt16, VE::kUInt16, VE::kUInt16); + case UniOpVVV::kPermuteU32 : return VecOpInfo::make(VE::kUInt32, VE::kUInt32, VE::kUInt32); + case UniOpVVV::kPermuteU64 : return VecOpInfo::make(VE::kUInt64, VE::kUInt64, VE::kUInt64); +#endif // ASMJIT_UJIT_X86 } + + ASMJIT_NOT_REACHED(); } static const char* vec_op_name_vvvi(UniOpVVVI op) noexcept { @@ -2530,10 +2596,9 @@ static const char* vec_op_name_vvvi(UniOpVVVI op) noexcept { case UniOpVVVI::kInsertV256_F32 : return "v_insert_v256_f32"; case UniOpVVVI::kInsertV256_U64 : return "v_insert_v256_u64"; case UniOpVVVI::kInsertV256_F64 : return "v_insert_v256_f64"; - - default: - ASMJIT_NOT_REACHED(); } + + ASMJIT_NOT_REACHED(); } static VecOpInfo vec_op_info_vvvi(UniOpVVVI op) noexcept { @@ -2580,10 +2645,9 @@ static const char* vec_op_name_vvvv(UniOpVVVV op) noexcept { case UniOpVVVV::kNMSubF64S: return "v_nmsub_f64s"; case UniOpVVVV::kNMSubF32 : return "v_nmsub_f32"; case UniOpVVVV::kNMSubF64 : return "v_nmsub_f64"; - - default: - ASMJIT_NOT_REACHED(); } + + ASMJIT_NOT_REACHED(); } static VecOpInfo vec_op_info_vvvv(UniOpVVVV op) noexcept { @@ -2609,49 +2673,55 @@ static VecOpInfo vec_op_info_vvvv(UniOpVVVV op) noexcept { case UniOpVVVV::kNMSubF64S: return VecOpInfo::make(VE::kFloat64, VE::kFloat64, VE::kFloat64, VE::kFloat64); case UniOpVVVV::kNMSubF32 : return VecOpInfo::make(VE::kFloat32, VE::kFloat32, VE::kFloat32, VE::kFloat32); case UniOpVVVV::kNMSubF64 : return VecOpInfo::make(VE::kFloat64, VE::kFloat64, VE::kFloat64, VE::kFloat64); - - default: - ASMJIT_NOT_REACHED(); } + + ASMJIT_NOT_REACHED(); } // ujit::UniCompiler - Tests - SIMD - Float To Int - Machine Behavior // ================================================================== -#if defined(ASMJIT_UJIT_X86) -static ASMJIT_INLINE_NODEBUG int32_t cvt_non_finite_f32_to_i32([[maybe_unused]] float x) noexcept { return INT32_MIN; } -static ASMJIT_INLINE_NODEBUG int32_t cvt_non_finite_f64_to_i32([[maybe_unused]] double x) noexcept { return INT32_MIN; } -#endif // ASMJIT_UJIT_X86 +template +static ASMJIT_INLINE_NODEBUG int32_t cvt_float_to_int_trunc(const FloatT& x) noexcept { + constexpr IntT min_value = std::numeric_limits::lowest(); + constexpr IntT max_value = std::numeric_limits::max(); + constexpr IntT zero = IntT(0); -#if defined(ASMJIT_UJIT_AARCH64) -static constexpr int32_t kPInfToInt32 = INT32_MAX; -static constexpr int32_t kNInfToInt32 = INT32_MIN; -static constexpr int32_t kNaNToInt32 = 0; + if (std::isnan(x)) { + return behavior == FloatToIntOutsideRangeBehavior::kSmallestValue ? min_value : zero; + } -static ASMJIT_INLINE_NODEBUG int32_t cvt_non_finite_f32_to_i32(float x) noexcept { - if (x == std::numeric_limits::infinity()) { - return kPInfToInt32; + if (x < FloatT(min_value)) { + return min_value; } - else if (x == -std::numeric_limits::infinity()) { - return kNInfToInt32; - } - else { - return kNaNToInt32; + + if (x > FloatT(max_value)) { + return behavior == FloatToIntOutsideRangeBehavior::kSmallestValue ? 
min_value : max_value; } + + return IntT(x); } -static ASMJIT_INLINE_NODEBUG int32_t cvt_non_finite_f64_to_i32(double x) noexcept { - if (x == std::numeric_limits::infinity()) { - return kPInfToInt32; +template +static ASMJIT_INLINE_NODEBUG int32_t cvt_float_to_int_round(const FloatT& x) noexcept { + constexpr IntT min_value = std::numeric_limits::lowest(); + constexpr IntT max_value = std::numeric_limits::max(); + constexpr IntT zero = IntT(0); + + if (std::isnan(x)) { + return behavior == FloatToIntOutsideRangeBehavior::kSmallestValue ? min_value : zero; } - else if (x == -std::numeric_limits::infinity()) { - return kNInfToInt32; + + if (x < FloatT(min_value)) { + return min_value; } - else { - return kNaNToInt32; + + if (x > FloatT(max_value)) { + return behavior == FloatToIntOutsideRangeBehavior::kSmallestValue ? min_value : max_value; } + + return IntT(std::nearbyint(x)); } -#endif // ASMJIT_UJIT_AARCH64 // ujit::UniCompiler - Tests - SIMD - Data Generators & Constraints // ================================================================ @@ -2737,6 +2807,7 @@ public: case 66: return 0.1f; case 69: return 0.2f; case 79: return 0.3f; + case 89: return -13005961.0f; case 99: return -std::numeric_limits::infinity(); case 100: case 102: @@ -2799,6 +2870,7 @@ public: case 66: return 0.1; case 69: return 0.2; case 79: return 0.3; + case 80: return 4503599627370495.5; case 99: return -std::numeric_limits::infinity(); case 100: case 102: @@ -2813,6 +2885,7 @@ public: case 122: return 10.3; case 123: return 20.3; case 124: return -100.3; + case 125: return 4503599627370496.0; case 127: return 1.3; case 130: return std::numeric_limits::quiet_NaN(); case 135: return -std::numeric_limits::infinity(); @@ -2824,6 +2897,7 @@ public: case 165: return -0.5; case 175: return -1.0; case 245: return 2.5; + case 248: return -4503599627370495.5; default: { double sign = rng.next_uint32() < 0x7FFFFFF ? 1.0 : -1.0; @@ -2833,6 +2907,12 @@ public: } }; +template +struct half_minus_1ulp_const; + +template<> struct half_minus_1ulp_const { static inline constexpr float value = 0.49999997f; }; +template<> struct half_minus_1ulp_const { static inline constexpr double value = 0.49999999999999994; }; + // Some SIMD operations are constrained, especially those higher level. So, to successfully test these we // have to model the constraints in a way that the SIMD instruction we test actually gets the correct input. // Note that a constraint doesn't have to be always range based, it could be anything. 
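The new cvt_float_to_int_trunc()/cvt_float_to_int_round() templates above replace the per-architecture cvt_non_finite_* helpers with a single reference parameterized on FloatToIntOutsideRangeBehavior: kSmallestValue matches the x86 CVTTSS2SI/CVTTSD2SI "integer indefinite" result (INT32_MIN for NaN and out-of-range inputs), while the saturating behavior matches AArch64-style conversion (saturate, NaN to 0). A standalone restatement specialized to float to int32_t; the enum and function names here are local stand-ins, only the enumerator names follow the diff:

// Sketch of the truncating reference conversion, safe against the 2^31 corner
// (float(INT32_MAX) rounds up to 2^31, hence the >= comparison below).
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

enum class OutsideRange { kSmallestValue, kSaturatedValue };

template<OutsideRange behavior>
static int32_t cvt_trunc_f32_to_i32(float x) {
  constexpr int32_t min_value = std::numeric_limits<int32_t>::lowest();
  constexpr int32_t max_value = std::numeric_limits<int32_t>::max();

  if (std::isnan(x))
    return behavior == OutsideRange::kSmallestValue ? min_value : 0;
  if (x >= float(max_value))
    return behavior == OutsideRange::kSmallestValue ? min_value : max_value;
  if (x < float(min_value))
    return min_value;
  return int32_t(x); // In-range: plain truncation.
}

int main() {
  float inf = std::numeric_limits<float>::infinity();
  std::printf("%d %d\n",
              cvt_trunc_f32_to_i32<OutsideRange::kSmallestValue>(inf),   // -2147483648
              cvt_trunc_f32_to_i32<OutsideRange::kSaturatedValue>(inf)); //  2147483647
  return 0;
}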
@@ -3326,16 +3406,24 @@ template struct vec_op_ceil : public op_each_vv> { static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::ceil(a); } }; -template struct vec_op_round : public op_each_vv> { +template struct vec_op_round_even : public op_each_vv> { static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::nearbyint(a); } }; +template struct vec_op_round_half_away : public op_each_vv> { + static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::trunc(fadd(a, fxor(half_minus_1ulp_const::value, fsign(a)))); } +}; + +template struct vec_op_round_half_up : public op_each_vv> { + static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::floor(fadd(a, half_minus_1ulp_const::value)); } +}; + template struct vec_op_sqrt : public op_each_vv> { - static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::sqrt(a); } + static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return fsqrt(a); } }; template struct vec_op_rcp : public op_each_vv> { - static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return T(1) / a; } + static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return fdiv(T(1), a); } }; struct vec_op_cvt_i32_to_f32 { @@ -3403,130 +3491,74 @@ struct vec_op_cvt_i32_to_f64_impl { struct vec_op_cvt_i32_lo_to_f64 : public vec_op_cvt_i32_to_f64_impl {}; struct vec_op_cvt_i32_hi_to_f64 : public vec_op_cvt_i32_to_f64_impl {}; +template struct vec_op_cvt_trunc_f32_to_i32 { - static ASMJIT_INLINE int32_t cvt(float val) noexcept { - if (!std::isfinite(val)) - return cvt_non_finite_f32_to_i32(val); - - if (val <= float(INT32_MIN)) { - return INT32_MIN; - } - else if (val >= float(INT32_MAX)) { - return INT32_MAX; - } - else { - return int32_t(val); - } - } - template static ASMJIT_INLINE VecOverlay apply(const VecOverlay& a) noexcept { VecOverlay out{}; for (uint32_t off = 0; off < kW; off += 16) { - out.data_i32[off / 4 + 0] = cvt(a.data_f32[off / 4 + 0]); - out.data_i32[off / 4 + 1] = cvt(a.data_f32[off / 4 + 1]); - out.data_i32[off / 4 + 2] = cvt(a.data_f32[off / 4 + 2]); - out.data_i32[off / 4 + 3] = cvt(a.data_f32[off / 4 + 3]); + out.data_i32[off / 4 + 0] = cvt_float_to_int_trunc(a.data_f32[off / 4 + 0]); + out.data_i32[off / 4 + 1] = cvt_float_to_int_trunc(a.data_f32[off / 4 + 1]); + out.data_i32[off / 4 + 2] = cvt_float_to_int_trunc(a.data_f32[off / 4 + 2]); + out.data_i32[off / 4 + 3] = cvt_float_to_int_trunc(a.data_f32[off / 4 + 3]); } return out; } }; -template +template struct vec_op_cvt_trunc_f64_to_i32_impl { - static ASMJIT_INLINE int32_t cvt(double val) noexcept { - if (!std::isfinite(val)) { - return cvt_non_finite_f64_to_i32(val); - } - - if (val <= double(INT32_MIN)) { - return INT32_MIN; - } - else if (val >= double(INT32_MAX)) { - return INT32_MAX; - } - else { - return int32_t(val); - } - } - template static ASMJIT_INLINE VecOverlay apply(const VecOverlay& a) noexcept { VecOverlay out{}; uint32_t adj = kHi ? 
kW / 8 : 0u; for (uint32_t off = 0; off < kW; off += 16) { - out.data_i32[off / 8 + adj + 0] = cvt(a.data_f64[off / 8 + 0]); - out.data_i32[off / 8 + adj + 1] = cvt(a.data_f64[off / 8 + 1]); + out.data_i32[off / 8 + adj + 0] = cvt_float_to_int_trunc(a.data_f64[off / 8 + 0]); + out.data_i32[off / 8 + adj + 1] = cvt_float_to_int_trunc(a.data_f64[off / 8 + 1]); } return out; } }; -struct vec_op_cvt_trunc_f64_to_i32_lo : vec_op_cvt_trunc_f64_to_i32_impl {}; -struct vec_op_cvt_trunc_f64_to_i32_hi : vec_op_cvt_trunc_f64_to_i32_impl {}; +template +struct vec_op_cvt_trunc_f64_to_i32_lo : vec_op_cvt_trunc_f64_to_i32_impl {}; +template +struct vec_op_cvt_trunc_f64_to_i32_hi : vec_op_cvt_trunc_f64_to_i32_impl {}; + +template struct vec_op_cvt_round_f32_to_i32 { - static ASMJIT_INLINE int32_t cvt(float val) noexcept { - if (!std::isfinite(val)) { - return cvt_non_finite_f32_to_i32(val); - } - - if (val <= float(INT32_MIN)) { - return INT32_MIN; - } - else if (val >= float(INT32_MAX)) { - return INT32_MAX; - } - else { - return static_cast(std::nearbyint(val)); - } - } - template static ASMJIT_INLINE VecOverlay apply(const VecOverlay& a) noexcept { VecOverlay out{}; for (uint32_t off = 0; off < kW; off += 16) { - out.data_i32[off / 4 + 0] = cvt(a.data_f32[off / 4 + 0]); - out.data_i32[off / 4 + 1] = cvt(a.data_f32[off / 4 + 1]); - out.data_i32[off / 4 + 2] = cvt(a.data_f32[off / 4 + 2]); - out.data_i32[off / 4 + 3] = cvt(a.data_f32[off / 4 + 3]); + out.data_i32[off / 4 + 0] = cvt_float_to_int_round(a.data_f32[off / 4 + 0]); + out.data_i32[off / 4 + 1] = cvt_float_to_int_round(a.data_f32[off / 4 + 1]); + out.data_i32[off / 4 + 2] = cvt_float_to_int_round(a.data_f32[off / 4 + 2]); + out.data_i32[off / 4 + 3] = cvt_float_to_int_round(a.data_f32[off / 4 + 3]); } return out; } }; -template +template struct vec_op_cvt_round_f64_to_i32_impl { - static ASMJIT_INLINE int32_t cvt(double val) noexcept { - if (!std::isfinite(val)) { - return cvt_non_finite_f64_to_i32(val); - } - - if (val <= double(INT32_MIN)) { - return INT32_MIN; - } - else if (val >= double(INT32_MAX)) { - return INT32_MAX; - } - else { - return static_cast(std::nearbyint(val)); - } - } - template static ASMJIT_INLINE VecOverlay apply(const VecOverlay& a) noexcept { VecOverlay out{}; uint32_t adj = kHi ? 
kW / 8 : 0u; for (uint32_t off = 0; off < kW; off += 16) { - out.data_i32[off / 8 + adj + 0] = cvt(a.data_f64[off / 8 + 0]); - out.data_i32[off / 8 + adj + 1] = cvt(a.data_f64[off / 8 + 1]); + out.data_i32[off / 8 + adj + 0] = cvt_float_to_int_round(a.data_f64[off / 8 + 0]); + out.data_i32[off / 8 + adj + 1] = cvt_float_to_int_round(a.data_f64[off / 8 + 1]); } return out; } }; -struct vec_op_cvt_round_f64_to_i32_lo : vec_op_cvt_round_f64_to_i32_impl {}; -struct vec_op_cvt_round_f64_to_i32_hi : vec_op_cvt_round_f64_to_i32_impl {}; +template +struct vec_op_cvt_round_f64_to_i32_lo : vec_op_cvt_round_f64_to_i32_impl {}; +template +struct vec_op_cvt_round_f64_to_i32_hi : vec_op_cvt_round_f64_to_i32_impl {}; struct scalar_op_cvt_f32_to_f64 { template @@ -3558,12 +3590,20 @@ template struct scalar_op_ceil : public op_scal static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::ceil(a); } }; -template struct scalar_op_round : public op_scalar_vv> { +template struct scalar_op_round_even : public op_scalar_vv> { static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::nearbyint(a); } }; +template struct scalar_op_round_half_away : public op_scalar_vv> { + static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::trunc(fadd(a, fxor(half_minus_1ulp_const::value, fsign(a)))); } +}; + +template struct scalar_op_round_half_up : public op_scalar_vv> { + static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::floor(fadd(a, half_minus_1ulp_const::value)); } +}; + template struct scalar_op_sqrt : public op_scalar_vv> { - static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return std::sqrt(a); } + static ASMJIT_INLINE_NODEBUG T apply_one(const T& a) noexcept { return fsqrt(a); } }; // ujit::UniCompiler - Tests - Generic Operations - VVI @@ -4715,15 +4755,17 @@ template static ASMJIT_NOINLINE void test_simd_ops(JitContext& ctx) noexcept { // We need to know some behaviors in advance so we can select the right test function, // so create a dummy compiler and extract the necessary information from it. 
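The round_even / round_half_away / round_half_up references above all build on the half_minus_1ulp_const constants (0.49999997f and 0.49999999999999994). Using "0.5 minus one ulp" rather than 0.5 presumably keeps the addition itself from rounding inputs just below 0.5 up across the integer boundary. A worked restatement using std::copysign instead of the fsign/fxor bit tricks (a simplification, not the suite's exact code):

// Worked example of the three rounding references; assumes the default
// FE_TONEAREST rounding mode for std::nearbyint.
#include <cmath>
#include <cstdio>

static constexpr float kHalfMinus1Ulp = 0.49999997f; // 0.5f minus one ulp, as in the diff

static float round_even(float a)      { return std::nearbyint(a); }
static float round_half_away(float a) { return std::trunc(a + std::copysign(kHalfMinus1Ulp, a)); }
static float round_half_up(float a)   { return std::floor(a + kHalfMinus1Ulp); }

int main() {
  const float values[] = { 0.5f, 2.5f, -1.5f };
  for (float v : values) {
    std::printf("%5.1f -> even=%.0f half_away=%.0f half_up=%.0f\n",
                v, round_even(v), round_half_away(v), round_half_up(v));
  }
  // Prints:
  //   0.5 -> even=0 half_away=1 half_up=1
  //   2.5 -> even=2 half_away=3 half_up=3
  //  -1.5 -> even=-2 half_away=-2 half_up=-1
  return 0;
}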
- ScalarOpBehavior scalar_op_behavior; - FMAddOpBehavior fmadd_op_behavior; + ScalarOpBehavior scalar_op_behavior {}; + FMAddOpBehavior fmadd_op_behavior {}; + FloatToIntOutsideRangeBehavior float_to_int_behavior {}; { ctx.prepare(); - UniCompiler pc(&ctx.cc, ctx.features, ctx.opt_flags); + UniCompiler pc(&ctx.cc, ctx.features, ctx.cpu_hints); scalar_op_behavior = pc.scalar_op_behavior(); fmadd_op_behavior = pc.fmadd_op_behavior(); + float_to_int_behavior = pc.float_to_int_outside_range_behavior(); } bool valgrind_fma_bug = false; @@ -4843,8 +4885,12 @@ static ASMJIT_NOINLINE void test_simd_ops(JitContext& ctx) noexcept { test_vecop_vv>(ctx, Variation{v}); test_vecop_vv>(ctx, Variation{v}); test_vecop_vv>(ctx, Variation{v}); - test_vecop_vv>(ctx, Variation{v}); - test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); } else { test_vecop_vv>(ctx, Variation{v}); @@ -4853,8 +4899,12 @@ static ASMJIT_NOINLINE void test_simd_ops(JitContext& ctx) noexcept { test_vecop_vv>(ctx, Variation{v}); test_vecop_vv>(ctx, Variation{v}); test_vecop_vv>(ctx, Variation{v}); - test_vecop_vv>(ctx, Variation{v}); - test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); } test_vecop_vv>(ctx, Variation{v}); @@ -4863,8 +4913,12 @@ static ASMJIT_NOINLINE void test_simd_ops(JitContext& ctx) noexcept { test_vecop_vv>(ctx, Variation{v}); test_vecop_vv>(ctx, Variation{v}); test_vecop_vv>(ctx, Variation{v}); - test_vecop_vv>(ctx, Variation{v}); - test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{v}); } } @@ -4908,12 +4962,24 @@ static ASMJIT_NOINLINE void test_simd_ops(JitContext& ctx) noexcept { test_vecop_vv(ctx, Variation{v}); test_vecop_vv(ctx, Variation{v}); - test_vecop_vv(ctx, Variation{v}); - test_vecop_vv(ctx, Variation{0}); - test_vecop_vv(ctx, Variation{0}); - test_vecop_vv(ctx, Variation{v}); - test_vecop_vv(ctx, Variation{0}); - test_vecop_vv(ctx, Variation{0}); + if (float_to_int_behavior == FloatToIntOutsideRangeBehavior::kSmallestValue) { + constexpr FloatToIntOutsideRangeBehavior behavior = FloatToIntOutsideRangeBehavior::kSmallestValue; + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{0}); + test_vecop_vv>(ctx, Variation{0}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{0}); + test_vecop_vv>(ctx, Variation{0}); + } + else { + constexpr FloatToIntOutsideRangeBehavior behavior = FloatToIntOutsideRangeBehavior::kSaturatedValue; + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{0}); + test_vecop_vv>(ctx, Variation{0}); + test_vecop_vv>(ctx, Variation{v}); + test_vecop_vv>(ctx, Variation{0}); + test_vecop_vv>(ctx, Variation{0}); + } } } @@ -5410,30 +5476,29 @@ static void test_gp_ops(JitContext& ctx) noexcept { } #if defined(ASMJIT_UJIT_X86) -static void dump_feature_list(asmjit::String& out, const asmjit::CpuFeatures& features) noexcept { +static void dump_feature_list(String& out, const CpuFeatures& features) noexcept { #if 
!defined(ASMJIT_NO_LOGGING) - asmjit::CpuFeatures::Iterator it = features.iterator(); + CpuFeatures::Iterator it = features.iterator(); bool first = true; while (it.has_next()) { size_t feature_id = it.next(); if (!first) { out.append(' '); } - asmjit::Formatter::format_feature(out, asmjit::Arch::kHost, uint32_t(feature_id)); + Formatter::format_feature(out, Arch::kHost, uint32_t(feature_id)); first = false; } #else - asmjit::Support::maybe_unused(features); + Support::maybe_unused(features); out.append(""); #endif } -static void test_x86_ops(JitContext& ctx, const asmjit::CpuFeatures& host_features) noexcept { - using Ext = asmjit::CpuFeatures::X86; - using CpuFeatures = asmjit::CpuFeatures; +static void test_x86_ops(JitContext& ctx, const CpuFeatures& host_features) noexcept { + using Ext = CpuFeatures::X86; { - asmjit::String s; + String s; dump_feature_list(s, host_features); INFO("Available CPU features: %s", s.data()); } @@ -5474,7 +5539,7 @@ static void test_x86_ops(JitContext& ctx, const asmjit::CpuFeatures& host_featur continue; } - asmjit::String s; + String s; if (filtered == host_features) { s.assign("[ALL]"); } @@ -5554,7 +5619,7 @@ static void test_x86_ops(JitContext& ctx, const asmjit::CpuFeatures& host_featur continue; } - asmjit::String s; + String s; if (filtered == host_features) { s.assign("[ALL]"); } @@ -5585,7 +5650,7 @@ static void test_x86_ops(JitContext& ctx, const asmjit::CpuFeatures& host_featur #endif // ASMJIT_UJIT_X86 #if defined(ASMJIT_UJIT_AARCH64) -static void test_a64_ops(JitContext& ctx, const asmjit::CpuFeatures& host_features) noexcept { +static void test_a64_ops(JitContext& ctx, const CpuFeatures& host_features) noexcept { ctx.features = host_features; test_gp_ops(ctx); diff --git a/test/asmjit_test_unicompiler_avx2fma.cpp b/testing/tests/asmjit_test_unicompiler_avx2fma.cpp similarity index 100% rename from test/asmjit_test_unicompiler_avx2fma.cpp rename to testing/tests/asmjit_test_unicompiler_avx2fma.cpp diff --git a/test/asmjit_test_unicompiler_sse2.cpp b/testing/tests/asmjit_test_unicompiler_sse2.cpp similarity index 90% rename from test/asmjit_test_unicompiler_sse2.cpp rename to testing/tests/asmjit_test_unicompiler_sse2.cpp index 7803713..fa49e60 100644 --- a/test/asmjit_test_unicompiler_sse2.cpp +++ b/testing/tests/asmjit_test_unicompiler_sse2.cpp @@ -35,6 +35,10 @@ float fdiv(float a, float b) noexcept { return _mm_cvtss_f32(_mm_div_ss(_mm_set1_ps(a), _mm_set1_ps(b))); } +float fsqrt(float a) noexcept { + return _mm_cvtss_f32(_mm_sqrt_ss(_mm_set1_ps(a))); +} + float fmadd_nofma_ref(float a, float b, float c) noexcept { return _mm_cvtss_f32(_mm_add_ss(_mm_mul_ss(_mm_set1_ps(a), _mm_set1_ps(b)), _mm_set1_ps(c))); } @@ -55,6 +59,10 @@ double fdiv(double a, double b) noexcept { return _mm_cvtsd_f64(_mm_div_sd(_mm_set1_pd(a), _mm_set1_pd(b))); } +double fsqrt(double a) noexcept { + return _mm_cvtsd_f64(_mm_sqrt_sd(_mm_setzero_pd(), _mm_set1_pd(a))); +} + double fmadd_nofma_ref(double a, double b, double c) noexcept { return _mm_cvtsd_f64(_mm_add_sd(_mm_mul_sd(_mm_set1_pd(a), _mm_set1_pd(b)), _mm_set1_pd(c))); } diff --git a/test/asmjit_test_x86_sections.cpp b/testing/tests/asmjit_test_x86_sections.cpp similarity index 100% rename from test/asmjit_test_x86_sections.cpp rename to testing/tests/asmjit_test_x86_sections.cpp diff --git a/test/broken.cpp b/testing/tests/broken.cpp similarity index 100% rename from test/broken.cpp rename to testing/tests/broken.cpp diff --git a/test/broken.h b/testing/tests/broken.h similarity index 100% rename from 
test/broken.h
rename to testing/tests/broken.h
diff --git a/tools/configure-makefiles.sh
deleted file mode 100755
index 69503dc..0000000
--- a/tools/configure-makefiles.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh
-
-CURRENT_DIR="`pwd`"
-BUILD_DIR="${CURRENT_DIR}/../build"
-BUILD_OPTIONS="-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DASMJIT_TEST=1"
-
-echo "== [Configuring Build - Debug] =="
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}/Debug" -DCMAKE_BUILD_TYPE=Debug ${BUILD_OPTIONS}
-echo ""
-
-echo "== [Configuring Build - Release] =="
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}/Release" -DCMAKE_BUILD_TYPE=Release ${BUILD_OPTIONS}
-echo ""
diff --git a/tools/configure-ninja.sh
deleted file mode 100755
index 84808d2..0000000
--- a/tools/configure-ninja.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh
-
-CURRENT_DIR="`pwd`"
-BUILD_DIR="${CURRENT_DIR}/../build"
-BUILD_OPTIONS="-G Ninja -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DASMJIT_TEST=1"
-
-echo "== [Configuring Build - Debug] =="
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}/Debug" -DCMAKE_BUILD_TYPE=Debug ${BUILD_OPTIONS}
-echo ""
-
-echo "== [Configuring Build - Release] =="
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}/Release" -DCMAKE_BUILD_TYPE=Release ${BUILD_OPTIONS}
-echo ""
diff --git a/tools/configure-sanitizers.sh
deleted file mode 100755
index eee697a..0000000
--- a/tools/configure-sanitizers.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/sh
-
-CURRENT_DIR="`pwd`"
-BUILD_DIR="${CURRENT_DIR}/../build"
-BUILD_OPTIONS="-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DASMJIT_TEST=1"
-
-echo "== [Configuring Build - Release_ASAN] =="
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}/Release_ASAN" ${BUILD_OPTIONS} -DCMAKE_BUILD_TYPE=Release -DASMJIT_SANITIZE=address
-echo ""
-
-echo "== [Configuring Build - Release_UBSAN] =="
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}/Release_UBSAN" ${BUILD_OPTIONS} -DCMAKE_BUILD_TYPE=Release -DASMJIT_SANITIZE=undefined
-echo ""
-
-echo "== [Configuring Build - Release_MSAN] =="
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}/Release_MSAN" ${BUILD_OPTIONS} -DCMAKE_BUILD_TYPE=Release -DASMJIT_SANITIZE=memory
-echo ""
-
-echo "== [Configuring Build - Debug_UBSAN] =="
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}/Debug_UBSAN" ${BUILD_OPTIONS} -DCMAKE_BUILD_TYPE=Debug -DASMJIT_SANITIZE=undefined
-echo ""
diff --git a/tools/configure-vs2019-x64.bat
deleted file mode 100644
index 05bc31e..0000000
--- a/tools/configure-vs2019-x64.bat
+++ /dev/null
@@ -1,2 +0,0 @@
-@echo off
-cmake .. -B "..\build_vs2019_x64" -G"Visual Studio 16" -A x64 -DASMJIT_TEST=1
diff --git a/tools/configure-vs2019-x86.bat
deleted file mode 100644
index a0e2663..0000000
--- a/tools/configure-vs2019-x86.bat
+++ /dev/null
@@ -1,2 +0,0 @@
-@echo off
-cmake .. -B "..\build_vs2019_x86" -G"Visual Studio 16" -A Win32 -DASMJIT_TEST=1
diff --git a/tools/configure-vs2022-x64.bat
deleted file mode 100644
index b33f541..0000000
--- a/tools/configure-vs2022-x64.bat
+++ /dev/null
@@ -1,2 +0,0 @@
-@echo off
-cmake .. -B "..\build_vs2022_x64" -G"Visual Studio 17" -A x64 -DASMJIT_TEST=1
diff --git a/tools/configure-vs2022-x86.bat
deleted file mode 100644
index 0ba3505..0000000
--- a/tools/configure-vs2022-x86.bat
+++ /dev/null
@@ -1,2 +0,0 @@
-@echo off
-cmake .. -B "..\build_vs2022_x86" -G"Visual Studio 17" -A Win32 -DASMJIT_TEST=1
diff --git a/tools/configure-xcode.sh
deleted file mode 100755
index d9c7d98..0000000
--- a/tools/configure-xcode.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/sh
-
-CURRENT_DIR="`pwd`"
-BUILD_DIR="${CURRENT_DIR}/../build"
-BUILD_OPTIONS="-G Xcode -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DASMJIT_TEST=1"
-
-mkdir -p "${BUILD_DIR}"
-eval cmake "${CURRENT_DIR}/.." -B "${BUILD_DIR}" ${BUILD_OPTIONS}
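With the tools/configure-* helpers removed, the same configurations can still be produced by invoking CMake directly with the options the deleted scripts used. A minimal shell sketch (paths and build-directory names are illustrative; the option set is taken verbatim from the removed scripts):

#!/bin/sh
# Hypothetical replacement for the removed configure-*.sh helpers; run from the
# repository root and adjust paths/generators as needed.
BUILD_OPTIONS="-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DASMJIT_TEST=1"

cmake . -B build/Debug   -DCMAKE_BUILD_TYPE=Debug   ${BUILD_OPTIONS}
cmake . -B build/Release -DCMAKE_BUILD_TYPE=Release ${BUILD_OPTIONS}

# Sanitizer builds follow the same pattern as the removed configure-sanitizers.sh:
cmake . -B build/Release_ASAN -DCMAKE_BUILD_TYPE=Release -DASMJIT_SANITIZE=address ${BUILD_OPTIONS}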