diff --git a/.clang-format b/.clang-format index d0da08381..08011e6bc 100644 --- a/.clang-format +++ b/.clang-format @@ -4,5 +4,24 @@ AlignAfterOpenBracket: Align AlignConsecutiveDeclarations: 'false' BreakBeforeBraces: Allman NamespaceIndentation: All +IncludeBlocks: Regroup +# First regex to match classifies the header into a group. +# Group are ordered in the source code by increasing priority. +IncludeCategories: + # Standard headers + - Regex: <[^\.]+> + Priority: 4 + # Third party dependencies (prefer angle bracket over quotes) + - Regex: <.+\..+> + Priority: 3 + # xsimd absolute headers (e.g. in tests) + - Regex: '["<]xsimd/.+[">]' + Priority: 2 + # Relative header from project + - Regex: '"\.+.*"' + Priority: 1 + # Not quoted header (anything else, avoid) + - Regex: '".*"' + Priority: 0 ... diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 000000000..2d173401f --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,4 @@ +--- +Checks: '-*,modernize-type-traits' +WarningsAsErrors: true +HeaderFilterRegex: '.*' diff --git a/.github/toolchains/clang-powerpc64-linux-gnu.cmake b/.github/toolchains/clang-powerpc64-linux-gnu.cmake new file mode 100644 index 000000000..771702fdf --- /dev/null +++ b/.github/toolchains/clang-powerpc64-linux-gnu.cmake @@ -0,0 +1,5 @@ +set(CMAKE_SYSTEM_PROCESSOR powerpc64) +set(triple powerpc64-linux-gnu) + +include(${CMAKE_CURRENT_LIST_DIR}/clang.cmake) + diff --git a/.github/toolchains/clang-powerpc64le-linux-gnu.cmake b/.github/toolchains/clang-powerpc64le-linux-gnu.cmake new file mode 100644 index 000000000..b4fa02506 --- /dev/null +++ b/.github/toolchains/clang-powerpc64le-linux-gnu.cmake @@ -0,0 +1,5 @@ +set(CMAKE_SYSTEM_PROCESSOR powerpc64le) +set(triple powerpc64le-linux-gnu) + +include(${CMAKE_CURRENT_LIST_DIR}/clang.cmake) + diff --git a/.github/toolchains/gcc-s390x-linux-gnu.cmake b/.github/toolchains/gcc-s390x-linux-gnu.cmake new file mode 100644 index 000000000..05fba0b53 --- /dev/null +++ b/.github/toolchains/gcc-s390x-linux-gnu.cmake @@ -0,0 +1,4 @@ +set(CMAKE_SYSTEM_PROCESSOR s390x) +set(triple s390x-linux-gnu) + +include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml index 3efa4de96..c1c5ca5a0 100644 --- a/.github/workflows/android.yml +++ b/.github/workflows/android.yml @@ -18,19 +18,17 @@ jobs: - 18 steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v6 - name: Build script env: TARGET: ${{ matrix.target }} API: ${{ matrix.api }} run: | - mkdir _build NDK="$($ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager --list_installed | sed -E 's/( +[|] +)/|/g;s/ +$//' | grep '^ ndk' | cut -d '|' -f 4 | sort | head -n1)" - cd _build && \ - cmake .. -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/$NDK/build/cmake/android.toolchain.cmake \ - -DANDROID_ABI=$ABI \ - -DANDROID_PLATFORM=android-$API \ - -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release - - cmake --build . --verbose + cmake -B _build \ + -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/$NDK/build/cmake/android.toolchain.cmake \ + -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON \ + -DANDROID_ABI=$ABI -DANDROID_PLATFORM=android-$API \ + -DCMAKE_BUILD_TYPE=Release + cmake --build _build --verbose diff --git a/.github/workflows/arch-consistency-check.yml b/.github/workflows/arch-consistency-check.yml index dc57879dc..83839e136 100644 --- a/.github/workflows/arch-consistency-check.yml +++ b/.github/workflows/arch-consistency-check.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout xsimd - uses: actions/checkout@v3 + uses: actions/checkout@v6 - name: Install dependencies run: sudo apt install g++ - name: Check architecture consistency diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 679c93c5a..cccb01a53 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -7,14 +7,9 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Install dependencies - run: | - sudo apt install g++ + - uses: actions/checkout@v6 - name: Setup - run: | - mkdir _build - cd _build && cmake .. -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release + run: cmake -B _build -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release - name: Build run: cmake --build _build - name: Testing sequential diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 1df779b8e..0ff2cd0a4 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -11,19 +11,16 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout xsimd - uses: actions/checkout@v3 + uses: actions/checkout@v6 - name: Configure build - run: | - mkdir _build && cd _build - cmake .. -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=_install + run: cmake -B _build -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=_install - name: Build run: cmake --build _build --target install - name: Check install run: | mkdir _install_build && cd _install_build cp ${{ github.workspace }}/.github/cmake-test/* . - ls $PWD/../_build/_install/share/cmake/xsimd - cmake . -DCMAKE_PREFIX_PATH=$PWD/../_build/_install/share/cmake/xsimd + ls $PWD/../_install/share/cmake/xsimd + cmake . -DCMAKE_PREFIX_PATH=$PWD/../_install/share/cmake/xsimd cmake --build . diff --git a/.github/workflows/cross-arm.yml b/.github/workflows/cross-arm.yml index 071e85f25..79a83492a 100644 --- a/.github/workflows/cross-arm.yml +++ b/.github/workflows/cross-arm.yml @@ -15,6 +15,7 @@ jobs: - { platform: 'aarch64', arch: 'armv8-a', dir: 'aarch64-linux-gnu', flags: '', full: 'ON' } sys: - { compiler: 'gcc', version: '10' } + - { compiler: 'gcc', version: '14' } steps: - name: Setup compiler if: ${{ matrix.sys.compiler == 'gcc' }} @@ -32,14 +33,18 @@ jobs: run: | sudo apt-get install ninja-build - name: Checkout xsimd - uses: actions/checkout@v3 + uses: actions/checkout@v6 - name: Setup run: | - mkdir _build - cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} -DCMAKE_BUILD_TYPE=Release -DTARGET_ARCH=generic -DCMAKE_C_FLAGS="-march=${{ matrix.target.arch }} ${{ matrix.target.flags }}" -DCMAKE_CXX_FLAGS="-march=${{ matrix.target.arch }} ${{ matrix.target.flags }}" -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake + cmake -B _build \ + -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=${{ matrix.target.full }} \ + -DBUILD_EXAMPLES=${{ matrix.target.full }} -DCMAKE_BUILD_TYPE=Release \ + -DTARGET_ARCH=generic \ + -DCMAKE_C_FLAGS="-march=${{ matrix.target.arch }} ${{ matrix.target.flags }}" \ + -DCMAKE_CXX_FLAGS="-march=${{ matrix.target.arch }} ${{ matrix.target.flags }}" \ + -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake - name: Build run: cmake --build _build - name: Testing xsimd - run: | - qemu-${{ matrix.target.platform }} -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd + run: qemu-${{ matrix.target.platform }} -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd working-directory: ${{ github.workspace }}/_build diff --git a/.github/workflows/cross-ppc.yml b/.github/workflows/cross-ppc.yml index 92ffae333..6f07a763c 100644 --- a/.github/workflows/cross-ppc.yml +++ b/.github/workflows/cross-ppc.yml @@ -3,10 +3,11 @@ on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true + jobs: build: runs-on: ubuntu-latest - name: '${{ matrix.target.arch }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}' + name: '${{ matrix.target.platform }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}' strategy: matrix: target: @@ -14,16 +15,27 @@ jobs: - { platform: 'ppc64', dir: 'powerpc64-linux-gnu', flags: '-maltivec -mvsx -mcpu=power10', full: 'OFF' } sys: - { compiler: 'gcc', version: '12' } + - { compiler: 'clang', version: '20', gcc_runtime: '12' } steps: - - name: Setup compiler + - name: Setup GCC if: ${{ matrix.sys.compiler == 'gcc' }} run: | sudo apt-get update || exit 1 - sudo apt-get --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.version }}-${{ matrix.target.dir }} g++-${{ matrix.sys.version }}-multilib || exit 1 + sudo apt-get -y --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.version }}-${{ matrix.target.dir }} g++-${{ matrix.sys.version }}-multilib cmake || exit 1 sudo update-alternatives --remove-all ${{ matrix.target.dir }}-gcc || true sudo update-alternatives --remove-all ${{ matrix.target.dir }}-g++ || true sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-gcc ${{ matrix.target.dir }}-gcc /usr/bin/${{ matrix.target.dir }}-gcc-${{ matrix.sys.version }} 20 sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-g++ ${{ matrix.target.dir }}-g++ /usr/bin/${{ matrix.target.dir }}-g++-${{ matrix.sys.version }} 20 + - name: Setup LLVM + if: ${{ matrix.sys.compiler == 'clang' }} + run: | + sudo apt-get update || exit 1 + sudo apt-get -y --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.gcc_runtime }}-${{ matrix.target.dir }} g++-${{ matrix.sys.gcc_runtime }}-multilib cmake || exit 1 + sudo apt-get -y --no-install-suggests --no-install-recommends install clang-${{ matrix.sys.version }} || exit 1 + sudo update-alternatives --remove-all /usr/bin/clang || true + sudo update-alternatives --remove-all /usr/bin/clang++ || true + sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-${{ matrix.sys.version }} 20 + sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${{ matrix.sys.version }} 20 - name: Setup QEMU run: | sudo apt-get --no-install-suggests --no-install-recommends install qemu-user @@ -31,14 +43,27 @@ jobs: run: | sudo apt-get install ninja-build - name: Checkout xsimd - uses: actions/checkout@v3 + uses: actions/checkout@v6 - name: Setup run: | - mkdir _build - cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="${{ matrix.target.flags }}" -DCMAKE_CXX_FLAGS="${{ matrix.target.flags }}" -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake + cmake -B build/ \ + -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON \ + -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_FLAGS="${{ matrix.target.flags }}" \ + -DCMAKE_CXX_FLAGS="${{ matrix.target.flags }}" \ + -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake - name: Build - run: cmake --build _build --verbose -j1 + run: cmake --build build/ --verbose -j1 + - name: Set CPU feature test expectations + run: /bin/true - name: Testing xsimd run: | - qemu-${{ matrix.target.platform }} -cpu power10 -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd - working-directory: ${{ github.workspace }}/_build + # Set CPU feature test expectations, 0 is explicit absence of the feature + export XSIMD_TEST_CPU_ASSUME_SSE4_2="0" + export XSIMD_TEST_CPU_ASSUME_NEON64="0" + export XSIMD_TEST_CPU_ASSUME_RVV="0" + export XSIMD_TEST_CPU_ASSUME_VXE="0" + export XSIMD_TEST_CPU_ASSUME_VSX="1" + + qemu-${{ matrix.target.platform }} -cpu power10 -L /usr/${{ matrix.target.dir}}/ ./build/test/test_xsimd diff --git a/.github/workflows/cross-rvv-arch.yml b/.github/workflows/cross-rvv-arch.yml new file mode 100644 index 000000000..ae482cc08 --- /dev/null +++ b/.github/workflows/cross-rvv-arch.yml @@ -0,0 +1,85 @@ +# RISC-V RVV cross-compilation build using qemu 11 + gcc 15 (Arch Linux). +# +# Why this workflow exists alongside cross-rvv.yml: +# +# QEMU's RISC-V Vector emulation is dramatically slower than scalar in +# qemu < 11 (see QEMU issue #2137 for documented 100x+ slowdowns of +# auto-vectorised RVV loops under TCG). At vlen=128 the slowdown is large +# enough that gcc's RVV codegen for our test suite causes the qemu-user +# emulator to make no observable progress within the 6h GHA timeout — +# i.e. the apt-shipped qemu-user-static (8.2.x in noble, 9.x in plucky) +# can't run xsimd's full test_xsimd at vlen=128. +# +# Empirically: +# qemu 8.2.2 (Ubuntu 24.04 apt) : test_xsimd at vlen=128 times out +# qemu 9.2.1 (Ubuntu 25.04 plucky) : ditto +# qemu 10.0.8 (Debian trixie) : ditto +# qemu 11.0.0 (Arch) + gcc 15.1 : 367 cases / 5664 asserts in <10 min +# +# So vlen=128 RVV coverage lives in this workflow, which runs the build +# and test inside an `archlinux:latest` container (qemu 11 + gcc 15.1). +# The matching ubuntu-runner workflow `cross-rvv.yml` keeps multi-compiler +# matrix coverage (gcc-14, clang-17/18) for vlens >= 256, where the apt +# qemu is fast enough. +# +# References: +# QEMU 11.0.0 release notes: https://www.qemu.org/2026/04/22/qemu-11-0-0/ +# QEMU RVV slowdowns issue: https://gitlab.com/qemu-project/qemu/-/issues/2137 +# Ubuntu RVV vstart bug: https://bugs.launchpad.net/ubuntu/+source/qemu/+bug/2095169 +name: RISC-V RVV cross-compilation build (qemu 11) +on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + container: archlinux:latest + name: 'RISC-V RVV${{ matrix.vector_bits }} (qemu 11)' + strategy: + fail-fast: false + matrix: + vector_bits: + - 128 + - 256 + - 512 + steps: + - name: Setup toolchain and qemu + run: | + pacman -Sy --noconfirm + pacman -S --noconfirm --needed \ + qemu-user-static riscv64-linux-gnu-gcc riscv64-linux-gnu-glibc \ + cmake ninja git ca-certificates + qemu-riscv64-static --version + riscv64-linux-gnu-gcc --version | head -1 + - name: Checkout xsimd + uses: actions/checkout@v6 + - name: Setup + run: > + cmake -S . -B _build + -GNinja + -DBUILD_TESTS=ON + -DDOWNLOAD_DOCTEST=ON + -DCMAKE_BUILD_TYPE=Release + -DTARGET_ARCH=generic + -DCMAKE_C_FLAGS="-march=rv64gcv_zvl${{ matrix.vector_bits }}b_zba_zbb_zbs -mrvv-vector-bits=zvl" + -DCMAKE_CXX_FLAGS="-march=rv64gcv_zvl${{ matrix.vector_bits }}b_zba_zbb_zbs -mrvv-vector-bits=zvl" + -DCMAKE_TOOLCHAIN_FILE=.github/toolchains/gcc-riscv64-linux-gnu.cmake + - name: Build + run: cmake --build _build + - name: Set CPU feature test expectations + run: | + echo "XSIMD_TEST_CPU_ASSUME_SSE4_2=0" >> "$GITHUB_ENV" + echo "XSIMD_TEST_CPU_ASSUME_SVE=0" >> "$GITHUB_ENV" + echo "XSIMD_TEST_CPU_ASSUME_RVV=1" >> "$GITHUB_ENV" + - name: Testing xsimd + timeout-minutes: 15 + # Invoke qemu-riscv64-static explicitly. Inside the archlinux:latest + # container we don't have permission to register binfmt_misc with the + # host kernel, so exec'ing the riscv64 ELF directly fails with + # "Exec format error". + run: > + QEMU_CPU="rv64,zba=true,zbb=true,zbs=true,v=true,vlen=${{ matrix.vector_bits }},elen=64,vext_spec=v1.0" + QEMU_LD_PREFIX="/usr/riscv64-linux-gnu" + qemu-riscv64-static ./test/test_xsimd + working-directory: _build diff --git a/.github/workflows/cross-rvv.yml b/.github/workflows/cross-rvv.yml index e7274627c..215823ee5 100644 --- a/.github/workflows/cross-rvv.yml +++ b/.github/workflows/cross-rvv.yml @@ -1,3 +1,17 @@ +# RISC-V RVV cross-compilation build (Ubuntu apt qemu, multi-compiler matrix). +# +# vlen=128 is intentionally NOT covered here. Ubuntu's qemu-user-static +# (8.2.x in noble, 9.x in plucky) hangs on the xsimd test_xsimd binary at +# vlen=128 — see QEMU issue #2137 (RVV TCG slowdowns) for the underlying +# emulator behaviour. Until ubuntu-latest ships qemu 11+, vlen=128 coverage +# lives in cross-rvv-arch.yml, which runs inside an archlinux:latest +# container with qemu 11. Vlens >= 256 run fast enough under the apt qemu +# to stay within the test step's timeout. +# +# References: +# QEMU 11.0.0 release notes: https://www.qemu.org/2026/04/22/qemu-11-0-0/ +# QEMU RVV slowdowns issue: https://gitlab.com/qemu-project/qemu/-/issues/2137 +# Ubuntu RVV vstart bug: https://bugs.launchpad.net/ubuntu/+source/qemu/+bug/2095169 name: RISC-V RVV cross-compilation build on: [push, pull_request] concurrency: @@ -8,13 +22,13 @@ jobs: runs-on: ubuntu-latest name: 'RISC-V RVV${{ matrix.vector_bits }}' strategy: + fail-fast: false matrix: sys: - { compiler: 'gcc', gcc_runtime: '14'} - { compiler: 'clang', version: '17', gcc_runtime: '14'} - { compiler: 'clang', version: '18', gcc_runtime: '14'} vector_bits: - - 128 - 256 - 512 steps: @@ -35,14 +49,19 @@ jobs: sudo ln -srf $(which clang++-${{ matrix.sys.version }}) /usr/bin/clang++ rm llvm.sh - name: Setup QEMU - uses: docker/setup-qemu-action@v3.0.0 - with: - platforms: riscv64 + # Use the qemu-user-static package shipped by the runner image rather + # than docker/setup-qemu-action: tonistiigi/binfmt pins an even older + # qemu (~6.x/7.x) whose RVV implementation miscompiles vmulh* and is + # known to hang test_xsimd until the 6h GHA timeout. + run: | + sudo apt-get -y -qq update + sudo apt-get -y -qq --no-install-suggests --no-install-recommends install qemu-user-static + qemu-riscv64-static --version - name: Setup Ninja run: | sudo apt-get -y -qq install ninja-build - name: Checkout xsimd - uses: actions/checkout@v3 + uses: actions/checkout@v6 - name: Setup run: > cmake -S . -B _build @@ -56,7 +75,13 @@ jobs: -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-riscv64-linux-gnu.cmake - name: Build run: cmake --build _build + - name: Set CPU feature test expectations + run: | + echo "XSIMD_TEST_CPU_ASSUME_SSE4_2=0" >> "$GITHUB_ENV" + echo "XSIMD_TEST_CPU_ASSUME_SVE=0" >> "$GITHUB_ENV" + echo "XSIMD_TEST_CPU_ASSUME_RVV=1" >> "$GITHUB_ENV" - name: Testing xsimd + timeout-minutes: 15 run: > QEMU_CPU="rv64,zba=true,zbb=true,zbs=true,v=true,vlen=${{ matrix.vector_bits }},elen=64,vext_spec=v1.0" QEMU_LD_PREFIX="/usr/riscv64-linux-gnu" diff --git a/.github/workflows/cross-s390x.yml b/.github/workflows/cross-s390x.yml new file mode 100644 index 000000000..b748d328e --- /dev/null +++ b/.github/workflows/cross-s390x.yml @@ -0,0 +1,55 @@ +name: IBM Z cross-compilation build +on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-latest + name: '${{ matrix.target.platform }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}' + strategy: + matrix: + target: + - { platform: 's390x', dir: 's390x-linux-gnu', full: 'OFF' } + sys: + - { compiler: 'gcc', version: '14' } + steps: + - name: Setup compiler + if: ${{ matrix.sys.compiler == 'gcc' }} + run: | + sudo apt-get update || exit 1 + sudo apt-get -y --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.version }}-${{ matrix.target.dir }} g++-${{ matrix.sys.version }}-multilib cmake || exit 1 + sudo update-alternatives --remove-all ${{ matrix.target.dir }}-gcc || true + sudo update-alternatives --remove-all ${{ matrix.target.dir }}-g++ || true + sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-gcc ${{ matrix.target.dir }}-gcc /usr/bin/${{ matrix.target.dir }}-gcc-${{ matrix.sys.version }} 20 + sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-g++ ${{ matrix.target.dir }}-g++ /usr/bin/${{ matrix.target.dir }}-g++-${{ matrix.sys.version }} 20 + - name: Setup QEMU + run: | + sudo apt-get --no-install-suggests --no-install-recommends install qemu-user + - name: Setup Ninja + run: | + sudo apt-get install ninja-build + - name: Checkout xsimd + uses: actions/checkout@v6 + - name: Setup + run: | + cmake -B build/ \ + -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON \ + -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_FLAGS="${{ matrix.target.flags }}" \ + -DCMAKE_CXX_FLAGS="${{ matrix.target.flags }}" \ + -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake + - name: Build + run: cmake --build build/ --verbose -j1 + - name: Testing xsimd + run: | + # Set CPU feature test expectations, 0 is explicit absence of the feature + export XSIMD_TEST_CPU_ASSUME_SSE4_2="0" + export XSIMD_TEST_CPU_ASSUME_NEON64="0" + export XSIMD_TEST_CPU_ASSUME_RVV="0" + export XSIMD_TEST_CPU_ASSUME_VSX="0" + export XSIMD_TEST_CPU_ASSUME_VXE="1" + + qemu-${{ matrix.target.platform }} -L /usr/${{ matrix.target.dir}}/ ./build/test/test_xsimd diff --git a/.github/workflows/cross-sve.yml b/.github/workflows/cross-sve.yml index 4cd292f27..8a2ffe102 100644 --- a/.github/workflows/cross-sve.yml +++ b/.github/workflows/cross-sve.yml @@ -27,14 +27,26 @@ jobs: run: | sudo apt-get install ninja-build - name: Checkout xsimd - uses: actions/checkout@v3 + uses: actions/checkout@v6 - name: Setup run: | - mkdir _build - cd _build && cmake .. -GNinja -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DTARGET_ARCH=generic -DCMAKE_C_FLAGS="-march=armv8-a+sve -msve-vector-bits=${{ matrix.vector_bits }}" -DCMAKE_CXX_FLAGS="-march=armv8-a+sve -msve-vector-bits=${{ matrix.vector_bits }}" -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/gcc-aarch64-linux-gnu.cmake + cmake -B _build \ + -GNinja \ + -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DTARGET_ARCH=generic \ + -DCMAKE_C_FLAGS="-march=armv8-a+sve -msve-vector-bits=${{ matrix.vector_bits }}" \ + -DCMAKE_CXX_FLAGS="-march=armv8-a+sve -msve-vector-bits=${{ matrix.vector_bits }}" \ + -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/gcc-aarch64-linux-gnu.cmake - name: Build run: cmake --build _build - - name: Testing xsimd + - name: Set CPU feature test expectations run: | - qemu-aarch64 --cpu max,sve${{ matrix.vector_bits }}=on -L /usr/aarch64-linux-gnu/ ./test/test_xsimd + echo "XSIMD_TEST_CPU_ASSUME_SSE4_2=0" >> "$GITHUB_ENV" + echo "XSIMD_TEST_CPU_ASSUME_RVV=0" >> "$GITHUB_ENV" + echo "XSIMD_TEST_CPU_ASSUME_NEON64=1" >> "$GITHUB_ENV" + echo "XSIMD_TEST_CPU_ASSUME_SVE=1" >> "$GITHUB_ENV" + echo "XSIMD_TEST_CPU_ASSUME_MANUFACTURER=unknown" >> "$GITHUB_ENV" + - name: Testing xsimd + run: qemu-aarch64 --cpu max,sve${{ matrix.vector_bits }}=on -L /usr/aarch64-linux-gnu/ ./test/test_xsimd working-directory: ${{ github.workspace }}/_build diff --git a/.github/workflows/cxx-no-exceptions.yml b/.github/workflows/cxx-no-exceptions.yml index add1c693b..e48ae31ca 100644 --- a/.github/workflows/cxx-no-exceptions.yml +++ b/.github/workflows/cxx-no-exceptions.yml @@ -4,14 +4,9 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Install dependencies - run: | - sudo apt install g++ + - uses: actions/checkout@v6 - name: Setup - run: | - mkdir _build - cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS=-fno-exceptions + run: cmake -B _build -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS=-fno-exceptions - name: Build run: cmake --build _build diff --git a/.github/workflows/cxx-versions.yml b/.github/workflows/cxx-versions.yml index 5ec2f0768..7bfcac2f8 100644 --- a/.github/workflows/cxx-versions.yml +++ b/.github/workflows/cxx-versions.yml @@ -3,21 +3,46 @@ on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true + jobs: - build: + build-unix: + name: 'Unix C++${{ matrix.cxx-version }}' runs-on: ubuntu-latest strategy: matrix: cxx-version: [14, 17, 20] steps: - - uses: actions/checkout@v3 - - name: Install dependencies - run: | - sudo apt install g++ + - uses: actions/checkout@v6 - name: Setup + run: cmake -B build/ -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=${{matrix.cxx-version}} + - name: Build + run: cmake --build build/ + - name: Test + run: ./build/test/test_xsimd + + build-msvc: + name: 'MSVC C++${{ matrix.cxx-version }}' + defaults: + run: + shell: bash {0} + runs-on: windows-2022 + strategy: + matrix: + cxx-version: [14, 17, 20] + steps: + - name: Setup compiler + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: amd64 + - name: Setup Ninja run: | - mkdir _build - cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=${{matrix.cxx-version}} + python3 -m pip install --upgrade pip setuptools wheel + python3 -m pip install ninja + - uses: actions/checkout@v6 + - name: Setup + run: cmake -B build/ -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=${{matrix.cxx-version}} -G Ninja - name: Build - run: cmake --build _build + run: cmake --build build/ + - name: Test + run: ./build/test/test_xsimd diff --git a/.github/workflows/doxygen.yml b/.github/workflows/doxygen.yml index 910206321..00826b921 100644 --- a/.github/workflows/doxygen.yml +++ b/.github/workflows/doxygen.yml @@ -7,7 +7,7 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v6 - name: Install dependencies run: sudo apt install doxygen python3-breathe python3-sphinx-rtd-theme - name: Render diff --git a/.github/workflows/emscripten.yml b/.github/workflows/emscripten.yml index 619690589..988c348df 100644 --- a/.github/workflows/emscripten.yml +++ b/.github/workflows/emscripten.yml @@ -9,7 +9,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v6 - uses: mamba-org/setup-micromamba@v2 with: diff --git a/.github/workflows/emulated.yml b/.github/workflows/emulated.yml index 0a5411d27..bc4781cbb 100644 --- a/.github/workflows/emulated.yml +++ b/.github/workflows/emulated.yml @@ -9,63 +9,36 @@ defaults: jobs: build: runs-on: ubuntu-latest - name: '${{ matrix.sys.compiler }} ${{ matrix.sys.version }} - emulated' + name: '${{ matrix.sys.compiler }} - emulated<${{ matrix.sys.size }}>' strategy: matrix: sys: - - { compiler: 'gcc', version: '14'} - - { compiler: 'clang', version: '18'} + - { compiler: 'g++', size: '128'} + - { compiler: 'g++', size: '256'} + - { compiler: 'g++', size: '512'} steps: - - name: Setup compiler - if: ${{ matrix.sys.compiler == 'gcc' }} - run: | - GCC_VERSION=${{ matrix.sys.version }} - sudo apt-get update - sudo apt-get --no-install-suggests --no-install-recommends install g++-$GCC_VERSION - CC=gcc-$GCC_VERSION - echo "CC=$CC" >> $GITHUB_ENV - CXX=g++-$GCC_VERSION - echo "CXX=$CXX" >> $GITHUB_ENV - CXXFLAGS="-Wno-noexcept-type -Wno-stringop-overflow -Wno-maybe-uninitialized" - echo "CXXFLAGS=$CXXFLAGS" >> $GITHUB_ENV - - name: Setup compiler - if: ${{ matrix.sys.compiler == 'clang' }} - run: | - LLVM_VERSION=${{ matrix.sys.version }} - sudo apt-get update || exit 1 - sudo apt-get --no-install-suggests --no-install-recommends install clang-$LLVM_VERSION || exit 1 - sudo apt-get --no-install-suggests --no-install-recommends install g++ g++-multilib || exit 1 - sudo ln -s /usr/include/asm-generic /usr/include/asm - CC=clang-$LLVM_VERSION - echo "CC=$CC" >> $GITHUB_ENV - CXX=clang++-$LLVM_VERSION - echo "CXX=$CXX" >> $GITHUB_ENV - name: Checkout xsimd - uses: actions/checkout@v3 + uses: actions/checkout@v6 - name: Install mamba uses: mamba-org/setup-micromamba@v1 with: environment-file: environment.yml + - name: Setup GCC compiler + if: ${{ matrix.sys.compiler == 'g++' }} + run: echo "CXXFLAGS=-Wno-noexcept-type -Wno-stringop-overflow -Wno-maybe-uninitialized" >> $GITHUB_ENV - name: Configure build - env: - CC: ${{ env.CC }} - CXX: ${{ env.CXX }} run: | - - mkdir _build - cd _build - cmake .. -DBUILD_TESTS=ON \ - -DBUILD_BENCHMARK=ON \ - -DBUILD_EXAMPLES=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_C_COMPILER=$CC \ - -DCMAKE_CXX_COMPILER=$CXX \ - -DXSIMD_ENABLE_WERROR=ON \ - -DCMAKE_CXX_FLAGS="-DXSIMD_DEFAULT_ARCH=emulated\<128\> -DXSIMD_WITH_EMULATED=1 ${CXXFLAGS}" \ - -G Ninja + cmake -B _build \ + -DBUILD_TESTS=ON \ + -DBUILD_BENCHMARK=ON \ + -DBUILD_EXAMPLES=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=${{ matrix.sys.compiler }} \ + -DXSIMD_ENABLE_WERROR=ON \ + -DTARGET_ARCH="emulated<${{ matrix.sys.size }}>" \ + -DCMAKE_CXX_FLAGS="${CXXFLAGS}" \ + -GNinja - name: Build run: ninja -C _build - name: Test - run: | - cd _build/test - ./test_xsimd + run: ninja -C _build xtest diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml index 193038e75..03a914bda 100644 --- a/.github/workflows/linux.yml +++ b/.github/workflows/linux.yml @@ -1,4 +1,4 @@ -name: Linux build +name: Linux x86 build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} @@ -6,6 +6,7 @@ concurrency: defaults: run: shell: bash -l {0} + jobs: build: runs-on: ubuntu-latest @@ -13,9 +14,10 @@ jobs: strategy: matrix: sys: - - { compiler: 'gcc', version: '12', flags: 'force_no_instr_set' } - - { compiler: 'gcc', version: '13', flags: 'enable_xtl_complex' } - - { compiler: 'gcc', version: '14', flags: 'avx' } + - { compiler: 'gcc', version: '12', flags: 'force_no_instr_set' } + - { compiler: 'gcc', version: '13', flags: 'enable_xtl_complex' } + - { compiler: 'gcc', version: '14', flags: 'avx' } + - { compiler: 'gcc', version: '14', flags: 'avx2' } - { compiler: 'gcc', version: '13', flags: 'avx512' } - { compiler: 'gcc', version: '10', flags: 'avx512' } - { compiler: 'gcc', version: '12', flags: 'i386' } @@ -23,11 +25,15 @@ jobs: - { compiler: 'gcc', version: '13', flags: 'avx512vbmi' } - { compiler: 'gcc', version: '14', flags: 'avx512vbmi2' } - { compiler: 'gcc', version: '13', flags: 'avx512vnni' } - - { compiler: 'clang', version: '16', flags: 'force_no_instr_set' } + - { compiler: 'clang', version: '16', flags: 'force_no_instr_set' } - { compiler: 'clang', version: '16', flags: 'enable_xtl_complex' } - { compiler: 'clang', version: '17', flags: 'avx' } - { compiler: 'clang', version: '17', flags: 'sse3' } - { compiler: 'clang', version: '18', flags: 'avx512' } + - { compiler: 'clang', version: '18', flags: 'avx_128' } + - { compiler: 'clang', version: '18', flags: 'avx2_128' } + - { compiler: 'clang', version: '18', flags: 'avx512vl_128' } + - { compiler: 'clang', version: '18', flags: 'avx512vl_256' } steps: - name: Setup compiler if: ${{ matrix.sys.compiler == 'gcc' }} @@ -56,7 +62,7 @@ jobs: CXX=clang++-$LLVM_VERSION echo "CXX=$CXX" >> $GITHUB_ENV - name: Checkout xsimd - uses: actions/checkout@v3 + uses: actions/checkout@v6 - name: Install mamba uses: mamba-org/setup-micromamba@v2 with: @@ -75,12 +81,31 @@ jobs: if [[ '${{ matrix.sys.flags }}' == 'avx' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge" fi + if [[ '${{ matrix.sys.flags }}' == 'avx_128' ]]; then + CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge" + CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx_128" + fi + if [[ '${{ matrix.sys.flags }}' == 'avx2' ]]; then + CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=haswell" + fi + if [[ '${{ matrix.sys.flags }}' == 'avx2_128' ]]; then + CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=haswell" + CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx2_128" + fi if [[ '${{ matrix.sys.flags }}' == 'sse3' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=nocona" fi if [[ '${{ matrix.sys.flags }}' == 'avx512' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" fi + if [[ '${{ matrix.sys.flags }}' == 'avx512vl_128' ]]; then + CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" + CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128" + fi + if [[ '${{ matrix.sys.flags }}' == 'avx512vl_256' ]]; then + CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" + CXXFLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_256" + fi if [[ '${{ matrix.sys.flags }}' == 'avx512pf' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=knl" fi @@ -105,25 +130,34 @@ jobs: # Cheap way of spotting uninitialized read CXX_FLAGS="$CXX_FLAGS -ftrivial-auto-var-init=pattern" - mkdir _build - cd _build - cmake .. -DBUILD_TESTS=ON \ - -DBUILD_BENCHMARK=ON \ - -DBUILD_EXAMPLES=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_C_COMPILER=$CC \ - -DCMAKE_CXX_COMPILER=$CXX \ - $CMAKE_EXTRA_ARGS \ - -DCMAKE_CXX_FLAGS='$CXX_FLAGS' \ - -G Ninja + cmake -B _build \ + -DBUILD_TESTS=ON \ + -DBUILD_BENCHMARK=ON \ + -DBUILD_EXAMPLES=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER=$CC \ + -DCMAKE_CXX_COMPILER=$CXX \ + $CMAKE_EXTRA_ARGS \ + -DCMAKE_CXX_FLAGS='$CXX_FLAGS' \ + -G Ninja - name: Build - run: ninja -C _build + run: cmake --build _build - name: Test run: | - cd _build - cd test + # Set CPU feature test expectations, 0 is explicit absence of the feature + export XSIMD_TEST_CPU_ASSUME_NEON64="0" + export XSIMD_TEST_CPU_ASSUME_RVV="0" + export XSIMD_TEST_CPU_ASSUME_VSX="0" + export XSIMD_TEST_CPU_ASSUME_VXE="0" + cd _build/test if echo '${{ matrix.sys.flags }}' | grep -q 'avx512' ; then + # Running with emulation, must have AVX512, lower tier are checked by implications in tests + export XSIMD_TEST_CPU_ASSUME_AVX512F="1" ../../sde-external-9.48.0-2024-11-25-lin/sde64 -tgl -- ./test_xsimd else + export XSIMD_TEST_CPU_ASSUME_SSE4_2=$(grep -q 'sse4_2' /proc/cpuinfo && echo "1" || echo "0") + export XSIMD_TEST_CPU_ASSUME_AVX=$(grep -q 'avx' /proc/cpuinfo && echo "1" || echo "0") + export XSIMD_TEST_CPU_ASSUME_AVX512F=$(grep -q 'avx512f' /proc/cpuinfo && echo "1" || echo "0") + export XSIMD_TEST_CPU_ASSUME_MANUFACTURER="intel,amd" ./test_xsimd fi diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index f2b17faa9..14132f5db 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -3,6 +3,7 @@ on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true + jobs: build: strategy: @@ -14,14 +15,24 @@ jobs: runs-on: macos-${{ matrix.os }} name: 'macos-${{ matrix.os }}' steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v6 - name: Setup - run: | - mkdir _build - cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" + run: cmake -B _build -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" - name: Build run: cmake --build _build --verbose - name: Testing sequential run: cmake --build _build --target xbenchmark --verbose + - name: Set CPU feature test expectations + run: | + echo "XSIMD_TEST_CPU_ASSUME_RVV=0" >> "$GITHUB_ENV" + if echo '${{ matrix.os }}' | grep -q intel; then + echo "XSIMD_TEST_CPU_ASSUME_NEON64=0" >> "$GITHUB_ENV" + echo "XSIMD_TEST_CPU_ASSUME_SSE4_2=1" >> "$GITHUB_ENV" + echo "XSIMD_TEST_CPU_ASSUME_MANUFACTURER=intel" >> "$GITHUB_ENV" + else + echo "XSIMD_TEST_CPU_ASSUME_NEON64=1" >> "$GITHUB_ENV" + echo "XSIMD_TEST_CPU_ASSUME_SSE4_2=0" >> "$GITHUB_ENV" + echo "XSIMD_TEST_CPU_ASSUME_MANUFACTURER=unknown" >> "$GITHUB_ENV" + fi - name: Testing xsimd run: ${{github.workspace}}/_build/test/test_xsimd diff --git a/.github/workflows/sanitizer.yml b/.github/workflows/sanitizer.yml index 7eceeadc9..520be087f 100644 --- a/.github/workflows/sanitizer.yml +++ b/.github/workflows/sanitizer.yml @@ -22,7 +22,7 @@ jobs: CXX: clang++-${{ matrix.llvm-version }} steps: - name: Checkout xsimd - uses: actions/checkout@v3 + uses: actions/checkout@v6 - name: Setup compiler run: | wget https://apt.llvm.org/llvm.sh @@ -30,18 +30,15 @@ jobs: sudo ./llvm.sh ${{ matrix.llvm-version }} - name: Configure build run: | - mkdir _build - cd _build - cmake .. -DBUILD_TESTS=ON \ - -DBUILD_BENCHMARK=ON \ - -DBUILD_EXAMPLES=ON \ - -DDOWNLOAD_DOCTEST=ON \ - -DCMAKE_BUILD_TYPE=Debug \ - -DCMAKE_CXX_FLAGS='-f${{ matrix.flags }} -O0 -g -fno-inline' \ - -G Ninja + cmake -B_build \ + -DBUILD_TESTS=ON \ + -DBUILD_BENCHMARK=ON \ + -DBUILD_EXAMPLES=ON \ + -DDOWNLOAD_DOCTEST=ON \ + -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_CXX_FLAGS='-f${{ matrix.flags }} -O0 -g -fno-inline' \ + -G Ninja - name: Build - run: ninja -C _build + run: cmake --build _build - name: Test - run: | - cd _build/test - ./test_xsimd + run: ./_build/test/test_xsimd diff --git a/.github/workflows/style-check.yml b/.github/workflows/style-check.yml index 81f0e8383..f308b99eb 100644 --- a/.github/workflows/style-check.yml +++ b/.github/workflows/style-check.yml @@ -8,17 +8,43 @@ jobs: name: Format check runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - name: Run clang-format style check for C/C++ programs. - uses: jidicula/clang-format-action@v4.11.0 - with: - clang-format-version: '17' - exclude-regex: 'doctest.h' + - uses: actions/checkout@v6 + - run: sudo apt install clang-format + - run: | + git fetch origin ${{ github.event.pull_request.base.sha }} + git clang-format --diff ${{ github.event.pull_request.base.sha }} | tee diff.patch + ! grep -q '^diff ' diff.patch + inlining-check: runs-on: ubuntu-latest name: Check inline keyword usage steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v6 - run: sudo apt install clang-tools - run: sh ./test/check_inline_specifier.sh . + include-check: + runs-on: ubuntu-latest + name: Check unused standard includes + steps: + - uses: actions/checkout@v6 + - run: pip install diskarzhan + - run: diskarzhan `find -name '*.[ch]pp'` + + clang-tidy-check: + name: Clang-tidy check (x86_64) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - run: sudo apt install clang-tidy + - name: Configure + run: cmake -B _build + -DCMAKE_CXX_COMPILER=clang++ + -DBUILD_TESTS=ON + -DDOWNLOAD_DOCTEST=ON + -DCMAKE_BUILD_TYPE=Debug + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON + -DCMAKE_CXX_FLAGS='-march=tigerlake' + . + - name: Check + run: run-clang-tidy -p _build diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index e9e782aed..10ce6e982 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -3,8 +3,9 @@ on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true + jobs: - build: + build-windows-x86: name: 'MSVC ${{ matrix.os }}, ${{ matrix.target }} ${{ matrix.sys.set }}' defaults: run: @@ -41,18 +42,21 @@ jobs: python3 -m pip install --upgrade pip setuptools wheel python3 -m pip install ninja - name: Checkout xsimd - uses: actions/checkout@v3 + uses: actions/checkout@v6 - name: Setup - run: | - mkdir _build - cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="${{ matrix.sys.flags }}" -G Ninja + run: cmake -B _build -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="${{ matrix.sys.flags }}" -G Ninja - name: Build - run: | - cd _build && cmake --build . + run: cmake --build _build - name: Testing xsimd if: ${{ !startsWith(matrix.sys.set, 'AVX512') }} - run: | - cd _build && ./test/test_xsimd + env: + # Set CPU feature test expectations + # Assuming the runner always has AVX2 (independent of compilation option) + XSIMD_TEST_CPU_ASSUME_NEON64: "0" + XSIMD_TEST_CPU_ASSUME_SSE4_2: "1" + XSIMD_TEST_CPU_ASSUME_AVX2: "1" + XSIMD_TEST_CPU_ASSUME_MANUFACTURER: "intel,amd" + run: ./_build/test/test_xsimd build-windows-mingw: name: 'MSYS2 ${{ matrix.msystem }}' @@ -80,29 +84,34 @@ jobs: cmake:p ninja:p - name: Checkout xsimd - uses: actions/checkout@v2 + uses: actions/checkout@v6 - name: Configure - run: | - mkdir _build - cd _build - cmake .. -DBUILD_TESTS=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DDOWNLOAD_DOCTEST=ON -G Ninja + run: cmake -B _build -DBUILD_TESTS=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DDOWNLOAD_DOCTEST=ON -G Ninja - name: Build - run: ninja -C _build + run: cmake --build _build - name: Test - run: | - cd _build && ./test/test_xsimd + run: ./_build/test/test_xsimd - build-windows-arm64: - name: 'MSVC arm64' + build-windows-clang-cl: + name: 'clang-cl x64 ${{ matrix.config.name }}' defaults: run: shell: bash {0} - runs-on: windows-11-arm + strategy: + matrix: + config: + - { name: "AVX2", flags: "/arch:AVX2", benchmark: "ON", examples: "ON" } + - { name: "/fp:fast", flags: "/fp:fast", benchmark: "OFF", examples: "OFF" } + runs-on: windows-2025 steps: - name: Setup compiler uses: ilammy/msvc-dev-cmd@v1 with: arch: amd64 + - name: Check clang-cl + run: | + command -v clang-cl + clang-cl --version - name: Setup Ninja run: | python3 -m pip install --upgrade pip setuptools wheel @@ -111,11 +120,41 @@ jobs: uses: actions/checkout@v3 - name: Setup run: | - mkdir _build - cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -G Ninja + cmake -B _build \ + -DBUILD_TESTS=ON \ + -DDOWNLOAD_DOCTEST=ON \ + -DBUILD_BENCHMARK=${{ matrix.config.benchmark }} \ + -DBUILD_EXAMPLES=${{ matrix.config.examples }} \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER=clang-cl \ + -DCMAKE_CXX_COMPILER=clang-cl \ + -DCMAKE_CXX_FLAGS="${{ matrix.config.flags }} -DXSIMD_REASSOCIATIVE_MATH=1" \ + -G Ninja - name: Build - run: | - cd _build && cmake --build . + run: cmake --build _build - name: Testing xsimd + run: ./_build/test/test_xsimd + + build-windows-arm64: + name: 'MSVC arm64' + defaults: + run: + shell: bash {0} + runs-on: windows-11-arm + steps: + - name: Setup compiler + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: arm64 + - name: Setup Ninja run: | - cd _build && ./test/test_xsimd + python3 -m pip install --upgrade pip setuptools wheel + python3 -m pip install ninja + - name: Checkout xsimd + uses: actions/checkout@v6 + - name: Setup + run: cmake -B _build -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -G Ninja + - name: Build + run: cmake --build _build + - name: Testing xsimd + run: ./_build/test/test_xsimd diff --git a/CMakeLists.txt b/CMakeLists.txt index f817b6b51..66c01f281 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,16 +9,17 @@ # The full license is in the file LICENSE, distributed with this software. # ############################################################################ -cmake_minimum_required(VERSION 3.10) +cmake_minimum_required(VERSION 3.13) + project(xsimd) -option(XSIMD_REFACTORING ON) -set(XSIMD_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) +OPTION(ENABLE_XTL_COMPLEX "enables support for xcomplex defined in xtl" OFF) +OPTION(BUILD_TESTS "xsimd test suite" OFF) # Versioning # ========== -file(STRINGS "${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_config.hpp" xsimd_version_defines +file(STRINGS "include/xsimd/config/xsimd_config.hpp" xsimd_version_defines REGEX "#define XSIMD_VERSION_(MAJOR|MINOR|PATCH)") foreach(ver ${xsimd_version_defines}) if(ver MATCHES "#define XSIMD_VERSION_(MAJOR|MINOR|PATCH) +([^ ]+)$") @@ -32,74 +33,25 @@ message(STATUS "xsimd v${${PROJECT_NAME}_VERSION}") # Build # ===== -set(XSIMD_HEADERS -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_constants.hpp -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_avx.hpp -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_avx2.hpp -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_sse.hpp -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma4.hpp -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_common.hpp -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_isa.hpp -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_neon.hpp -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_neon64.hpp -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_rvv.hpp -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_scalar.hpp -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse2.hpp -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse3.hpp -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse4_1.hpp -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse4_2.hpp -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_ssse3.hpp -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_vsx.hpp -${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sve.hpp -${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_arch.hpp -${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_config.hpp -${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_cpuid.hpp -${XSIMD_INCLUDE_DIR}/xsimd/memory/xsimd_aligned_allocator.hpp -${XSIMD_INCLUDE_DIR}/xsimd/memory/xsimd_alignment.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_all_registers.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_api.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon64_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx2_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx512f_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_batch.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_batch_constant.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma3_avx_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma3_avx2_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma3_sse_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma4_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_common_arch.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_vsx_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_rvv_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse2_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse3_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse4_1_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse4_2_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_ssse3_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sve_register.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_traits.hpp -${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_utils.hpp -${XSIMD_INCLUDE_DIR}/xsimd/xsimd.hpp -) - add_library(xsimd INTERFACE) +add_library(xsimd::xsimd ALIAS xsimd) target_include_directories(xsimd INTERFACE - $ + $ $) -OPTION(ENABLE_XTL_COMPLEX "enables support for xcomplex defined in xtl" OFF) -OPTION(BUILD_TESTS "xsimd test suite" OFF) +target_compile_features(xsimd INTERFACE cxx_std_14) +# Only add xtl build option to the build tree, that is, if xsimd being locally +# developed or is vendored. +# Otherwise (if an install is performed), this will be handled in the user +# cmake script (xsimdConfig.cmake). if(ENABLE_XTL_COMPLEX) find_package(xtl 0.8.0 REQUIRED) - target_compile_features(xsimd INTERFACE cxx_std_14) - target_compile_definitions(xsimd INTERFACE XSIMD_ENABLE_XTL_COMPLEX=1) - target_link_libraries(xsimd INTERFACE xtl) -else() - target_compile_features(xsimd INTERFACE cxx_std_11) + target_link_libraries(xsimd INTERFACE $) + target_compile_definitions(xsimd INTERFACE + $ + ) endif() if(BUILD_TESTS) @@ -125,8 +77,6 @@ if(${XSIMD_SKIP_INSTALL}) return() # skip installation endif () -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") -include(JoinPaths) include(GNUInstallDirs) include(CMakePackageConfigHelpers) @@ -137,7 +87,7 @@ install(TARGETS xsimd export(EXPORT ${PROJECT_NAME}-targets FILE "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Targets.cmake") -install(DIRECTORY ${XSIMD_INCLUDE_DIR}/xsimd +install(DIRECTORY include/xsimd DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". diff --git a/Changelog.rst b/Changelog.rst index cc5d373cb..2a600241b 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -9,6 +9,102 @@ Changelog ========= +14.2.0 +------ + + * **New architecture**: IBM Z (s390x) support + + * [API] New cross-platform ``cpu_features`` API for querying CPU features available at runtime + + * [API] Add ``xsimd::get()`` for compile-time lane extraction + + * [API] Add ``xsimd::stream_load``, ``xsimd::stream_store``, and ``xsimd::fence`` for non-temporal memory transfers + + * [VSX] Fix dynamic dispatch support with runtime cpu feature inspection + + * [VSX] Fix rounding + + * [SVE/RVV] Fix dynamic dispatch by inspecting available vector length + + * [AVX2] Add native ``uint64``/``int64`` multiplication kernel + + * [NEON] Add support for Windows ARM + + * [NEON] Simplify static dispatch of intrinsicts + + * [NEON] Fix ``batch_bool`` store on ARM by replacing ``vst1_lane_u32`` with a full + lane store followed by a memcpy + + * [SVE] Fix dynamic dispatch ODR violation + + * [ci] Fix emulated architecture interaction with AVX512 leading to CI failures. + Provide a cmake-level configuration switch for emulated build + + * Fix build with compilers that do not support C++20 (even though we only require C++14) + + * Fix ``xsimd::signbit`` scalar overload leaking into non-scalar overload resolution + + * Fix complex batch load + + * Harden fast-math reassociation barriers + + * Publish the C++14 requirement through the CMake interface + +14.1.0 +------ + + * Add popcnt and bmi + + * [API] Add bitwise-shift batch constant api + + * Refactor x86 CPU features + + * [NEON] Unsigned bitwise shifts are never called + + * Improve coverage of emulated architectures + + * Introduce `count{l,r}_{zero,one}` for `batch_bool` + + * Fix emulated mask() + + * [neon] Implement bitwise_rshift for 64 bit integers on arm32 + + * Fix fast_cast int64/uint64→double under -ffast-math + + * Small complexity reduction + + * Add make_batch_constant from std::array in C++20 + + * [ci] Use home-baked clang-format action + + * Fix apple detection + + * [ci] add GCC 10 with AVX-512 to test matrix + + * Slighly less pessimistic detection of neon64 + + * Fix runtime detection of SVE + + * [ci] Setup Windows arm64 runner + + * iota batch constant and a few overloads + + * [test] Improve testing logging and accuracy + + * Fix default values for AVX and AVX512 OS state enabled flags + + * Implement batch_bool::mask() for riscv + + * [ci] Revert emscripten to 4.0.21 + + * Restore RISCV support + + * Implement optimized movemasks for NEON + + * Fix limit behavior of atan2 under -ffast-math + + * Move to C++14 + 14.0.0 ------ diff --git a/README.md b/README.md index 9b7861210..2c2e459b1 100644 --- a/README.md +++ b/README.md @@ -50,12 +50,13 @@ The following SIMD instruction set extensions are supported: Architecture | Instruction set extensions -------------|----------------------------------------------------- x86 | SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, FMA3+SSE, FMA3+AVX, FMA3+AVX2 -x86 | AVX512BW, AVX512CD, AVX512DQ, AVX512F (gcc7 and higher) +x86 | AVX512BW, AVX512CD, AVX512DQ, AVX512F, AVX512VL (gcc7 and higher) x86 AMD | FMA4 ARM | NEON, NEON64, SVE128/256/512 (fixed vector size) WebAssembly | WASM powerpc64 | VSX RISC-V | RISC-V128/256/512 (fixed vector size) +IBM Z (s390x)| VXE (IBM z14) ## Installation @@ -104,7 +105,7 @@ http://xsimd.readthedocs.io/ | 9.x | ^0.7.0 | | 8.x | ^0.7.0 | -The dependency on `xtl` is required if you want to support vectorization for `xtl::xcomplex`. In this case, you must build your project with C++14 support enabled. +The dependency on `xtl` is required if you want to support vectorization for `xtl::xcomplex`. ## Usage diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 732d8d7b7..dc9011f3f 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -9,13 +9,12 @@ # The full license is in the file LICENSE, distributed with this software. # ############################################################################ -cmake_minimum_required(VERSION 3.10) +cmake_minimum_required(VERSION 3.13) if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) project(xsimd-benchmark) find_package(xsimd REQUIRED CONFIG) - set(XSIMD_INCLUDE_DIR ${xsimd_INCLUDE_DIRS}) endif () if(NOT CMAKE_BUILD_TYPE) @@ -63,18 +62,16 @@ if(MSVC) endforeach() endif() -include_directories(${XSIMD_INCLUDE_DIR}) - -set(XSIMD_BENCHMARK +set(XSIMD_BENCHMARK_SRC main.cpp xsimd_benchmark.hpp ) -set(XSIMD_BENCHMARK_TARGET benchmark_xsimd) -add_executable(${XSIMD_BENCHMARK_TARGET} ${XSIMD_BENCHMARK} ${XSIMD_HEADERS}) +add_executable(benchmark_xsimd ${XSIMD_BENCHMARK_SRC}) +target_link_libraries(benchmark_xsimd PRIVATE xsimd) if(ENABLE_XTL_COMPLEX) target_link_libraries(benchmark_xsimd PRIVATE xtl) endif() -add_custom_target(xbenchmark COMMAND benchmark_xsimd DEPENDS ${XSIMD_BENCHMARK_TARGET}) +add_custom_target(xbenchmark COMMAND benchmark_xsimd DEPENDS benchmark_xsimd) diff --git a/benchmark/main.cpp b/benchmark/main.cpp index 7a630e461..e5ef24365 100644 --- a/benchmark/main.cpp +++ b/benchmark/main.cpp @@ -10,7 +10,10 @@ ****************************************************************************/ #include "xsimd_benchmark.hpp" + +#include #include +#include void benchmark_operation() { diff --git a/benchmark/xsimd_benchmark.hpp b/benchmark/xsimd_benchmark.hpp index 6f6b91bf2..3f48cff40 100644 --- a/benchmark/xsimd_benchmark.hpp +++ b/benchmark/xsimd_benchmark.hpp @@ -14,8 +14,8 @@ #include "xsimd/arch/xsimd_scalar.hpp" #include "xsimd/xsimd.hpp" + #include -#include #include #include diff --git a/cmake/JoinPaths.cmake b/cmake/JoinPaths.cmake deleted file mode 100644 index 32d6d6685..000000000 --- a/cmake/JoinPaths.cmake +++ /dev/null @@ -1,26 +0,0 @@ -# This module provides function for joining paths -# known from from most languages -# -# Original license: -# SPDX-License-Identifier: (MIT OR CC0-1.0) -# Explicit permission given to distribute this module under -# the terms of the project as described in /LICENSE.rst. -# Copyright 2020 Jan Tojnar -# https://github.com/jtojnar/cmake-snips -# -# Modelled after Python’s os.path.join -# https://docs.python.org/3.7/library/os.path.html#os.path.join -# Windows not supported -function(join_paths joined_path first_path_segment) - set(temp_path "${first_path_segment}") - foreach(current_segment IN LISTS ARGN) - if(NOT ("${current_segment}" STREQUAL "")) - if(IS_ABSOLUTE "${current_segment}") - set(temp_path "${current_segment}") - else() - set(temp_path "${temp_path}/${current_segment}") - endif() - endif() - endforeach() - set(${joined_path} "${temp_path}" PARENT_SCOPE) -endfunction() diff --git a/docs/Doxyfile b/docs/Doxyfile index 72cd9c32e..c574a8579 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -15,6 +15,7 @@ INPUT = ../include/xsimd/types/xsimd_api.hpp \ ../include/xsimd/types/xsimd_avx512cd_register.hpp \ ../include/xsimd/types/xsimd_avx512dq_register.hpp \ ../include/xsimd/types/xsimd_avx512f_register.hpp \ + ../include/xsimd/types/xsimd_avx512vl_register.hpp \ ../include/xsimd/types/xsimd_avx_register.hpp \ ../include/xsimd/types/xsimd_fma3_avx_register.hpp \ ../include/xsimd/types/xsimd_fma3_avx2_register.hpp \ diff --git a/docs/source/api/arithmetic_index.rst b/docs/source/api/arithmetic_index.rst index 429600cb3..d4f5deb19 100644 --- a/docs/source/api/arithmetic_index.rst +++ b/docs/source/api/arithmetic_index.rst @@ -40,6 +40,12 @@ Binary operations: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`mul` | per slot multiply | +---------------------------------------+----------------------------------------------------+ +| :cpp:func:`mul_lo` | low N bits of the 2N-bit integer product | ++---------------------------------------+----------------------------------------------------+ +| :cpp:func:`mul_hi` | high N bits of the 2N-bit integer product | ++---------------------------------------+----------------------------------------------------+ +| :cpp:func:`mul_hilo` | pair {hi, lo} of the 2N-bit integer product | ++---------------------------------------+----------------------------------------------------+ | :cpp:func:`div` | per slot division | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`mod` | per slot modulo | diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d26751525..464892f87 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -9,16 +9,14 @@ # The full license is in the file LICENSE, distributed with this software. # ############################################################################ -cmake_minimum_required(VERSION 3.10) +cmake_minimum_required(VERSION 3.13) + if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) project(xsimd-examples) find_package(xsimd REQUIRED CONFIG) - set(XSIMD_INCLUDE_DIR ${xsimd_INCLUDE_DIR}) endif () -include_directories(${XSIMD_INCLUDE_DIR}) - if(NOT CMAKE_BUILD_TYPE) message(STATUS "Setting examples build type to Release") set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) @@ -35,7 +33,8 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" endif() endif() -add_executable(mandelbrot mandelbrot.cpp ${XSIMD_HEADERS}) +add_executable(mandelbrot mandelbrot.cpp) +target_link_libraries(mandelbrot PRIVATE xsimd) set_property(TARGET mandelbrot PROPERTY CXX_STANDARD 14) if(ENABLE_XTL_COMPLEX) target_link_libraries(mandelbrot PRIVATE xtl) diff --git a/examples/mandelbrot.cpp b/examples/mandelbrot.cpp index 9a0d80e7a..0e001689f 100644 --- a/examples/mandelbrot.cpp +++ b/examples/mandelbrot.cpp @@ -13,15 +13,15 @@ // https://github.com/ospray/tsimd/blob/master/benchmarks/mandelbrot.cpp // Author Jefferson Amstutz / intel +#include "pico_bench.hpp" + +#include + #include #include #include #include -#include "pico_bench.hpp" - -#include - // helper function to write the rendered image as PPM file inline void writePPM(const std::string& fileName, const int sizeX, diff --git a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp index ff2fb4118..27b5ef24f 100644 --- a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp +++ b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp @@ -12,12 +12,13 @@ #ifndef XSIMD_COMMON_ARITHMETIC_HPP #define XSIMD_COMMON_ARITHMETIC_HPP +#include "../../types/xsimd_batch_constant.hpp" +#include "./xsimd_common_details.hpp" + #include #include #include - -#include "../../types/xsimd_batch_constant.hpp" -#include "./xsimd_common_details.hpp" +#include namespace xsimd { @@ -177,6 +178,122 @@ namespace xsimd self, other); } + // mul_hi + namespace detail + { + template + struct mulhi_helper + { + using wider = std::conditional_t< + std::is_signed::value, + std::conditional_t>, + std::conditional_t>>; + + static XSIMD_INLINE T compute(T x, T y) noexcept + { + constexpr int shift = 8 * sizeof(T); + return static_cast((static_cast(x) * static_cast(y)) >> shift); + } + }; + + // 64-bit unsigned software mul_hi via 32-bit splits + XSIMD_INLINE uint64_t mulhi_u64(uint64_t x, uint64_t y) noexcept + { +#if defined(__SIZEOF_INT128__) + return static_cast((static_cast(x) * static_cast(y)) >> 64); +#else + uint64_t xl = x & 0xffffffffULL; + uint64_t xh = x >> 32; + uint64_t yl = y & 0xffffffffULL; + uint64_t yh = y >> 32; + uint64_t ll = xl * yl; + uint64_t lh = xl * yh; + uint64_t hl = xh * yl; + uint64_t hh = xh * yh; + uint64_t mid = (ll >> 32) + (lh & 0xffffffffULL) + (hl & 0xffffffffULL); + return hh + (lh >> 32) + (hl >> 32) + (mid >> 32); +#endif + } + + XSIMD_INLINE int64_t mulhi_i64(int64_t x, int64_t y) noexcept + { +#if defined(__SIZEOF_INT128__) + return static_cast((static_cast<__int128>(x) * static_cast<__int128>(y)) >> 64); +#else + uint64_t uhi = mulhi_u64(static_cast(x), static_cast(y)); + if (x < 0) + uhi -= static_cast(y); + if (y < 0) + uhi -= static_cast(x); + return static_cast(uhi); +#endif + } + + template <> + struct mulhi_helper + { + static XSIMD_INLINE uint64_t compute(uint64_t x, uint64_t y) noexcept { return mulhi_u64(x, y); } + }; + + template <> + struct mulhi_helper + { + static XSIMD_INLINE int64_t compute(int64_t x, int64_t y) noexcept { return mulhi_i64(x, y); } + }; + + // 64x64 unsigned mul_hi via 32x32->64 widening mul (WMul wraps _mm*_mul_epu32). + template + XSIMD_INLINE batch mulhi_u64_core(batch const& x, + batch const& y, + WMul mul_epu32) noexcept + { + using B = batch; + const B mask(uint64_t(0xffffffffULL)); + B xl = x & mask; + B xh = x >> 32; + B yl = y & mask; + B yh = y >> 32; + B ll = mul_epu32(xl, yl); + B lh = mul_epu32(xl, yh); + B hl = mul_epu32(xh, yl); + B hh = mul_epu32(xh, yh); + B mid = (ll >> 32) + (lh & mask) + (hl & mask); + return hh + (lh >> 32) + (hl >> 32) + (mid >> 32); + } + + // Signed variant: unsigned core + sign fixup via arithmetic shift-by-63. + template + XSIMD_INLINE batch mulhi_i64_core(batch const& x, + batch const& y, + WMul mul_epu32) noexcept + { + auto ux = ::xsimd::bitwise_cast(x); + auto uy = ::xsimd::bitwise_cast(y); + auto uhi = mulhi_u64_core(ux, uy, mul_epu32); + auto sa = ::xsimd::bitwise_cast(x >> 63); + auto sb = ::xsimd::bitwise_cast(y >> 63); + return ::xsimd::bitwise_cast(uhi - (uy & sa) - (ux & sb)); + } + } + + template ::value>*/> + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::apply([](T x, T y) noexcept -> T + { return detail::mulhi_helper::compute(x, y); }, + self, other); + } + + // mul_hilo + template ::value>*/> + XSIMD_INLINE std::pair, batch> + mul_hilo(batch const& self, batch const& other, requires_arch) noexcept + { + return std::pair, batch> { mul_hi(self, other, A {}), self * other }; + } + // rotl template XSIMD_INLINE batch rotl(batch const& self, STy other, requires_arch) noexcept diff --git a/include/xsimd/arch/common/xsimd_common_bit.hpp b/include/xsimd/arch/common/xsimd_common_bit.hpp new file mode 100644 index 000000000..5cd99c1cf --- /dev/null +++ b/include/xsimd/arch/common/xsimd_common_bit.hpp @@ -0,0 +1,232 @@ +/**************************************************************** + * Partial backport of `__cpp_lib_bitops == 201907L` from C++20 * + ****************************************************************/ + +#ifndef XSIMD_BIT_HPP +#define XSIMD_BIT_HPP + +#include "../../config/xsimd_config.hpp" + +#if XSIMD_CPP_VERSION > 202002L + +#include + +#if __cpp_lib_bitops >= 201907L + +#include + +namespace xsimd +{ + namespace detail + { + using std::countl_one; + using std::countl_zero; + using std::countr_one; + using std::countr_zero; + using std::popcount; + } +} + +#endif + +#else + +#include +#include + +#ifdef __has_builtin +#define XSIMD_HAS_BUILTIN(x) __has_builtin(x) +#else +#define XSIMD_HAS_BUILTIN(x) 0 +#endif + +#ifdef _MSC_VER +#include +#endif + +namespace xsimd +{ + namespace detail + { + // FIXME: We could do better by dispatching to the appropriate popcount instruction + // depending on the arch. + + template ::value>> + XSIMD_INLINE int popcount(T x) noexcept + { +#if XSIMD_HAS_BUILTIN(__builtin_popcountg) + return __builtin_popcountg(x); +#else + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { +#if XSIMD_HAS_BUILTIN(__builtin_popcount) + return __builtin_popcount(x); +#elif defined(_MSC_VER) + return __popcnt(x); +#else + // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64 + return ((uint64_t)x * 0x200040008001ULL & 0x111111111111111ULL) % 0xf; +#endif + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { +#if XSIMD_HAS_BUILTIN(__builtin_popcount) + return __builtin_popcount(x); +#elif defined(_MSC_VER) + return __popcnt16(x); +#else + // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64 + constexpr unsigned long long msb12 = 0x1001001001001ULL; + constexpr unsigned long long mask5 = 0x84210842108421ULL; + + unsigned int v = (unsigned int)x; + + return ((v & 0xfff) * msb12 & mask5) % 0x1f + + (((v & 0xfff000) >> 12) * msb12 & mask5) % 0x1f; +#endif + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { +#if XSIMD_HAS_BUILTIN(__builtin_popcount) + return __builtin_popcount(x); +#elif defined(_MSC_VER) + return __popcnt(x); +#else + // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + x = x - ((x >> 1) & (T) ~(T)0 / 3); + x = (x & (T) ~(T)0 / 15 * 3) + ((x >> 2) & (T) ~(T)0 / 15 * 3); + x = (x + (x >> 4)) & (T) ~(T)0 / 255 * 15; + return (x * ((T) ~(T)0 / 255)) >> (sizeof(T) - 1) * CHAR_BIT; +#endif + } + else + { + // sizeof(T) == 8 +#if XSIMD_HAS_BUILTIN(__builtin_popcountll) + return __builtin_popcountll(x); +#elif XSIMD_HAS_BUILTIN(__builtin_popcount) + return __builtin_popcount((unsigned int)x) + __builtin_popcount((unsigned int)(x >> 32)); +#elif defined(_MSC_VER) +#ifdef _M_X64 + return (int)__popcnt64(x); +#else + return (int)(__popcnt((unsigned int)x) + __popcnt((unsigned int)(x >> 32))); +#endif +#else + // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel + x = x - ((x >> 1) & (T) ~(T)0 / 3); + x = (x & (T) ~(T)0 / 15 * 3) + ((x >> 2) & (T) ~(T)0 / 15 * 3); + x = (x + (x >> 4)) & (T) ~(T)0 / 255 * 15; + return (x * ((T) ~(T)0 / 255)) >> (sizeof(T) - 1) * CHAR_BIT; +#endif + } +#endif + } + + template ::value>> + XSIMD_INLINE int countl_zero(T x) noexcept + { +#if XSIMD_HAS_BUILTIN(__builtin_clzg) + return __builtin_clzg(x, (int)(sizeof(T) * CHAR_BIT)); +#else + if (x == 0) + return sizeof(T) * CHAR_BIT; + + XSIMD_IF_CONSTEXPR(sizeof(T) <= 4) + { +#if XSIMD_HAS_BUILTIN(__builtin_clz) + return __builtin_clz((unsigned int)x) - (4 - sizeof(T)) * CHAR_BIT; +#elif defined(_MSC_VER) + unsigned long index; + _BitScanReverse(&index, (unsigned long)x); + return sizeof(T) * CHAR_BIT - index - 1; +#else + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + XSIMD_IF_CONSTEXPR(sizeof(T) >= 2) + { + x |= x >> 8; + } + XSIMD_IF_CONSTEXPR(sizeof(T) >= 4) + { + x |= x >> 16; + } + return sizeof(T) * CHAR_BIT - popcount(x); +#endif + } + else + { + // sizeof(T) == 8 +#if XSIMD_HAS_BUILTIN(__builtin_clzll) + return __builtin_clzll((unsigned long long)x); +#elif defined(_MSC_VER) && defined(_M_X64) + unsigned long index; + _BitScanReverse64(&index, (unsigned long long)x); + return sizeof(T) * CHAR_BIT - index - 1; +#else + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + x |= x >> 32; + return sizeof(T) * CHAR_BIT - popcount(x); +#endif + } +#endif + } + + template ::value>> + XSIMD_INLINE int countl_one(T x) noexcept + { + return countl_zero(T(~x)); + } + + template ::value>> + XSIMD_INLINE int countr_zero(T x) noexcept + { +#if XSIMD_HAS_BUILTIN(__builtin_ctzg) + return __builtin_ctzg(x, (int)(sizeof(T) * CHAR_BIT)); +#else + if (x == 0) + return sizeof(T) * CHAR_BIT; + + XSIMD_IF_CONSTEXPR(sizeof(T) <= 4) + { +#if XSIMD_HAS_BUILTIN(__builtin_ctz) + return __builtin_ctz((unsigned int)x); +#elif defined(_MSC_VER) + unsigned long index; + _BitScanForward(&index, (unsigned long)x); + return index; +#endif + } + else + { + // sizeof(T) == 8 +#if XSIMD_HAS_BUILTIN(__builtin_ctzll) + return __builtin_ctzll((unsigned long long)x); +#elif defined(_MSC_VER) && defined(_M_X64) + unsigned long index; + _BitScanForward64(&index, (unsigned long long)x); + return index; +#endif + } + + // https://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup + return popcount((T)((x & -x) - 1)); +#endif + } + + template ::value>> + XSIMD_INLINE int countr_one(T x) noexcept + { + return countr_zero(T(~x)); + } + + } +} + +#endif +#endif diff --git a/include/xsimd/arch/common/xsimd_common_cast.hpp b/include/xsimd/arch/common/xsimd_common_cast.hpp index 1226c887c..95753babd 100644 --- a/include/xsimd/arch/common/xsimd_common_cast.hpp +++ b/include/xsimd/arch/common/xsimd_common_cast.hpp @@ -12,7 +12,10 @@ #ifndef XSIMD_COMMON_CAST_HPP #define XSIMD_COMMON_CAST_HPP -#include "../../types/xsimd_traits.hpp" +#include "../../config/xsimd_macros.hpp" +#include "../../utils/xsimd_type_traits.hpp" + +#include namespace xsimd { diff --git a/include/xsimd/arch/common/xsimd_common_complex.hpp b/include/xsimd/arch/common/xsimd_common_complex.hpp index 874825182..cb25535e1 100644 --- a/include/xsimd/arch/common/xsimd_common_complex.hpp +++ b/include/xsimd/arch/common/xsimd_common_complex.hpp @@ -12,10 +12,10 @@ #ifndef XSIMD_COMMON_COMPLEX_HPP #define XSIMD_COMMON_COMPLEX_HPP -#include - #include "./xsimd_common_details.hpp" +#include + namespace xsimd { diff --git a/include/xsimd/arch/common/xsimd_common_details.hpp b/include/xsimd/arch/common/xsimd_common_details.hpp index efe01806b..a99f19319 100644 --- a/include/xsimd/arch/common/xsimd_common_details.hpp +++ b/include/xsimd/arch/common/xsimd_common_details.hpp @@ -12,13 +12,13 @@ #ifndef XSIMD_COMMON_DETAILS_HPP #define XSIMD_COMMON_DETAILS_HPP -#include - #include "../../math/xsimd_rem_pio2.hpp" #include "../../types/xsimd_common_arch.hpp" #include "../../types/xsimd_utils.hpp" #include "../xsimd_constants.hpp" +#include + namespace xsimd { // Forward declaration. Should we put them in a separate file? @@ -111,6 +111,69 @@ namespace xsimd namespace detail { + // Prevent -ffast-math from reassociating floating-point + // arithmetic across this point. The reason string + // documents *why* at each call site; unused at runtime. + // + // Zero-cost register constraints per target: + // x86 "+x" (XMM/YMM/ZMM, also scalar float/double) + // ARM "+w" (V-reg / SVE Z-reg, also scalar float/double) + // PPC "+wa" (VS register, also scalar float/double) + // RISC-V "+f" (F/D register, scalar float/double) + // RISC-V RVV "+vr" (V register; GCC 15+ / Clang 20+) + // + // On unknown targets the "+m" fallback spills; it is + // only emitted when the compiler can actually reassociate. + template + XSIMD_INLINE void reassociation_barrier(T& x, const char*) noexcept + { +#if XSIMD_REASSOCIATIVE_MATH && XSIMD_WITH_INLINE_ASM && !defined(__EMSCRIPTEN__) +#if XSIMD_WITH_SSE2 + __asm__ volatile("" : "+x"(x)); +#elif XSIMD_WITH_NEON || XSIMD_WITH_SVE + __asm__ volatile("" : "+w"(x)); +#elif XSIMD_WITH_VSX + __asm__ volatile("" : "+wa"(x)); +#else + __asm__ volatile("" : "+m"(x)); +#endif +#else + (void)x; +#endif + } + + // RISC-V scalar float/double: use F/D registers instead of + // spilling through "+m". These overloads also serve + // emulated batches on RISC-V via the std::array overload. +#if XSIMD_REASSOCIATIVE_MATH && XSIMD_WITH_INLINE_ASM && defined(__riscv) + XSIMD_INLINE void reassociation_barrier(float& x, const char*) noexcept + { + __asm__ volatile("" : "+f"(x)); + } + XSIMD_INLINE void reassociation_barrier(double& x, const char*) noexcept + { + __asm__ volatile("" : "+f"(x)); + } +#endif + + template + XSIMD_INLINE void reassociation_barrier(std::array& arr, const char* reason) noexcept + { + for (auto& v : arr) + reassociation_barrier(v, reason); + } + + template + XSIMD_INLINE void reassociation_barrier(batch& b, const char* reason) noexcept + { +#if XSIMD_REASSOCIATIVE_MATH && XSIMD_WITH_RVV && XSIMD_WITH_INLINE_ASM && ((__GNUC__ >= 15) || (__clang_major__ >= 20)) + __asm__ volatile("" : "+vr"(b.data.value.value)); + (void)reason; +#else + reassociation_barrier(b.data, reason); +#endif + } + template XSIMD_INLINE batch apply(F&& func, batch const& self, batch const& other) noexcept { diff --git a/include/xsimd/arch/common/xsimd_common_logical.hpp b/include/xsimd/arch/common/xsimd_common_logical.hpp index 3716f6282..6ee5218aa 100644 --- a/include/xsimd/arch/common/xsimd_common_logical.hpp +++ b/include/xsimd/arch/common/xsimd_common_logical.hpp @@ -12,6 +12,7 @@ #ifndef XSIMD_COMMON_LOGICAL_HPP #define XSIMD_COMMON_LOGICAL_HPP +#include "./xsimd_common_bit.hpp" #include "./xsimd_common_details.hpp" #include @@ -28,43 +29,37 @@ namespace xsimd template XSIMD_INLINE size_t count(batch_bool const& self, requires_arch) noexcept { - uint64_t m = self.mask(); - XSIMD_IF_CONSTEXPR(batch_bool::size < 14) - { - // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64 - return (m * 0x200040008001ULL & 0x111111111111111ULL) % 0xf; - } - else - { -#if defined __has_builtin -#if __has_builtin(__builtin_popcountg) -#define builtin_popcount(v) __builtin_popcountg(v) -#endif -#endif + return xsimd::detail::popcount(self.mask()); + } -#ifdef builtin_popcount - return builtin_popcount(m); -#else - // FIXME: we could do better by dispatching to the appropriate - // popcount instruction depending on the arch... - XSIMD_IF_CONSTEXPR(batch_bool::size <= 32) - { - uint32_t m32 = static_cast(m); - // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel - m32 = m32 - ((m32 >> 1) & 0x55555555); // reuse input as temporary - m32 = (m32 & 0x33333333) + ((m32 >> 2) & 0x33333333); // temp - return (((m32 + (m32 >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; // count - } - else - { - // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel - m = m - ((m >> 1) & (uint64_t) ~(uint64_t)0 / 3); // temp - m = (m & (uint64_t) ~(uint64_t)0 / 15 * 3) + ((m >> 2) & (uint64_t) ~(uint64_t)0 / 15 * 3); // temp - m = (m + (m >> 4)) & (uint64_t) ~(uint64_t)0 / 255 * 15; // temp - return (m * ((uint64_t) ~(uint64_t)0 / 255)) >> (sizeof(uint64_t) - 1) * CHAR_BIT; // count - } -#endif - } + template + XSIMD_INLINE size_t countl_zero(batch_bool const& self, requires_arch) noexcept + { + constexpr size_t unused_bits = 64 - batch_bool::size; + constexpr uint64_t lower_mask = batch_bool::size < 64 ? ((uint64_t)1 << (batch_bool::size % 64)) - 1 : (uint64_t)-1; + return xsimd::detail::countl_zero(self.mask() & lower_mask) - unused_bits; + } + + template + XSIMD_INLINE size_t countl_one(batch_bool const& self, requires_arch) noexcept + { + constexpr size_t unused_bits = 64 - batch_bool::size; + constexpr uint64_t upper_mask = batch_bool::size < 64 ? ~(((uint64_t)1 << (batch_bool::size % 64)) - 1) : (uint64_t)0; + return xsimd::detail::countl_one(self.mask() | upper_mask) - unused_bits; + } + + template + XSIMD_INLINE size_t countr_zero(batch_bool const& self, requires_arch) noexcept + { + constexpr uint64_t stop = batch_bool::size < 64 ? (uint64_t)1 << (batch_bool::size % 64) : 0; + return xsimd::detail::countr_zero(self.mask() | stop); + } + + template + XSIMD_INLINE size_t countr_one(batch_bool const& self, requires_arch) noexcept + { + constexpr uint64_t stop = batch_bool::size < 64 ? ~((uint64_t)1 << (batch_bool::size % 64)) : (uint64_t)-1; + return xsimd::detail::countr_one(self.mask() & stop); } // from mask diff --git a/include/xsimd/arch/common/xsimd_common_math.hpp b/include/xsimd/arch/common/xsimd_common_math.hpp index f84883405..7dbb0ac05 100644 --- a/include/xsimd/arch/common/xsimd_common_math.hpp +++ b/include/xsimd/arch/common/xsimd_common_math.hpp @@ -743,7 +743,9 @@ namespace xsimd static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept { batch_type k = nearbyint(constants::invlog_2() * a); + detail::reassociation_barrier(k, "compensated exp range reduction"); x = fnma(k, constants::log_2hi(), a); + detail::reassociation_barrier(x, "compensated exp range reduction"); x = fnma(k, constants::log_2lo(), x); return k; } @@ -769,7 +771,9 @@ namespace xsimd static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept { batch_type k = nearbyint(constants::invlog10_2() * a); + detail::reassociation_barrier(k, "compensated exp10 range reduction"); x = fnma(k, constants::log10_2hi(), a); + detail::reassociation_barrier(x, "compensated exp10 range reduction"); x -= k * constants::log10_2lo(); return k; } @@ -794,6 +798,7 @@ namespace xsimd static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept { batch_type k = nearbyint(a); + detail::reassociation_barrier(k, "compensated exp2 range reduction"); x = (a - k); return k; } @@ -819,7 +824,9 @@ namespace xsimd static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& hi, batch_type& lo, batch_type& x) noexcept { batch_type k = nearbyint(constants::invlog_2() * a); + detail::reassociation_barrier(k, "compensated exp range reduction"); hi = fnma(k, constants::log_2hi(), a); + detail::reassociation_barrier(hi, "compensated exp range reduction"); lo = k * constants::log_2lo(); x = hi - lo; return k; @@ -846,7 +853,9 @@ namespace xsimd static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept { batch_type k = nearbyint(constants::invlog10_2() * a); + detail::reassociation_barrier(k, "compensated exp10 range reduction"); x = fnma(k, constants::log10_2hi(), a); + detail::reassociation_barrier(x, "compensated exp10 range reduction"); x = fnma(k, constants::log10_2lo(), x); return k; } @@ -878,6 +887,7 @@ namespace xsimd { batch_type k = nearbyint(a); x = (a - k) * constants::log_2(); + detail::reassociation_barrier(x, "keep reduced exponent ordered before finalize"); return k; } @@ -937,7 +947,10 @@ namespace xsimd template XSIMD_INLINE batch exp10(batch const& self, requires_arch) noexcept { - return detail::exp(self); + using batch_type = batch; + batch_type out = detail::exp(self); + detail::reassociation_barrier(out, "prevent folding exp10 for literal inputs"); + return out; } // exp2 @@ -1494,6 +1507,7 @@ namespace xsimd batch_type R = t2 + t1; batch_type hfsq = batch_type(0.5) * f * f; batch_type dk = to_float(k); + detail::reassociation_barrier(dk, "keep compensated k conversion before split log(2) scaling"); batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo()) - hfsq + f); #ifdef __FAST_MATH__ return r; @@ -1525,6 +1539,7 @@ namespace xsimd hx += 0x3ff00000 - 0x3fe6a09e; k += (hx >> 20) - 0x3ff; batch_type dk = to_float(k); + detail::reassociation_barrier(dk, "keep compensated k conversion before split log(2) scaling"); hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); @@ -1584,6 +1599,7 @@ namespace xsimd batch_type R = t1 + t2; batch_type hfsq = batch_type(0.5) * f * f; batch_type dk = to_float(k); + detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion"); batch_type r = fma(fms(s, hfsq + R, hfsq) + f, constants::invlog_2(), dk); #ifdef __FAST_MATH__ return r; @@ -1629,7 +1645,9 @@ namespace xsimd batch_type val_hi = hi * constants::invlog_2hi(); batch_type val_lo = fma(lo + hi, constants::invlog_2lo(), lo * constants::invlog_2hi()); batch_type dk = to_float(k); + detail::reassociation_barrier(dk, "Kahan compensated log2 summation"); batch_type w1 = dk + val_hi; + detail::reassociation_barrier(w1, "Kahan compensated log2 summation"); val_lo += (dk - w1) + val_hi; val_hi = w1; batch_type r = val_lo + val_hi; @@ -1705,6 +1723,7 @@ namespace xsimd batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type dk = to_float(k); + detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion"); batch_type hfsq = batch_type(0.5) * f * f; batch_type hibits = f - hfsq; hibits &= ::xsimd::bitwise_cast(i_type(0xfffff000)); @@ -1752,10 +1771,11 @@ namespace xsimd #endif hx += 0x3ff00000 - 0x3fe6a09e; k += (hx >> 20) - 0x3ff; + batch_type dk = to_float(k); + detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion"); hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); batch_type f = --x; - batch_type dk = to_float(k); batch_type s = f / (batch_type(2.) + f); batch_type z = s * s; batch_type w = z * z; @@ -1818,6 +1838,7 @@ namespace xsimd batch_type R = t2 + t1; batch_type hfsq = batch_type(0.5) * f * f; batch_type dk = to_float(k); + detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion"); /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ batch_type c = select(batch_bool_cast(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf; batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo() + c) - hfsq + f); @@ -1853,6 +1874,7 @@ namespace xsimd batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type dk = to_float(k); + detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion"); batch_type r = fma(dk, constants::log_2hi(), fma(s, hfsq + R, dk * constants::log_2lo() + c) - hfsq + f); #ifdef __FAST_MATH__ return r; @@ -1900,17 +1922,9 @@ namespace xsimd batch_type s = bitofsign(self); batch_type v = self ^ s; batch_type t2n = constants::twotonmb(); - // Under fast-math, reordering is possible and the compiler optimizes d - // to v. That's not what we want, so prevent compiler optimization here. - // FIXME: it may be better to emit a memory barrier here (?). -#ifdef __FAST_MATH__ batch_type d0 = v + t2n; - asm volatile("" ::"r"(&d0) : "memory"); + detail::reassociation_barrier(d0, "prevent collapsing (v + 2^n) - 2^n back to v"); batch_type d = d0 - t2n; -#else - batch_type d0 = v + t2n; - batch_type d = d0 - t2n; -#endif return s ^ select(v < t2n, d, v); } } @@ -2192,12 +2206,16 @@ namespace xsimd template XSIMD_INLINE batch remainder(batch const& self, batch const& other, requires_arch) noexcept { - return fnma(nearbyint(self / other), other, self); + batch q = nearbyint(self / other); + detail::reassociation_barrier(q, "prevent pulling multiply back through rounded quotient"); + return fnma(q, other, self); } template XSIMD_INLINE batch remainder(batch const& self, batch const& other, requires_arch) noexcept { - return fnma(nearbyint(self / other), other, self); + batch q = nearbyint(self / other); + detail::reassociation_barrier(q, "prevent pulling multiply back through rounded quotient"); + return fnma(q, other, self); } template ::value>> XSIMD_INLINE batch remainder(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index 6a301dd44..7a1ed73a3 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -12,13 +12,12 @@ #ifndef XSIMD_COMMON_MEMORY_HPP #define XSIMD_COMMON_MEMORY_HPP +#include "../../types/xsimd_batch_constant.hpp" +#include "./xsimd_common_details.hpp" + #include #include #include -#include - -#include "../../types/xsimd_batch_constant.hpp" -#include "./xsimd_common_details.hpp" namespace xsimd { @@ -224,7 +223,8 @@ namespace xsimd template XSIMD_INLINE typename batch, A>::value_type get(batch, A> const& self, ::xsimd::index, requires_arch) noexcept { - alignas(A::alignment()) T buffer[batch, A>::size]; + using value_type = typename batch, A>::value_type; + alignas(A::alignment()) value_type buffer[batch, A>::size]; self.store_aligned(&buffer[0]); return buffer[I]; } @@ -292,6 +292,12 @@ namespace xsimd return load_unaligned(mem, b, A {}); } + template + XSIMD_INLINE batch_bool load_stream(bool const* mem, batch_bool b, requires_arch) noexcept + { + return load_aligned(mem, b, A {}); + } + // load_aligned namespace detail { @@ -438,6 +444,12 @@ namespace xsimd store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); } + template + XSIMD_INLINE batch load_stream(T_in const* mem, convert cvt, requires_arch) noexcept + { + return load_aligned(mem, cvt, A {}); + } + // rotate_right template XSIMD_INLINE batch rotate_right(batch const& self, requires_arch) noexcept @@ -679,6 +691,12 @@ namespace xsimd mem[i] = bool(buffer[i]); } + template + XSIMD_INLINE void store_stream(batch_bool const& self, bool* mem, requires_arch) noexcept + { + store(self, mem, A {}); + } + // store_aligned template XSIMD_INLINE void store_aligned(T_out* mem, batch const& self, requires_arch) noexcept @@ -697,6 +715,12 @@ namespace xsimd return store_aligned(mem, self, common {}); } + template + XSIMD_INLINE void store_stream(T_out* mem, batch const& self, requires_arch) noexcept + { + store_aligned(mem, self, A {}); + } + // swizzle template XSIMD_INLINE batch, A> swizzle(batch, A> const& self, batch_constant mask, requires_arch) noexcept @@ -778,6 +802,12 @@ namespace xsimd return detail::load_complex(hi, lo, A {}); } + template + XSIMD_INLINE batch, A> load_complex_stream(std::complex const* mem, convert>, requires_arch) noexcept + { + return load_complex_aligned(mem, kernel::convert> {}, A {}); + } + // store_complex_aligned template XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept @@ -802,6 +832,12 @@ namespace xsimd hi.store_unaligned(buffer + real_batch::size); } + template + XSIMD_INLINE void store_complex_stream(std::complex* dst, batch, A> const& src, requires_arch) noexcept + { + store_complex_aligned(dst, src, A {}); + } + // transpose template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept diff --git a/include/xsimd/arch/common/xsimd_common_swizzle.hpp b/include/xsimd/arch/common/xsimd_common_swizzle.hpp index 4af2225cd..326340f92 100644 --- a/include/xsimd/arch/common/xsimd_common_swizzle.hpp +++ b/include/xsimd/arch/common/xsimd_common_swizzle.hpp @@ -12,12 +12,12 @@ #ifndef XSIMD_COMMON_SWIZZLE_HPP #define XSIMD_COMMON_SWIZZLE_HPP +#include "../../config/xsimd_macros.hpp" + #include #include #include -#include "../../config/xsimd_inline.hpp" - namespace xsimd { template diff --git a/include/xsimd/arch/common/xsimd_common_trigo.hpp b/include/xsimd/arch/common/xsimd_common_trigo.hpp index 78c1ea30e..d85511d2e 100644 --- a/include/xsimd/arch/common/xsimd_common_trigo.hpp +++ b/include/xsimd/arch/common/xsimd_common_trigo.hpp @@ -551,33 +551,45 @@ namespace xsimd { auto test = x > constants::pio4(); xr = x - constants::pio2_1(); + detail::reassociation_barrier(xr, "ordered pio2 subtraction"); xr -= constants::pio2_2(); + detail::reassociation_barrier(xr, "ordered pio2 subtraction"); xr -= constants::pio2_3(); + detail::reassociation_barrier(xr, "ordered pio2 subtraction"); xr = select(test, xr, x); return select(test, B(1.), B(0.)); } else if (all(x <= constants::twentypi())) { B xi = nearbyint(x * constants::twoopi()); + detail::reassociation_barrier(xi, "preserve quadrant selection"); xr = fnma(xi, constants::pio2_1(), x); + detail::reassociation_barrier(xr, "compensated range reduction"); xr -= xi * constants::pio2_2(); + detail::reassociation_barrier(xr, "compensated range reduction"); xr -= xi * constants::pio2_3(); + detail::reassociation_barrier(xr, "compensated range reduction"); return quadrant(xi); } else if (all(x <= constants::mediumpi())) { B fn = nearbyint(x * constants::twoopi()); + detail::reassociation_barrier(fn, "multi-term range reduction"); B r = x - fn * constants::pio2_1(); + detail::reassociation_barrier(r, "multi-term range reduction"); B w = fn * constants::pio2_1t(); B t = r; w = fn * constants::pio2_2(); r = t - w; + detail::reassociation_barrier(r, "multi-term range reduction"); w = fn * constants::pio2_2t() - ((t - r) - w); t = r; w = fn * constants::pio2_3(); r = t - w; + detail::reassociation_barrier(r, "multi-term range reduction"); w = fn * constants::pio2_3t() - ((t - r) - w); xr = r - w; + detail::reassociation_barrier(xr, "multi-term range reduction"); return quadrant(fn); } else diff --git a/include/xsimd/arch/utils/shifts.hpp b/include/xsimd/arch/utils/shifts.hpp new file mode 100644 index 000000000..719ecfb7a --- /dev/null +++ b/include/xsimd/arch/utils/shifts.hpp @@ -0,0 +1,82 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * Copyright (c) Marco Barbone * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_UTILS_SHIFTS_HPP +#define XSIMD_UTILS_SHIFTS_HPP + +#include "../../config/xsimd_macros.hpp" +#include "../../types/xsimd_batch.hpp" +#include "../../types/xsimd_batch_constant.hpp" +#include "../../types/xsimd_traits.hpp" + +namespace xsimd +{ + namespace kernel + { + namespace utils + { + template + struct select_stride + { + template + static constexpr K get(K i, K) + { + constexpr I values_array[] = { Vs... }; + return static_cast(values_array[length * i + offset]); + } + }; + + template + constexpr I lsb_mask(I bit_index) + { + if (bit_index == 8 * sizeof(I)) + { + return ~I { 0 }; + } + return static_cast((I { 1 } << bit_index) - I { 1 }); + } + + template + constexpr bool all_equals(batch_constant c) + { + return (c == std::integral_constant {}).all(); + } + + template + XSIMD_INLINE batch bitwise_lshift_as_twice_larger( + batch const& self, batch_constant) noexcept + { + using T2 = widen_t; + + const auto self2 = bitwise_cast(self); + + // Lower byte: shift as twice the size and mask bits flowing to higher byte. + constexpr auto shifts_lo = make_batch_constant, A>(); + constexpr auto mask_lo = lsb_mask(8 * sizeof(T)); + const auto shifted_lo = bitwise_lshift(self2, shifts_lo); + constexpr auto batch_mask_lo = make_batch_constant(); + const auto masked_lo = bitwise_and(shifted_lo, batch_mask_lo.as_batch()); + + // Higher byte: mask bits that would flow from lower byte and shift as twice the size. + constexpr auto shifts_hi = make_batch_constant, A>(); + constexpr auto mask_hi = mask_lo << (8 * sizeof(T)); + constexpr auto batch_mask_hi = make_batch_constant(); + const auto masked_hi = bitwise_and(self2, batch_mask_hi.as_batch()); + const auto shifted_hi = bitwise_lshift(masked_hi, shifts_hi); + + return bitwise_cast(bitwise_or(masked_lo, shifted_hi)); + } + } + } +} + +#endif diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp index 4af728e07..1ee0c5b89 100644 --- a/include/xsimd/arch/xsimd_avx.hpp +++ b/include/xsimd/arch/xsimd_avx.hpp @@ -13,13 +13,13 @@ #ifndef XSIMD_AVX_HPP #define XSIMD_AVX_HPP +#include "../types/xsimd_avx_register.hpp" +#include "../types/xsimd_batch_constant.hpp" + #include #include #include -#include "../types/xsimd_avx_register.hpp" -#include "../types/xsimd_batch_constant.hpp" - namespace xsimd { namespace kernel @@ -748,6 +748,80 @@ namespace xsimd return self - batch(mask.data); } + // first (must precede get for two-phase lookup) + template + XSIMD_INLINE float first(batch const& self, requires_arch) noexcept + { + return _mm256_cvtss_f32(self); + } + + template + XSIMD_INLINE double first(batch const& self, requires_arch) noexcept + { + return _mm256_cvtsd_f64(self); + } + + template ::value>> + XSIMD_INLINE T first(batch const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return static_cast(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)) & 0xFF); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return static_cast(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)) & 0xFFFF); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return static_cast(_mm_cvtsi128_si32(_mm256_castsi256_si128(self))); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + batch low = _mm256_castsi256_si128(self); + return first(low, sse4_2 {}); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + // get + template + XSIMD_INLINE float get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); } + constexpr size_t elements_per_lane = batch::size; + constexpr size_t lane = I / elements_per_lane; + constexpr size_t sub_index = I % elements_per_lane; + const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self); + return kernel::get(batch(half), ::xsimd::index {}, sse4_1 {}); + } + + template + XSIMD_INLINE double get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); } + constexpr size_t elements_per_lane = batch::size; + constexpr size_t lane = I / elements_per_lane; + constexpr size_t sub_index = I % elements_per_lane; + const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self); + return kernel::get(batch(half), ::xsimd::index {}, sse4_1 {}); + } + + template ::value>> + XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); } + constexpr size_t elements_per_lane = batch::size; + constexpr size_t lane = I / elements_per_lane; + constexpr size_t sub_index = I % elements_per_lane; + const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self); + return kernel::get(batch(half), ::xsimd::index {}, sse4_1 {}); + } + // insert template ::value>> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept @@ -920,18 +994,18 @@ namespace xsimd using int_t = as_integer_t; constexpr size_t half_size = batch::size / 2; - // confined to lower 128-bit half → forward to SSE2 + // confined to lower 128-bit half → forward to 128 bit XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size) { constexpr auto mlo = ::xsimd::detail::lower_half(batch_bool_constant {}); - const auto lo = load_masked(reinterpret_cast(mem), mlo, convert {}, Mode {}, sse4_2 {}); + const auto lo = load_masked(reinterpret_cast(mem), mlo, convert {}, Mode {}, avx_128 {}); return bitwise_cast(batch(_mm256_zextsi128_si256(lo))); } - // confined to upper 128-bit half → forward to SSE2 + // confined to upper 128-bit half → forward to 128 bit else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size) { constexpr auto mhi = ::xsimd::detail::upper_half(mask); - const auto hi = load_masked(mem + half_size, mhi, convert {}, Mode {}, sse4_2 {}); + const auto hi = load_masked(mem + half_size, mhi, convert {}, Mode {}, avx_128 {}); return detail::zero_extend(hi); } else @@ -962,19 +1036,19 @@ namespace xsimd { constexpr size_t half_size = batch::size / 2; - // confined to lower 128-bit half → forward to SSE2 + // confined to lower 128-bit half → forward to 128 bit XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size) { constexpr auto mlo = ::xsimd::detail::lower_half(mask); const auto lo = detail::lower_half(src); - store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); + store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); } - // confined to upper 128-bit half → forward to SSE2 + // confined to upper 128-bit half → forward to 128 bit else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size) { constexpr auto mhi = ::xsimd::detail::upper_half(mask); const auto hi = detail::upper_half(src); - store_masked(mem + half_size, hi, mhi, Mode {}, sse4_2 {}); + store_masked(mem + half_size, hi, mhi, Mode {}, sse4_2 {}); } else { @@ -1515,6 +1589,23 @@ namespace xsimd return _mm256_storeu_pd(mem, self); } + // store_stream + template + XSIMD_INLINE void store_stream(float* mem, batch const& self, requires_arch) noexcept + { + _mm256_stream_ps(mem, self); + } + template + XSIMD_INLINE void store_stream(double* mem, batch const& self, requires_arch) noexcept + { + _mm256_stream_pd(mem, self); + } + template ::value, void>> + XSIMD_INLINE void store_stream(T* mem, batch const& self, requires_arch) noexcept + { + _mm256_stream_si256((__m256i*)mem, self); + } + // sub template ::value>> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept @@ -1998,46 +2089,6 @@ namespace xsimd return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1); } - // first - template - XSIMD_INLINE float first(batch const& self, requires_arch) noexcept - { - return _mm256_cvtss_f32(self); - } - - template - XSIMD_INLINE double first(batch const& self, requires_arch) noexcept - { - return _mm256_cvtsd_f64(self); - } - - template ::value>> - XSIMD_INLINE T first(batch const& self, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return static_cast(_mm256_cvtsi256_si32(self) & 0xFF); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return static_cast(_mm256_cvtsi256_si32(self) & 0xFFFF); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return static_cast(_mm256_cvtsi256_si32(self)); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - batch low = _mm256_castsi256_si128(self); - return first(low, sse4_2 {}); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } - } - // widen template XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp index bf6d9e7de..e2c223cc7 100644 --- a/include/xsimd/arch/xsimd_avx2.hpp +++ b/include/xsimd/arch/xsimd_avx2.hpp @@ -12,13 +12,13 @@ #ifndef XSIMD_AVX2_HPP #define XSIMD_AVX2_HPP -#include -#include - #include "../types/xsimd_avx2_register.hpp" #include "../types/xsimd_batch_constant.hpp" +#include "./utils/shifts.hpp" +#include #include +#include namespace xsimd { @@ -229,6 +229,23 @@ namespace xsimd store_masked(reinterpret_cast(mem), s64, batch_bool_constant {}, Mode {}, avx2 {}); } + // load_stream + template ::value, void>> + XSIMD_INLINE batch load_stream(T const* mem, convert, requires_arch) noexcept + { + return _mm256_stream_load_si256((__m256i const*)mem); + } + template + XSIMD_INLINE batch load_stream(float const* mem, convert, requires_arch) noexcept + { + return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i const*)mem)); + } + template + XSIMD_INLINE batch load_stream(double const* mem, convert, requires_arch) noexcept + { + return _mm256_castsi256_pd(_mm256_stream_load_si256((__m256i const*)mem)); + } + // bitwise_and template ::value>> XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept @@ -332,6 +349,29 @@ namespace xsimd } } + // bitwise_lshift multiple (constant) specific implementations. + // Missing implementations are dispatched to the `batch` overload in xsimd_api. + // The 1 byte constant implementation calls the 2 bytes constant version, the 2 bytes + // constant version calls into the 4 bytes version which resolves to the dynamic one above. + template ::value && (sizeof(T) <= 2), int> = 0> + XSIMD_INLINE batch bitwise_lshift( + batch const& self, batch_constant shifts, requires_arch req) noexcept + { + using uint_t = std::make_unsigned_t; + + // AVX2 only supports 16-bit shifts with a uniform bitshift value, + // otherwise emulate using 32-bit shifts. + XSIMD_IF_CONSTEXPR(utils::all_equals(shifts)) + { + return bitwise_lshift(self, req); + } + return bitwise_cast( + utils::bitwise_lshift_as_twice_larger( + bitwise_cast(self), + batch_constant(Vs)...> {})); + } + // bitwise_or template ::value>> XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept @@ -528,6 +568,7 @@ namespace xsimd 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52 __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 + detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)"); return _mm256_add_pd(f, _mm256_castsi256_pd(xL)); } @@ -543,6 +584,7 @@ namespace xsimd 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52 __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52 + detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)"); return _mm256_add_pd(f, _mm256_castsi256_pd(xL)); } } @@ -869,12 +911,95 @@ namespace xsimd { return _mm256_mullo_epi32(self, other); } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_add_epi64( + _mm256_mul_epu32(self, other), + _mm256_slli_epi64( + _mm256_add_epi64( + _mm256_mul_epu32(other, _mm256_shuffle_epi32(self, _MM_SHUFFLE(2, 3, 0, 1))), + _mm256_mul_epu32(self, _mm256_shuffle_epi32(other, _MM_SHUFFLE(2, 3, 0, 1)))), + 32)); + } else { return mul(self, other, avx {}); } } + // mul_hi + template + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept + { + // Sign-extend bytes to 16-bit (unpack-with-self followed by srai 8 + // duplicates the byte then arithmetic-shifts the sign in), do the + // 16x16->16 multiply, then take the high byte of each product. + // unpacklo/unpackhi and packs are all per-128-bit-lane, so the + // round trip preserves byte ordering and no vpermq is needed. + __m256i a_lo = _mm256_srai_epi16(_mm256_unpacklo_epi8(self, self), 8); + __m256i a_hi = _mm256_srai_epi16(_mm256_unpackhi_epi8(self, self), 8); + __m256i b_lo = _mm256_srai_epi16(_mm256_unpacklo_epi8(other, other), 8); + __m256i b_hi = _mm256_srai_epi16(_mm256_unpackhi_epi8(other, other), 8); + __m256i p_lo = _mm256_srai_epi16(_mm256_mullo_epi16(a_lo, b_lo), 8); + __m256i p_hi = _mm256_srai_epi16(_mm256_mullo_epi16(a_hi, b_hi), 8); + // results already lie in [-128, 127], so packs is exact (no saturation kicks in). + return _mm256_packs_epi16(p_lo, p_hi); + } + template + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept + { + __m256i zero = _mm256_setzero_si256(); + __m256i a_lo = _mm256_unpacklo_epi8(self, zero); + __m256i a_hi = _mm256_unpackhi_epi8(self, zero); + __m256i b_lo = _mm256_unpacklo_epi8(other, zero); + __m256i b_hi = _mm256_unpackhi_epi8(other, zero); + __m256i p_lo = _mm256_srli_epi16(_mm256_mullo_epi16(a_lo, b_lo), 8); + __m256i p_hi = _mm256_srli_epi16(_mm256_mullo_epi16(a_hi, b_hi), 8); + return _mm256_packus_epi16(p_lo, p_hi); + } + template + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_mulhi_epi16(self, other); + } + template + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_mulhi_epu16(self, other); + } + template + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept + { + __m256i even = _mm256_mul_epi32(self, other); + __m256i odd = _mm256_mul_epi32(_mm256_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), + _mm256_shuffle_epi32(other, _MM_SHUFFLE(3, 3, 1, 1))); + __m256i even_hi = _mm256_srli_epi64(even, 32); + return _mm256_blend_epi16(even_hi, odd, 0xCC); + } + template + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept + { + __m256i even = _mm256_mul_epu32(self, other); + __m256i odd = _mm256_mul_epu32(_mm256_srli_epi64(self, 32), _mm256_srli_epi64(other, 32)); + __m256i even_hi = _mm256_srli_epi64(even, 32); + return _mm256_blend_epi16(even_hi, odd, 0xCC); + } + + template + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::mulhi_u64_core(self, other, + [](batch a, batch b) + { return batch(_mm256_mul_epu32(a, b)); }); + } + template + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::mulhi_i64_core(self, other, + [](batch a, batch b) + { return batch(_mm256_mul_epu32(a, b)); }); + } + // reduce_add template ::value>> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept @@ -1225,11 +1350,9 @@ namespace xsimd __m256i r0 = _mm256_shuffle_epi8(self, half_mask); __m256i r1 = _mm256_shuffle_epi8(swapped, half_mask); - // select lane by the mask index divided by 16 - constexpr auto lane = batch_constant< - uint8_t, A, - 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, - 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16> {}; + // select lane by the mask index divided by 16, first lane is 0, second is 16. + constexpr auto lane_size = make_batch_constant(); + constexpr auto lane = (make_iota_batch_constant() / lane_size) * lane_size; batch_bool blend_mask = (mask & 0b10000u) != lane; return _mm256_blendv_epi8(r0, r1, blend_mask); } @@ -1259,66 +1382,32 @@ namespace xsimd namespace detail { - template - constexpr T swizzle_val_none() + template + struct swizzle_mask { - // Most significant bit of the byte must be 1 - return 0x80; - } - - template - constexpr bool swizzle_val_is_cross_lane(T val, T idx, T size) - { - return (idx < (size / 2)) != (val < (size / 2)); - } + static constexpr auto values = std::array { Vals... }; - template - constexpr bool swizzle_val_is_defined(T val, T size) - { - return (0 <= val) && (val < size); - } - - template - constexpr T swizzle_self_val(T val, T idx, T size) - { - return (swizzle_val_is_defined(val, size) && !swizzle_val_is_cross_lane(val, idx, size)) - ? val % (size / 2) - : swizzle_val_none(); - } - - template - constexpr batch_constant(sizeof...(Vals)))...> - swizzle_make_self_batch_impl(std::index_sequence) - { - return {}; - } - - template - constexpr auto swizzle_make_self_batch() - { - return swizzle_make_self_batch_impl(std::make_index_sequence()); - } - - template - constexpr T swizzle_cross_val(T val, T idx, T size) - { - return (swizzle_val_is_defined(val, size) && swizzle_val_is_cross_lane(val, idx, size)) - ? val % (size / 2) - : swizzle_val_none(); - } + static constexpr T get(std::size_t idx_, std::size_t size_) noexcept + { + const T size = static_cast(size_); + const T idx = static_cast(idx_); + const T val = values[idx_]; - template - constexpr batch_constant(sizeof...(Vals)))...> - swizzle_make_cross_batch_impl(std::index_sequence) - { - return {}; - } + // Check if value in bounds + if ((T(0) <= val) && (val < size)) + { + // Whether we need to access the value from the other lane + const bool val_is_cross_lane = (idx < (size / 2)) != (val < (size / 2)); + if (val_is_cross_lane == cross_batch) + { + return val % (size / 2); + } + } - template - constexpr auto swizzle_make_cross_batch() - { - return swizzle_make_cross_batch_impl(std::make_index_sequence()); - } + // Out of bounds with most significant bit set to 1 will set the swizzle target to 0 + return ~T {}; + } + }; } // swizzle (constant mask) @@ -1354,8 +1443,8 @@ namespace xsimd // We can outsmart the dynamic version by creating a compile-time mask that leaves zeros // where it does not need to select data, resulting in a simple OR merge of the two batches. - constexpr auto self_mask = detail::swizzle_make_self_batch(); - constexpr auto cross_mask = detail::swizzle_make_cross_batch(); + constexpr auto self_mask = make_batch_constant, A>(); + constexpr auto cross_mask = make_batch_constant, A>(); // permute bytes within each lane (AVX2 only) __m256i r0 = _mm256_shuffle_epi8(self, self_mask.as_batch()); diff --git a/include/xsimd/arch/xsimd_avx2_128.hpp b/include/xsimd/arch/xsimd_avx2_128.hpp new file mode 100644 index 000000000..7a590c74f --- /dev/null +++ b/include/xsimd/arch/xsimd_avx2_128.hpp @@ -0,0 +1,170 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * Copyright (c) Marco Barbone * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX2_128_HPP +#define XSIMD_AVX2_128_HPP + +#include "../types/xsimd_avx2_register.hpp" +#include "../types/xsimd_batch_constant.hpp" + +#include + +namespace xsimd +{ + namespace kernel + { + using namespace types; + + // select + template ::value>> + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + constexpr int mask = batch_bool_constant::mask(); + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_blend_epi32(false_br, true_br, mask); + } + else + { + return select(batch_bool_constant(), true_br, false_br, avx_128 {}); + } + } + + // bitwise_lshift + template ::value>> + XSIMD_INLINE batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_sllv_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_sllv_epi64(self, other); + } + else + { + return bitwise_lshift(self, other, avx {}); + } + } + + // bitwise_rshift + template ::value>> + XSIMD_INLINE batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_srav_epi32(self, other); + } + else + { + return bitwise_rshift(self, other, avx_128 {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_srlv_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_srlv_epi64(self, other); + } + else + { + return bitwise_rshift(self, other, avx_128 {}); + } + } + } + + // load_masked + template + XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_epi32(mem, mask.as_batch()); + } + template + XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_epi32((int32_t*)mem, mask.as_batch()); + } + template + XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_epi64(mem, mask.as_batch()); + } + template + XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_epi64((int64_t*)mem, mask.as_batch()); + } + + // store_masked + template + XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + return _mm_maskstore_epi32(mem, mask.as_batch(), src); + } + template + XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + return _mm_maskstore_epi32((int32_t*)mem, mask.as_batch(), src); + } + template + XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + return _mm_maskstore_epi64(mem, mask.as_batch(), src); + } + template + XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + return _mm_maskstore_epi64((int64_t*)mem, mask.as_batch(), src); + } + + // gather + template = 0, detail::enable_sized_integral_t = 0> + XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, + kernel::requires_arch) noexcept + { + return _mm_i32gather_epi32(reinterpret_cast(src), index, sizeof(T)); + } + + template = 0, detail::enable_sized_integral_t = 0> + XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, + kernel::requires_arch) noexcept + { + return _mm_i64gather_epi64(reinterpret_cast(src), index, sizeof(T)); + } + + template = 0> + XSIMD_INLINE batch gather(batch const&, float const* src, + batch const& index, + kernel::requires_arch) noexcept + { + return _mm_i32gather_ps(src, index, sizeof(float)); + } + + template = 0> + XSIMD_INLINE batch gather(batch const&, double const* src, + batch const& index, + requires_arch) noexcept + { + return _mm_i64gather_pd(src, index, sizeof(double)); + } + } +} + +#endif diff --git a/include/xsimd/arch/xsimd_avx512bw.hpp b/include/xsimd/arch/xsimd_avx512bw.hpp index 28e2e98d6..57894a831 100644 --- a/include/xsimd/arch/xsimd_avx512bw.hpp +++ b/include/xsimd/arch/xsimd_avx512bw.hpp @@ -12,11 +12,11 @@ #ifndef XSIMD_AVX512BW_HPP #define XSIMD_AVX512BW_HPP +#include "../types/xsimd_avx512bw_register.hpp" + #include #include -#include "../types/xsimd_avx512bw_register.hpp" - namespace xsimd { @@ -470,6 +470,43 @@ namespace xsimd } } + // mul_hi + template + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept + { + // Per-128-bit-lane unpack/pack pair preserves byte ordering across + // the four 128-bit lanes of a ZMM, so no inter-lane permute needed. + __m512i a_lo = _mm512_srai_epi16(_mm512_unpacklo_epi8(self, self), 8); + __m512i a_hi = _mm512_srai_epi16(_mm512_unpackhi_epi8(self, self), 8); + __m512i b_lo = _mm512_srai_epi16(_mm512_unpacklo_epi8(other, other), 8); + __m512i b_hi = _mm512_srai_epi16(_mm512_unpackhi_epi8(other, other), 8); + __m512i p_lo = _mm512_srai_epi16(_mm512_mullo_epi16(a_lo, b_lo), 8); + __m512i p_hi = _mm512_srai_epi16(_mm512_mullo_epi16(a_hi, b_hi), 8); + return _mm512_packs_epi16(p_lo, p_hi); + } + template + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept + { + __m512i zero = _mm512_setzero_si512(); + __m512i a_lo = _mm512_unpacklo_epi8(self, zero); + __m512i a_hi = _mm512_unpackhi_epi8(self, zero); + __m512i b_lo = _mm512_unpacklo_epi8(other, zero); + __m512i b_hi = _mm512_unpackhi_epi8(other, zero); + __m512i p_lo = _mm512_srli_epi16(_mm512_mullo_epi16(a_lo, b_lo), 8); + __m512i p_hi = _mm512_srli_epi16(_mm512_mullo_epi16(a_hi, b_hi), 8); + return _mm512_packus_epi16(p_lo, p_hi); + } + template + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_mulhi_epi16(self, other); + } + template + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm512_mulhi_epu16(self, other); + } + // neq template ::value>> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx512er.hpp b/include/xsimd/arch/xsimd_avx512er.hpp index be02f9850..ee69ef9f8 100644 --- a/include/xsimd/arch/xsimd_avx512er.hpp +++ b/include/xsimd/arch/xsimd_avx512er.hpp @@ -12,9 +12,6 @@ #ifndef XSIMD_AVX512ER_HPP #define XSIMD_AVX512ER_HPP -#include -#include - #include "../types/xsimd_avx512er_register.hpp" #endif diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp index 5ccf165f1..6a7316722 100644 --- a/include/xsimd/arch/xsimd_avx512f.hpp +++ b/include/xsimd/arch/xsimd_avx512f.hpp @@ -12,13 +12,13 @@ #ifndef XSIMD_AVX512F_HPP #define XSIMD_AVX512F_HPP +#include "../types/xsimd_avx512f_register.hpp" +#include "../types/xsimd_batch_constant.hpp" + #include #include #include -#include "../types/xsimd_avx512f_register.hpp" -#include "../types/xsimd_batch_constant.hpp" - namespace xsimd { @@ -1346,6 +1346,97 @@ namespace xsimd } } + // first (must precede get for two-phase lookup) + template + XSIMD_INLINE float first(batch const& self, requires_arch) noexcept + { + return _mm512_cvtss_f32(self); + } + + template + XSIMD_INLINE double first(batch const& self, requires_arch) noexcept + { + return _mm512_cvtsd_f64(self); + } + + template ::value>> + XSIMD_INLINE T first(batch const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return static_cast(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return static_cast(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return static_cast(_mm_cvtsi128_si32(_mm512_castsi512_si128(self))); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + batch low = _mm512_castsi512_si128(self); + return first(low, sse4_2 {}); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + // get: use valignd/valignq to rotate lane I into position 0 in a single op. + template + XSIMD_INLINE float get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(I == 0) + { + return first(self, avx512f {}); + } + const auto rotated = _mm512_alignr_epi32(_mm512_castps_si512(self), _mm512_castps_si512(self), I); + return _mm_cvtss_f32(_mm512_castps512_ps128(_mm512_castsi512_ps(rotated))); + } + + template + XSIMD_INLINE double get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(I == 0) + { + return first(self, avx512f {}); + } + const auto rotated = _mm512_alignr_epi64(_mm512_castpd_si512(self), _mm512_castpd_si512(self), I); + return _mm_cvtsd_f64(_mm512_castpd512_pd128(_mm512_castsi512_pd(rotated))); + } + + template ::value>> + XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(I == 0) + { + return first(self, avx512f {}); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + const auto rotated = _mm512_alignr_epi32(self, self, I); + return first(batch(_mm512_castsi512_si128(rotated)), sse4_2 {}); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + const auto rotated = _mm512_alignr_epi64(self, self, I); + return first(batch(_mm512_castsi512_si128(rotated)), sse4_2 {}); + } + else + { + // 8/16-bit lanes have no sub-dword rotate in AVX-512F; delegate to AVX halves. + constexpr size_t elements_per_lane = batch::size; + constexpr size_t lane = I / elements_per_lane; + constexpr size_t sub_index = I % elements_per_lane; + const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self); + return kernel::get(batch(half), ::xsimd::index {}, avx {}); + } + } + // insert template XSIMD_INLINE batch insert(batch const& self, float val, index, requires_arch) noexcept @@ -1427,15 +1518,40 @@ namespace xsimd { // Adapted from https://github.com/serge-sans-paille/fast-bitset-from-bool-array // Generate a bitset from an array of boolean. - XSIMD_INLINE unsigned char tobitset(unsigned char unpacked[8]) + template + XSIMD_INLINE unsigned char tobitset(unsigned char unpacked[N]) { - uint64_t data; - memcpy(&data, unpacked, sizeof(uint64_t)); + static_assert(N == 8 || N == 4 || N == 2, "valid pack size"); + XSIMD_IF_CONSTEXPR(N == 8) + { + uint64_t data; + memcpy(&data, unpacked, sizeof(uint64_t)); - const uint64_t magic = (0x80 + 0x4000 + 0x200000 + 0x10000000 + 0x0800000000 + 0x040000000000 + 0x02000000000000 + 0x0100000000000000); + const uint64_t magic = (0x80 + 0x4000 + 0x200000 + 0x10000000 + 0x0800000000 + 0x040000000000 + 0x02000000000000 + 0x0100000000000000); - unsigned char res = ((data * magic) >> 56) & 0xFF; - return res; + unsigned char res = ((data * magic) >> 56) & 0xFF; + return res; + } + else XSIMD_IF_CONSTEXPR(N == 4) + { + uint32_t data; + memcpy(&data, unpacked, sizeof(uint32_t)); + + const uint32_t magic = (0x80 + 0x4000 + 0x200000 + 0x10000000); + + unsigned char res = ((data * magic) >> 24) & 0xFF; + return res; + } + else XSIMD_IF_CONSTEXPR(N == 2) + { + uint16_t data; + memcpy(&data, unpacked, sizeof(uint16_t)); + + const uint16_t magic = (0x80 + 0x4000); + + unsigned char res = ((data * magic) >> 8) & 0xFF; + return res; + } } } @@ -1450,7 +1566,7 @@ namespace xsimd register_type mask = 0; for (std::size_t i = 0; i < iter; ++i) { - unsigned char block = detail::tobitset((unsigned char*)mem + i * 8); + unsigned char block = detail::tobitset<8>((unsigned char*)mem + i * 8); mask |= (register_type(block) << (i * 8)); } return mask; @@ -1513,6 +1629,23 @@ namespace xsimd return _mm512_loadu_pd(mem); } + // load_stream + template ::value, void>> + XSIMD_INLINE batch load_stream(T const* mem, convert, requires_arch) noexcept + { + return _mm512_stream_load_si512((__m512i*)mem); + } + template + XSIMD_INLINE batch load_stream(float const* mem, convert, requires_arch) noexcept + { + return _mm512_castsi512_ps(_mm512_stream_load_si512((__m512i*)mem)); + } + template + XSIMD_INLINE batch load_stream(double const* mem, convert, requires_arch) noexcept + { + return _mm512_castsi512_pd(_mm512_stream_load_si512((__m512i*)mem)); + } + // lt template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept @@ -1664,6 +1797,41 @@ namespace xsimd } } + // mul_hi + template + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept + { + __m512i even = _mm512_mul_epi32(self, other); + __m512i odd = _mm512_mul_epi32(_mm512_shuffle_epi32(self, _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 1, 1))), + _mm512_shuffle_epi32(other, _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 1, 1)))); + __m512i even_hi = _mm512_srli_epi64(even, 32); + // merge: even_hi has hi in low-32 of each 64, odd has hi in high-32 of each 64 + return _mm512_mask_blend_epi32(static_cast<__mmask16>(0xAAAA), even_hi, odd); + } + template + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept + { + __m512i even = _mm512_mul_epu32(self, other); + __m512i odd = _mm512_mul_epu32(_mm512_srli_epi64(self, 32), _mm512_srli_epi64(other, 32)); + __m512i even_hi = _mm512_srli_epi64(even, 32); + return _mm512_mask_blend_epi32(static_cast<__mmask16>(0xAAAA), even_hi, odd); + } + + template + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::mulhi_u64_core(self, other, + [](batch a, batch b) + { return batch(_mm512_mul_epu32(a, b)); }); + } + template + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::mulhi_i64_core(self, other, + [](batch a, batch b) + { return batch(_mm512_mul_epu32(a, b)); }); + } + // nearbyint template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept @@ -2285,6 +2453,23 @@ namespace xsimd return _mm512_storeu_pd(mem, self); } + // store_stream + template ::value, void>> + XSIMD_INLINE void store_stream(T* mem, batch const& self, requires_arch) noexcept + { + _mm512_stream_si512((__m512i*)mem, self); + } + template + XSIMD_INLINE void store_stream(float* mem, batch const& self, requires_arch) noexcept + { + _mm512_stream_ps(mem, self); + } + template + XSIMD_INLINE void store_stream(double* mem, batch const& self, requires_arch) noexcept + { + _mm512_stream_pd(mem, self); + } + // sub template ::value>> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept @@ -2449,7 +2634,7 @@ namespace xsimd }; template - struct is_pair_of_contiguous_indices : std::conditional<(Idx0 % 2 == 0) && (Idx0 + 1 == Idx1), is_pair_of_contiguous_indices, std::false_type>::type + struct is_pair_of_contiguous_indices : std::conditional_t<(Idx0 % 2 == 0) && (Idx0 + 1 == Idx1), is_pair_of_contiguous_indices, std::false_type> { }; @@ -2463,30 +2648,50 @@ namespace xsimd I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>; }; + template + constexpr bool is_reduce_pattern() + { + // The actual pattern is {1, 1, 0, 1, 0, 1, ..., 0, 1} + if (sizeof...(Is) != batch::size) + return false; + uint16_t pattern[] = { Is... }; + if (pattern[0] != 1) + return false; + for (size_t i = 1; i < sizeof...(Is); i += 1) + { + if (pattern[i] != (i & 1)) + return false; + } + return true; + } } - template ::value>> - XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept - { - constexpr typename detail::fold_batch_constant::type mask32; - return _mm512_permutexvar_epi32(static_cast>(mask32), self); - } - - template - XSIMD_INLINE batch - swizzle(batch const& self, batch_constant, requires_arch) noexcept + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { - // FIXME: this sequence is very inefficient, but it's here to catch - // a pattern generated by detail::reduce from xsimd_common_math.hpp. - // The whole pattern is actually decently folded by GCC and Clang, - // so bare with it. - constexpr batch_constant mask32; - auto tmp = _mm512_permutexvar_epi32(static_cast>(mask32), self); + XSIMD_IF_CONSTEXPR(detail::is_pair_of_contiguous_indices::value) + { + constexpr typename detail::fold_batch_constant::type mask32; + return _mm512_permutexvar_epi32(static_cast>(mask32), self); + } + else XSIMD_IF_CONSTEXPR(detail::is_reduce_pattern()) + { + // FIXME: this sequence is very inefficient, but it's here to catch + // a pattern generated by detail::reduce from xsimd_common_math.hpp. + // The whole pattern is actually decently folded by GCC and Clang, + // so bare with it. + constexpr batch_constant mask32; + auto tmp = _mm512_permutexvar_epi32(static_cast>(mask32), self); - alignas(A::alignment()) uint16_t buffer[32]; - _mm512_store_si512((__m512i*)&buffer[0], tmp); - buffer[0] = buffer[1]; - return _mm512_load_si512(&buffer[0]); + alignas(A::alignment()) uint16_t buffer[32]; + _mm512_store_si512((__m512i*)&buffer[0], tmp); + buffer[0] = buffer[1]; + return _mm512_load_si512(&buffer[0]); + } + else + { + return swizzle(self, mask, common {}); + } } template @@ -2719,46 +2924,6 @@ namespace xsimd 2)); } - // first - template - XSIMD_INLINE float first(batch const& self, requires_arch) noexcept - { - return _mm512_cvtss_f32(self); - } - - template - XSIMD_INLINE double first(batch const& self, requires_arch) noexcept - { - return _mm512_cvtsd_f64(self); - } - - template ::value>> - XSIMD_INLINE T first(batch const& self, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return static_cast(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return static_cast(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return static_cast(_mm_cvtsi128_si32(_mm512_castsi512_si128(self))); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - batch low = _mm512_castsi512_si128(self); - return first(low, sse4_2 {}); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } - } - // widen template XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept diff --git a/include/xsimd/arch/xsimd_avx512ifma.hpp b/include/xsimd/arch/xsimd_avx512ifma.hpp index 206319348..13e55de8f 100644 --- a/include/xsimd/arch/xsimd_avx512ifma.hpp +++ b/include/xsimd/arch/xsimd_avx512ifma.hpp @@ -12,9 +12,6 @@ #ifndef XSIMD_AVX512IFMA_HPP #define XSIMD_AVX512IFMA_HPP -#include -#include - #include "../types/xsimd_avx512ifma_register.hpp" #endif diff --git a/include/xsimd/arch/xsimd_avx512pf.hpp b/include/xsimd/arch/xsimd_avx512pf.hpp index 6265c9171..5c21d6787 100644 --- a/include/xsimd/arch/xsimd_avx512pf.hpp +++ b/include/xsimd/arch/xsimd_avx512pf.hpp @@ -12,9 +12,6 @@ #ifndef XSIMD_AVX512PF_HPP #define XSIMD_AVX512PF_HPP -#include -#include - #include "../types/xsimd_avx512pf_register.hpp" #endif diff --git a/include/xsimd/arch/xsimd_avx512vbmi.hpp b/include/xsimd/arch/xsimd_avx512vbmi.hpp index 7c00d94ea..099adb7e0 100644 --- a/include/xsimd/arch/xsimd_avx512vbmi.hpp +++ b/include/xsimd/arch/xsimd_avx512vbmi.hpp @@ -12,11 +12,10 @@ #ifndef XSIMD_AVX512VBMI_HPP #define XSIMD_AVX512VBMI_HPP -#include -#include - #include "../types/xsimd_avx512vbmi_register.hpp" +#include + namespace xsimd { diff --git a/include/xsimd/arch/xsimd_avx512vbmi2.hpp b/include/xsimd/arch/xsimd_avx512vbmi2.hpp index 8852b2a1e..705b8beaf 100644 --- a/include/xsimd/arch/xsimd_avx512vbmi2.hpp +++ b/include/xsimd/arch/xsimd_avx512vbmi2.hpp @@ -12,11 +12,10 @@ #ifndef XSIMD_AVX512VBMI2_HPP #define XSIMD_AVX512VBMI2_HPP -#include -#include - #include "../types/xsimd_avx512vbmi2_register.hpp" +#include + namespace xsimd { diff --git a/include/xsimd/config/xsimd_inline.hpp b/include/xsimd/arch/xsimd_avx512vl.hpp similarity index 68% rename from include/xsimd/config/xsimd_inline.hpp rename to include/xsimd/arch/xsimd_avx512vl.hpp index f3becaf12..d47b0df40 100644 --- a/include/xsimd/config/xsimd_inline.hpp +++ b/include/xsimd/arch/xsimd_avx512vl.hpp @@ -9,21 +9,11 @@ * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ -#ifndef XSIMD_INLINE_HPP -#define XSIMD_INLINE_HPP +#ifndef XSIMD_AVX512VL_HPP +#define XSIMD_AVX512VL_HPP -#if defined(__VEC__) -#define XSIMD_INLINE inline -#elif defined __has_attribute -#if __has_attribute(always_inline) -#define XSIMD_INLINE inline __attribute__((always_inline)) -#else -#define XSIMD_INLINE inline -#endif -#elif defined(_MSC_VER) -#define XSIMD_INLINE inline __forceinline -#else -#define XSIMD_INLINE inline -#endif +#include "../types/xsimd_avx512vl_register.hpp" + +// no 512-bit operation with avx512-vl, it only provides 128 et 256 bits ones. #endif diff --git a/include/xsimd/arch/xsimd_avx512vl_128.hpp b/include/xsimd/arch/xsimd_avx512vl_128.hpp new file mode 100644 index 000000000..155338425 --- /dev/null +++ b/include/xsimd/arch/xsimd_avx512vl_128.hpp @@ -0,0 +1,647 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * Copyright (c) Marco Barbone * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512VL_128_HPP +#define XSIMD_AVX512VL_128_HPP + +#include "../types/xsimd_avx512vl_register.hpp" +#include "../types/xsimd_batch_constant.hpp" + +#include + +namespace xsimd +{ + namespace kernel + { + using namespace types; + + namespace detail + { + template + XSIMD_INLINE batch_bool compare_int_avx512vl_128(batch const& self, batch const& other) noexcept + { + using register_type = typename batch_bool::register_type; + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + // shifting to take sign into account + uint64_t mask_low0 = _mm_cmp_epi32_mask((batch(self.data) & batch(0x000000FF)) << 24, + (batch(other.data) & batch(0x000000FF)) << 24, + Cmp); + uint64_t mask_low1 = _mm_cmp_epi32_mask((batch(self.data) & batch(0x0000FF00)) << 16, + (batch(other.data) & batch(0x0000FF00)) << 16, + Cmp); + uint64_t mask_high0 = _mm_cmp_epi32_mask((batch(self.data) & batch(0x00FF0000)) << 8, + (batch(other.data) & batch(0x00FF0000)) << 8, + Cmp); + uint64_t mask_high1 = _mm_cmp_epi32_mask((batch(self.data) & batch(0xFF000000)), + (batch(other.data) & batch(0xFF000000)), + Cmp); + uint64_t mask = 0; + for (unsigned i = 0; i < 8; ++i) + { + mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); + mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); + mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); + mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + } + return (register_type)mask; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + // shifting to take sign into account + uint16_t mask_low = _mm_cmp_epi32_mask((batch(self.data) & batch(0x0000FFFF)) << 16, + (batch(other.data) & batch(0x0000FFFF)) << 16, + Cmp); + uint16_t mask_high = _mm_cmp_epi32_mask((batch(self.data) & batch(0xFFFF0000)), + (batch(other.data) & batch(0xFFFF0000)), + Cmp); + return static_cast(morton(mask_low, mask_high)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return (register_type)_mm_cmp_epi32_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return (register_type)_mm_cmp_epi64_mask(self, other, Cmp); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + uint64_t mask_low0 = _mm_cmp_epu32_mask((batch(self.data) & batch(0x000000FF)), (batch(other.data) & batch(0x000000FF)), Cmp); + uint64_t mask_low1 = _mm_cmp_epu32_mask((batch(self.data) & batch(0x0000FF00)), (batch(other.data) & batch(0x0000FF00)), Cmp); + uint64_t mask_high0 = _mm_cmp_epu32_mask((batch(self.data) & batch(0x00FF0000)), (batch(other.data) & batch(0x00FF0000)), Cmp); + uint64_t mask_high1 = _mm_cmp_epu32_mask((batch(self.data) & batch(0xFF000000)), (batch(other.data) & batch(0xFF000000)), Cmp); + uint64_t mask = 0; + for (unsigned i = 0; i < 8; ++i) + { + mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); + mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); + mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); + mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + } + return (register_type)mask; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + uint16_t mask_low = _mm_cmp_epu32_mask((batch(self.data) & batch(0x0000FFFF)), (batch(other.data) & batch(0x0000FFFF)), Cmp); + uint16_t mask_high = _mm_cmp_epu32_mask((batch(self.data) & batch(0xFFFF0000)), (batch(other.data) & batch(0xFFFF0000)), Cmp); + return static_cast(morton(mask_low, mask_high)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return (register_type)_mm_cmp_epu32_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return (register_type)_mm_cmp_epu64_mask(self, other, Cmp); + } + } + } + } + + // load mask + template + XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + constexpr auto size = batch_bool::size; + constexpr auto chunk_size = size >= 8 ? 8 : (size >= 4 ? 4 : 2); + constexpr auto iter = size / chunk_size; + static_assert((size % chunk_size) == 0, "incorrect size of bool batch"); + register_type mask = 0; + for (std::size_t i = 0; i < iter; ++i) + { + unsigned char block = detail::tobitset((unsigned char*)mem + i * chunk_size); + mask |= (register_type(block) << (i * chunk_size)); + } + return mask; + } + + // from bool + template + XSIMD_INLINE batch from_bool(batch_bool const& self, requires_arch) noexcept + { + return select(self, batch(1), batch(0)); + } + + // from_mask + template + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + assert(mask == (mask & ((uint64_t(1) << batch_bool::size) - 1)) && "inbound mask"); + return static_cast::register_type>(mask & ((uint64_t(1) << batch_bool::size) - 1)); + } + + // mask + template + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return self.data & ((uint64_t(1) << batch_bool::size) - 1); + } + + // batch_bool_cast + template + XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept + { + return self.data; + } + + // set + template + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + using register_type = typename batch_bool::register_type; + register_type r = 0; + unsigned shift = 0; + (void)std::initializer_list { (r |= register_type(values ? 1 : 0) << (shift++))... }; + return r; + } + + // store + template + XSIMD_INLINE void store(batch_bool const& self, bool* mem, requires_arch) noexcept + { + constexpr auto size = batch_bool::size; + for (std::size_t i = 0; i < size; ++i) + mem[i] = (self.data >> i) & 0x1; + } + + // abs + template + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + return _mm_abs_epi64(self); + } + + // load masked + template + XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + constexpr auto imm_mask = mask.mask(); + return _mm_mask_loadu_epi32(_mm_setzero_si128(), imm_mask, mem); + } + template + XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + constexpr auto imm_mask = mask.mask(); + return _mm_mask_loadu_epi32(_mm_setzero_si128(), imm_mask, mem); + } + + // store masked + template + XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm_mask_storeu_epi32(mem, mask.mask(), src); + } + template + XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm_mask_storeu_epi32(mem, mask.mask(), src); + } + + template + XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm_mask_storeu_epi64(mem, mask.mask(), src); + } + + template + XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm_mask_storeu_epi64(mem, mask.mask(), src); + } + template + XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm_mask_storeu_ps(mem, mask.mask(), src); + } + + template + XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm_mask_storeu_pd(mem, mask.mask(), src); + } + + // max + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_max_epi64(self, other); + } + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_max_epu64(self, other); + } + + // min + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_min_epi64(self, other); + } + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_min_epu64(self, other); + } + + // insert + template + XSIMD_INLINE batch insert(batch const& self, float val, index, requires_arch) noexcept + { + + int32_t tmp = bit_cast(val); + return _mm_castsi128_ps(_mm_mask_set1_epi32(_mm_castps_si128(self), __mmask8(1 << (I & 7)), tmp)); + } + + template + XSIMD_INLINE batch insert(batch const& self, double val, index, requires_arch) noexcept + { + int64_t tmp = bit_cast(val); + return _mm_castsi128_pd(_mm_mask_set1_epi64(_mm_castpd_si128(self), __mmask8(1 << (I & 3)), tmp)); + } + + template ::value>> + XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_mask_set1_epi32(self, __mmask8(1 << (I & 7)), val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_mask_set1_epi64(self, __mmask8(1 << (I & 3)), val); + } + else + { + return insert(self, val, pos, common {}); + } + } + + // isnan + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_ps_mask(self, self, _CMP_UNORD_Q); + } + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_pd_mask(self, self, _CMP_UNORD_Q); + } + + // rotl + template ::value>> + XSIMD_INLINE batch rotl(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_rolv_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_rolv_epi64(self, other); + } + else + { + return rotl(self, other, avx2_128 {}); + } + } + template ::value>> + XSIMD_INLINE batch rotl(batch const& self, int32_t other, requires_arch) noexcept + { + return rotl(self, batch(other), A {}); + } + template ::value>> + XSIMD_INLINE batch rotl(batch const& self, requires_arch) noexcept + { + constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; + static_assert(count < bits, "Count must be less than the number of bits in T"); + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_rol_epi32(self, count); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_rol_epi64(self, count); + } + else + { + return rotl(self, avx2_128 {}); + } + } + + // rotr + template ::value>> + XSIMD_INLINE batch rotr(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_unsigned::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_rorv_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_rorv_epi64(self, other); + } + } + return rotr(self, other, avx2_128 {}); + } + template ::value>> + XSIMD_INLINE batch rotr(batch const& self, int32_t other, requires_arch) noexcept + { + return rotr(self, batch(other), A {}); + } + + template ::value>> + XSIMD_INLINE batch rotr(batch const& self, requires_arch) noexcept + { + constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; + static_assert(count < bits, "Count must be less than the number of bits in T"); + XSIMD_IF_CONSTEXPR(std::is_unsigned::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_ror_epi32(self, count); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_ror_epi64(self, count); + } + } + return rotr(self, avx2_128 {}); + } + + // all + template + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + constexpr register_type bitmask = (register_type(1) << batch_bool::size) - 1; + return (self.data & bitmask) == bitmask; + } + + // any + template + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + constexpr register_type bitmask = (register_type(1) << batch_bool::size) - 1; + return (self.data & bitmask) != 0; + } + + // eq + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_ps_mask(self, other, _CMP_EQ_OQ); + } + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_pd_mask(self, other, _CMP_EQ_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_128(self, other); + } + template + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(~self.data ^ other.data); + } + + // neq + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_ps_mask(self, other, _CMP_NEQ_OQ); + } + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_pd_mask(self, other, _CMP_NEQ_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return (~(self == other)); + } + template + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data ^ other.data); + } + + // gt + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_ps_mask(self, other, _CMP_GT_OQ); + } + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_pd_mask(self, other, _CMP_GT_OQ); + } + template ::value>> + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_128(self, other); + } + + // ge + template + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_ps_mask(self, other, _CMP_GE_OQ); + } + template + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_pd_mask(self, other, _CMP_GE_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_128(self, other); + } + + // lt + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_ps_mask(self, other, _CMP_LT_OQ); + } + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_pd_mask(self, other, _CMP_LT_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_128(self, other); + } + + // le + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_ps_mask(self, other, _CMP_LE_OQ); + } + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm_cmp_pd_mask(self, other, _CMP_LE_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_128(self, other); + } + + // select + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_mask_blend_ps(cond, false_br, true_br); + } + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_mask_blend_pd(cond, false_br, true_br); + } + template ::value>> + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + batch_bool batch_cond = batch_bool::from_mask(cond.mask()); + return _mm_blendv_epi8(false_br, true_br, batch_cond); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + batch_bool batch_cond = batch_bool::from_mask(cond.mask()); + return _mm_blendv_epi8(false_br, true_br, batch_cond); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_mask_blend_epi32(cond, false_br, true_br); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_mask_blend_epi64(cond, false_br, true_br); + } + } + template + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... }, true_br, false_br, avx512vl_128 {}); + } + + // reciprocal + template + XSIMD_INLINE batch + reciprocal(batch const& self, + kernel::requires_arch) noexcept + { + return _mm_rcp14_ps(self); + } + + template + XSIMD_INLINE batch + reciprocal(batch const& self, + kernel::requires_arch) noexcept + { + return _mm_rcp14_pd(self); + } + + // bitwise_and + template + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data & other.data); + } + + // bitwise_andnot + template + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data & ~other.data); + } + + // bitwise_not + template + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(~self.data); + } + + // bitwise_or + template + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data | other.data); + } + + // bitwise_xor + template + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data ^ other.data); + } + + // sadd + template ::value>> + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + auto mask = other < 0; + auto self_pos_branch = min(std::numeric_limits::max() - other, self); + auto self_neg_branch = max(std::numeric_limits::min() - other, self); + return other + select(mask, self_neg_branch, self_pos_branch); + } + else + { + const auto diffmax = std::numeric_limits::max() - self; + const auto mindiff = min(diffmax, other); + return self + mindiff; + } + } + + } +} + +#endif diff --git a/include/xsimd/arch/xsimd_avx512vl_256.hpp b/include/xsimd/arch/xsimd_avx512vl_256.hpp new file mode 100644 index 000000000..a5ea546bc --- /dev/null +++ b/include/xsimd/arch/xsimd_avx512vl_256.hpp @@ -0,0 +1,729 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * Copyright (c) Marco Barbone * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512VL_256_HPP +#define XSIMD_AVX512VL_256_HPP + +#include "../types/xsimd_avx512vl_register.hpp" +#include "../types/xsimd_batch_constant.hpp" + +#include + +namespace xsimd +{ + namespace kernel + { + using namespace types; + + namespace detail + { + template + XSIMD_INLINE batch_bool compare_int_avx512vl_256(batch const& self, batch const& other) noexcept + { + using register_type = typename batch_bool::register_type; + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + // shifting to take sign into account + uint64_t mask_low0 = _mm256_cmp_epi32_mask((batch(self.data) & batch(0x000000FF)) << 24, + (batch(other.data) & batch(0x000000FF)) << 24, + Cmp); + uint64_t mask_low1 = _mm256_cmp_epi32_mask((batch(self.data) & batch(0x0000FF00)) << 16, + (batch(other.data) & batch(0x0000FF00)) << 16, + Cmp); + uint64_t mask_high0 = _mm256_cmp_epi32_mask((batch(self.data) & batch(0x00FF0000)) << 8, + (batch(other.data) & batch(0x00FF0000)) << 8, + Cmp); + uint64_t mask_high1 = _mm256_cmp_epi32_mask((batch(self.data) & batch(0xFF000000)), + (batch(other.data) & batch(0xFF000000)), + Cmp); + uint64_t mask = 0; + for (unsigned i = 0; i < 8; ++i) + { + mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); + mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); + mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); + mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + } + return (register_type)mask; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + // shifting to take sign into account + uint16_t mask_low = _mm256_cmp_epi32_mask((batch(self.data) & batch(0x0000FFFF)) << 16, + (batch(other.data) & batch(0x0000FFFF)) << 16, + Cmp); + uint16_t mask_high = _mm256_cmp_epi32_mask((batch(self.data) & batch(0xFFFF0000)), + (batch(other.data) & batch(0xFFFF0000)), + Cmp); + return static_cast(morton(mask_low, mask_high)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return (register_type)_mm256_cmp_epi32_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return (register_type)_mm256_cmp_epi64_mask(self, other, Cmp); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + uint64_t mask_low0 = _mm256_cmp_epu32_mask((batch(self.data) & batch(0x000000FF)), (batch(other.data) & batch(0x000000FF)), Cmp); + uint64_t mask_low1 = _mm256_cmp_epu32_mask((batch(self.data) & batch(0x0000FF00)), (batch(other.data) & batch(0x0000FF00)), Cmp); + uint64_t mask_high0 = _mm256_cmp_epu32_mask((batch(self.data) & batch(0x00FF0000)), (batch(other.data) & batch(0x00FF0000)), Cmp); + uint64_t mask_high1 = _mm256_cmp_epu32_mask((batch(self.data) & batch(0xFF000000)), (batch(other.data) & batch(0xFF000000)), Cmp); + uint64_t mask = 0; + for (unsigned i = 0; i < 8; ++i) + { + mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); + mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); + mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); + mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); + } + return (register_type)mask; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + uint16_t mask_low = _mm256_cmp_epu32_mask((batch(self.data) & batch(0x0000FFFF)), (batch(other.data) & batch(0x0000FFFF)), Cmp); + uint16_t mask_high = _mm256_cmp_epu32_mask((batch(self.data) & batch(0xFFFF0000)), (batch(other.data) & batch(0xFFFF0000)), Cmp); + return static_cast(morton(mask_low, mask_high)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return (register_type)_mm256_cmp_epu32_mask(self, other, Cmp); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return (register_type)_mm256_cmp_epu64_mask(self, other, Cmp); + } + } + } + } + + // load mask + template + XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + constexpr auto size = batch_bool::size; + constexpr auto chunk_size = size >= 8 ? 8 : 4; + constexpr auto iter = size / chunk_size; + static_assert((size % chunk_size) == 0, "incorrect size of bool batch"); + register_type mask = 0; + for (std::size_t i = 0; i < iter; ++i) + { + unsigned char block = detail::tobitset((unsigned char*)mem + i * chunk_size); + mask |= (register_type(block) << (i * chunk_size)); + } + return mask; + } + + // from bool + template + XSIMD_INLINE batch from_bool(batch_bool const& self, requires_arch) noexcept + { + return select(self, batch(1), batch(0)); + } + + // from_mask + template + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + assert(mask == (mask & ((uint64_t(1) << batch_bool::size) - 1)) && "inbound mask"); + return static_cast::register_type>(mask & ((uint64_t(1) << batch_bool::size) - 1)); + } + + // mask + template + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return self.data & ((uint64_t(1) << batch_bool::size) - 1); + } + + // batch_bool_cast + template + XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept + { + return self.data; + } + + // set + template + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + using register_type = typename batch_bool::register_type; + register_type r = 0; + unsigned shift = 0; + (void)std::initializer_list { (r |= register_type(values ? 1 : 0) << (shift++))... }; + return r; + } + + // store + template + XSIMD_INLINE void store(batch_bool const& self, bool* mem, requires_arch) noexcept + { + constexpr auto size = batch_bool::size; + for (std::size_t i = 0; i < size; ++i) + mem[i] = (self.data >> i) & 0x1; + } + + // abs + template + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + return _mm256_abs_epi64(self); + } + + // load masked + template + XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + constexpr auto imm_mask = mask.mask(); + return _mm256_mask_loadu_epi32(_mm256_setzero_si256(), imm_mask, mem); + } + template + XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + constexpr auto imm_mask = mask.mask(); + return _mm256_mask_loadu_epi32(_mm256_setzero_si256(), imm_mask, mem); + } + + // store masked + template + XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm256_mask_storeu_epi32(mem, mask.mask(), src); + } + template + XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm256_mask_storeu_epi32(mem, mask.mask(), src); + } + + template + XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm256_mask_storeu_epi64(mem, mask.mask(), src); + } + + template + XSIMD_INLINE void store_masked(int64_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm256_mask_storeu_epi64(mem, mask.mask(), src); + } + template + XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm256_mask_storeu_ps(mem, mask.mask(), src); + } + + template + XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + _mm256_mask_storeu_pd(mem, mask.mask(), src); + } + + // max + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_max_epi64(self, other); + } + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_max_epu64(self, other); + } + + // min + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_min_epi64(self, other); + } + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm256_min_epu64(self, other); + } + + // swizzle (dynamic version) + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return _mm256_permutexvar_ps(mask, self); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return _mm256_permutexvar_pd(mask, self); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return _mm256_permutexvar_epi64(mask, self); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512vl_256 {})); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return _mm256_permutexvar_epi32(mask, self); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512vl_256 {})); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + return swizzle(batch { self.data }, batch { mask.data }, avx2 {}).data; + } + template = 0> + XSIMD_INLINE batch swizzle(batch const& self, batch const& mask, requires_arch req) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, req)); + } + template = 0> + XSIMD_INLINE batch swizzle(batch const& self, batch const& mask, requires_arch req) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, req)); + } + + // swizzle + template = 0> + XSIMD_INLINE batch swizzle(batch const& self, batch_constant const& mask, requires_arch) noexcept + { + return swizzle(self, mask, fma3 {}); + } + template = 0> + XSIMD_INLINE batch swizzle(batch const& self, batch_constant const& mask, requires_arch) noexcept + { + return swizzle(self, mask, fma3 {}); + } + template = 0> + XSIMD_INLINE batch swizzle(batch const& self, batch_constant const& mask, requires_arch) noexcept + { + return swizzle(self, mask, fma3 {}); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + constexpr auto mask = detail::mod_shuffle(V0, V1, V2, V3); + return _mm256_permutex_pd(self, mask); + } + template = 0> + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + constexpr auto mask = detail::mod_shuffle(V0, V1, V2, V3); + return _mm256_permutex_epi64(self, mask); + } + + // insert + template + XSIMD_INLINE batch insert(batch const& self, float val, index, requires_arch) noexcept + { + + int32_t tmp = bit_cast(val); + return _mm256_castsi256_ps(_mm256_mask_set1_epi32(_mm256_castps_si256(self), __mmask8(1 << (I & 7)), tmp)); + } + + template + XSIMD_INLINE batch insert(batch const& self, double val, index, requires_arch) noexcept + { + int64_t tmp = bit_cast(val); + return _mm256_castsi256_pd(_mm256_mask_set1_epi64(_mm256_castpd_si256(self), __mmask8(1 << (I & 3)), tmp)); + } + + template ::value>> + XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_mask_set1_epi32(self, __mmask8(1 << (I & 7)), val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_mask_set1_epi64(self, __mmask8(1 << (I & 3)), val); + } + else + { + return insert(self, val, pos, common {}); + } + } + + // isnan + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_ps_mask(self, self, _CMP_UNORD_Q); + } + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_pd_mask(self, self, _CMP_UNORD_Q); + } + + // rotl + template ::value>> + XSIMD_INLINE batch rotl(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_rolv_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_rolv_epi64(self, other); + } + else + { + return rotl(self, other, avx2 {}); + } + } + template ::value>> + XSIMD_INLINE batch rotl(batch const& self, int32_t other, requires_arch) noexcept + { + return rotl(self, batch(other), A {}); + } + template ::value>> + XSIMD_INLINE batch rotl(batch const& self, requires_arch) noexcept + { + constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; + static_assert(count < bits, "Count must be less than the number of bits in T"); + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_rol_epi32(self, count); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_rol_epi64(self, count); + } + else + { + return rotl(self, avx2 {}); + } + } + + // rotr + template ::value>> + XSIMD_INLINE batch rotr(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_unsigned::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_rorv_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_rorv_epi64(self, other); + } + } + return rotr(self, other, avx2 {}); + } + template ::value>> + XSIMD_INLINE batch rotr(batch const& self, int32_t other, requires_arch) noexcept + { + return rotr(self, batch(other), A {}); + } + + template ::value>> + XSIMD_INLINE batch rotr(batch const& self, requires_arch) noexcept + { + constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; + static_assert(count < bits, "Count must be less than the number of bits in T"); + XSIMD_IF_CONSTEXPR(std::is_unsigned::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_ror_epi32(self, count); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_ror_epi64(self, count); + } + } + return rotr(self, avx2 {}); + } + + // all + template + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + constexpr register_type bitmask = (register_type(1) << batch_bool::size) - 1; + return (self.data & bitmask) == bitmask; + } + + // any + template + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + constexpr register_type bitmask = (register_type(1) << batch_bool::size) - 1; + return (self.data & bitmask) != 0; + } + + // eq + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_ps_mask(self, other, _CMP_EQ_OQ); + } + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_pd_mask(self, other, _CMP_EQ_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_256(self, other); + } + template + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(~self.data ^ other.data); + } + + // neq + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_ps_mask(self, other, _CMP_NEQ_OQ); + } + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_pd_mask(self, other, _CMP_NEQ_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return (~(self == other)); + } + template + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data ^ other.data); + } + + // gt + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_ps_mask(self, other, _CMP_GT_OQ); + } + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_pd_mask(self, other, _CMP_GT_OQ); + } + template ::value>> + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_256(self, other); + } + + // ge + template + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_ps_mask(self, other, _CMP_GE_OQ); + } + template + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_pd_mask(self, other, _CMP_GE_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_256(self, other); + } + + // lt + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_ps_mask(self, other, _CMP_LT_OQ); + } + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_pd_mask(self, other, _CMP_LT_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_256(self, other); + } + + // le + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_ps_mask(self, other, _CMP_LE_OQ); + } + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch_bool::register_type)_mm256_cmp_pd_mask(self, other, _CMP_LE_OQ); + } + + template ::value>> + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return detail::compare_int_avx512vl_256(self, other); + } + + // select + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm256_mask_blend_ps(cond, false_br, true_br); + } + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm256_mask_blend_pd(cond, false_br, true_br); + } + template ::value>> + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + batch_bool batch_cond = batch_bool::from_mask(cond.mask()); + return _mm256_blendv_epi8(false_br, true_br, batch_cond); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + batch_bool batch_cond = batch_bool::from_mask(cond.mask()); + return _mm256_blendv_epi8(false_br, true_br, batch_cond); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm256_mask_blend_epi32(cond, false_br, true_br); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm256_mask_blend_epi64(cond, false_br, true_br); + } + } + template + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... }, true_br, false_br, avx512vl_256 {}); + } + + // reciprocal + template + XSIMD_INLINE batch + reciprocal(batch const& self, + kernel::requires_arch) noexcept + { + return _mm256_rcp14_ps(self); + } + + template + XSIMD_INLINE batch + reciprocal(batch const& self, + kernel::requires_arch) noexcept + { + return _mm256_rcp14_pd(self); + } + + // bitwise_and + template + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data & other.data); + } + + // bitwise_andnot + template + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data & ~other.data); + } + + // bitwise_not + template + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(~self.data); + } + + // bitwise_or + template + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data | other.data); + } + + // bitwise_xor + template + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + using register_type = typename batch_bool::register_type; + return register_type(self.data ^ other.data); + } + + // sadd + template ::value>> + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + auto mask = other < 0; + auto self_pos_branch = min(std::numeric_limits::max() - other, self); + auto self_neg_branch = max(std::numeric_limits::min() - other, self); + return other + select(mask, self_neg_branch, self_pos_branch); + } + else + { + const auto diffmax = std::numeric_limits::max() - self; + const auto mindiff = min(diffmax, other); + return self + mindiff; + } + } + + } +} + +#endif diff --git a/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp b/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp index b285623d0..c95069df1 100644 --- a/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp +++ b/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp @@ -12,9 +12,6 @@ #ifndef XSIMD_AVX512VNNI_AVX512_BW_HPP #define XSIMD_AVX512VNNI_AVX512_BW_HPP -#include -#include - #include "../types/xsimd_avx512vnni_avx512bw_register.hpp" #endif diff --git a/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp b/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp index 0b4ffd2e4..552869d25 100644 --- a/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp +++ b/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp @@ -12,9 +12,6 @@ #ifndef XSIMD_AVX512VNNI_AVX512VBMI2_HPP #define XSIMD_AVX512VNNI_AVX512VBMI2_HPP -#include -#include - #include "../types/xsimd_avx512vnni_avx512vbmi2_register.hpp" #endif diff --git a/include/xsimd/arch/xsimd_avx_128.hpp b/include/xsimd/arch/xsimd_avx_128.hpp new file mode 100644 index 000000000..46fc9acb7 --- /dev/null +++ b/include/xsimd/arch/xsimd_avx_128.hpp @@ -0,0 +1,164 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * Copyright (c) Marco Barbone * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX_128_HPP +#define XSIMD_AVX_128_HPP + +#include "../types/xsimd_avx_register.hpp" +#include "../types/xsimd_batch_constant.hpp" + +#include + +namespace xsimd +{ + namespace kernel + { + using namespace types; + + // broadcast + template ::value>> + XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept + { + return _mm_broadcast_ss(&val); + } + + // eq + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmp_ps(self, other, _CMP_EQ_OQ); + } + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmp_pd(self, other, _CMP_EQ_OQ); + } + + // gt + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmp_ps(self, other, _CMP_GT_OQ); + } + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmp_pd(self, other, _CMP_GT_OQ); + } + + // ge + template + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmp_ps(self, other, _CMP_GE_OQ); + } + template + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmp_pd(self, other, _CMP_GE_OQ); + } + + // lt + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmp_ps(self, other, _CMP_LT_OQ); + } + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmp_pd(self, other, _CMP_LT_OQ); + } + + // le + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmp_ps(self, other, _CMP_LE_OQ); + } + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmp_pd(self, other, _CMP_LE_OQ); + } + + // neq + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmp_ps(self, other, _CMP_NEQ_UQ); + } + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmp_pd(self, other, _CMP_NEQ_UQ); + } + + // load_masked + template + XSIMD_INLINE batch load_masked(float const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_ps(mem, mask.as_batch()); + } + template + XSIMD_INLINE batch load_masked(double const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept + { + return _mm_maskload_pd(mem, mask.as_batch()); + } + + // store_masked + template + XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + return _mm_maskstore_ps(mem, mask.as_batch(), src); + } + + template + XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept + { + return _mm_maskstore_pd(mem, mask.as_batch(), src); + } + + // swizzle (dynamic mask) + template ::value && sizeof(T) == sizeof(ITy)>> + XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(std::is_same::value) + { + return _mm_permutevar_ps(self, mask); + } + else + { + // VPERMILPD's variable control reads bit 1 of each 64-bit selector + // (bit 0 is ignored), so a {0,1} index needs to become {0,2}. + // Negation is a cheap alternative to a left shift by 1. + return _mm_permutevar_pd(self, -mask); + } + } + + // swizzle (constant mask) + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + return _mm_permute_ps(self, detail::mod_shuffle(V0, V1, V2, V3)); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + return _mm_permute_pd(self, detail::mod_shuffle(V0, V1)); + } + + } +} + +#endif diff --git a/include/xsimd/arch/xsimd_avxvnni.hpp b/include/xsimd/arch/xsimd_avxvnni.hpp index a97ba9296..7c1fec96c 100644 --- a/include/xsimd/arch/xsimd_avxvnni.hpp +++ b/include/xsimd/arch/xsimd_avxvnni.hpp @@ -12,9 +12,6 @@ #ifndef XSIMD_AVXVNNI_HPP #define XSIMD_AVXVNNI_HPP -#include -#include - #include "../types/xsimd_avxvnni_register.hpp" #endif diff --git a/include/xsimd/arch/xsimd_common.hpp b/include/xsimd/arch/xsimd_common.hpp index 11f21bd82..1d800e349 100644 --- a/include/xsimd/arch/xsimd_common.hpp +++ b/include/xsimd/arch/xsimd_common.hpp @@ -13,6 +13,7 @@ #define XSIMD_COMMON_HPP #include "./common/xsimd_common_arithmetic.hpp" +#include "./common/xsimd_common_bit.hpp" #include "./common/xsimd_common_cast.hpp" #include "./common/xsimd_common_complex.hpp" #include "./common/xsimd_common_logical.hpp" diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp index 74bcd2351..f5a7f4ffe 100644 --- a/include/xsimd/arch/xsimd_common_fwd.hpp +++ b/include/xsimd/arch/xsimd_common_fwd.hpp @@ -15,6 +15,7 @@ #include #include +#include namespace xsimd { @@ -58,6 +59,11 @@ namespace xsimd template ::value>> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept; template ::value>> + XSIMD_INLINE batch mul_hi(batch const& self, batch const& other, requires_arch) noexcept; + template ::value>> + XSIMD_INLINE std::pair, batch> + mul_hilo(batch const& self, batch const& other, requires_arch) noexcept; + template ::value>> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept; template ::value>> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept; @@ -101,6 +107,11 @@ namespace xsimd // Forward declarations for pack-level helpers namespace detail { + template + XSIMD_INLINE void reassociation_barrier(T& x, const char*) noexcept; + template + XSIMD_INLINE void reassociation_barrier(batch& b, const char* reason) noexcept; + template XSIMD_INLINE constexpr bool is_identity() noexcept; template @@ -116,6 +127,14 @@ namespace xsimd template XSIMD_INLINE constexpr bool is_only_from_hi(batch_constant) noexcept; + template + XSIMD_INLINE batch mulhi_u64_core(batch const& x, + batch const& y, + WMul mul_epu32) noexcept; + template + XSIMD_INLINE batch mulhi_i64_core(batch const& x, + batch const& y, + WMul mul_epu32) noexcept; } } } diff --git a/include/xsimd/arch/xsimd_constants.hpp b/include/xsimd/arch/xsimd_constants.hpp index 916cdf70d..00b719fc9 100644 --- a/include/xsimd/arch/xsimd_constants.hpp +++ b/include/xsimd/arch/xsimd_constants.hpp @@ -12,10 +12,10 @@ #ifndef XSIMD_NUMERICAL_CONSTANT_HPP #define XSIMD_NUMERICAL_CONSTANT_HPP -#include - #include "../types/xsimd_utils.hpp" +#include + namespace xsimd { diff --git a/include/xsimd/arch/xsimd_emulated.hpp b/include/xsimd/arch/xsimd_emulated.hpp index 4437174a9..4426c543f 100644 --- a/include/xsimd/arch/xsimd_emulated.hpp +++ b/include/xsimd/arch/xsimd_emulated.hpp @@ -12,16 +12,14 @@ #ifndef XSIMD_EMULATED_HPP #define XSIMD_EMULATED_HPP -#include -#include -#include -#include - #include "../arch/xsimd_scalar.hpp" - #include "../types/xsimd_emulated_register.hpp" #include "../types/xsimd_utils.hpp" +#include +#include +#include + namespace xsimd { template @@ -508,7 +506,7 @@ namespace xsimd constexpr size_t size = batch::size; uint64_t res = 0; for (size_t i = 0; i < size; ++i) - res |= (self.data[i] ? 1u : 0u) << i; + res |= (uint64_t)(self.data[i] ? 1u : 0u) << i; return res; } diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp index 1772159a0..cf88f64d7 100644 --- a/include/xsimd/arch/xsimd_isa.hpp +++ b/include/xsimd/arch/xsimd_isa.hpp @@ -13,7 +13,6 @@ #define XSIMD_ISA_HPP #include "../config/xsimd_arch.hpp" - #include "./xsimd_common_fwd.hpp" #if XSIMD_WITH_EMULATED @@ -50,6 +49,7 @@ #if XSIMD_WITH_AVX #include "./xsimd_avx.hpp" +#include "./xsimd_avx_128.hpp" #endif #if XSIMD_WITH_FMA3_AVX @@ -62,6 +62,7 @@ #if XSIMD_WITH_AVX2 #include "./xsimd_avx2.hpp" +#include "./xsimd_avx2_128.hpp" #endif #if XSIMD_WITH_FMA3_AVX2 @@ -72,6 +73,12 @@ #include "./xsimd_avx512f.hpp" #endif +#if XSIMD_WITH_AVX512VL +#include "./xsimd_avx512vl.hpp" +#include "./xsimd_avx512vl_128.hpp" +#include "./xsimd_avx512vl_256.hpp" +#endif + #if XSIMD_WITH_AVX512DQ #include "./xsimd_avx512dq.hpp" #endif @@ -88,6 +95,10 @@ #include "./xsimd_avx512pf.hpp" #endif +#if XSIMD_WITH_AVX512VL +#include "./xsimd_avx512pf.hpp" +#endif + #if XSIMD_WITH_AVX512IFMA #include "./xsimd_avx512ifma.hpp" #endif @@ -136,6 +147,10 @@ #include "./xsimd_vsx.hpp" #endif +#if XSIMD_WITH_VXE +#include "./xsimd_vxe.hpp" +#endif + // Must come last to have access to all conversion specializations. #include "./xsimd_common.hpp" diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp index 55b3b8d81..8ea2756f7 100644 --- a/include/xsimd/arch/xsimd_neon.hpp +++ b/include/xsimd/arch/xsimd_neon.hpp @@ -12,284 +12,28 @@ #ifndef XSIMD_NEON_HPP #define XSIMD_NEON_HPP -#include -#include -#include -#include -#include - +#include "../types/xsimd_batch_fwd.hpp" #include "../types/xsimd_neon_register.hpp" #include "../types/xsimd_utils.hpp" +#include "../utils/xsimd_type_traits.hpp" +#include "./common/xsimd_common_bit.hpp" #include "./common/xsimd_common_cast.hpp" +#include "./xsimd_common_fwd.hpp" -// Wrap intrinsics so we can pass them as function pointers -// - OP: intrinsics name prefix, e.g., vorrq -// - RT: type traits to deduce intrinsics return types -#define WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \ - namespace wrap \ - { \ - XSIMD_INLINE RT OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \ - { \ - return ::OP##_u8(a, b); \ - } \ - XSIMD_INLINE RT OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \ - { \ - return ::OP##_u16(a, b); \ - } \ - XSIMD_INLINE RT OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \ - { \ - return ::OP##_u32(a, b); \ - } \ - } - -#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ - WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \ - namespace wrap \ - { \ - XSIMD_INLINE RT OP##_s8(int8x16_t a, int8x16_t b) noexcept \ - { \ - return ::OP##_s8(a, b); \ - } \ - XSIMD_INLINE RT OP##_s16(int16x8_t a, int16x8_t b) noexcept \ - { \ - return ::OP##_s16(a, b); \ - } \ - XSIMD_INLINE RT OP##_s32(int32x4_t a, int32x4_t b) noexcept \ - { \ - return ::OP##_s32(a, b); \ - } \ - } - -#define WRAP_BINARY_INT(OP, RT) \ - WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ - namespace wrap \ - { \ - XSIMD_INLINE RT OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \ - { \ - return ::OP##_u64(a, b); \ - } \ - XSIMD_INLINE RT OP##_s64(int64x2_t a, int64x2_t b) noexcept \ - { \ - return ::OP##_s64(a, b); \ - } \ - } - -#define WRAP_BINARY_FLOAT(OP, RT) \ - namespace wrap \ - { \ - XSIMD_INLINE RT OP##_f32(float32x4_t a, float32x4_t b) noexcept \ - { \ - return ::OP##_f32(a, b); \ - } \ - } - -#define WRAP_UNARY_INT_EXCLUDING_64(OP) \ - namespace wrap \ - { \ - XSIMD_INLINE uint8x16_t OP##_u8(uint8x16_t a) noexcept \ - { \ - return ::OP##_u8(a); \ - } \ - XSIMD_INLINE int8x16_t OP##_s8(int8x16_t a) noexcept \ - { \ - return ::OP##_s8(a); \ - } \ - XSIMD_INLINE uint16x8_t OP##_u16(uint16x8_t a) noexcept \ - { \ - return ::OP##_u16(a); \ - } \ - XSIMD_INLINE int16x8_t OP##_s16(int16x8_t a) noexcept \ - { \ - return ::OP##_s16(a); \ - } \ - XSIMD_INLINE uint32x4_t OP##_u32(uint32x4_t a) noexcept \ - { \ - return ::OP##_u32(a); \ - } \ - XSIMD_INLINE int32x4_t OP##_s32(int32x4_t a) noexcept \ - { \ - return ::OP##_s32(a); \ - } \ - } - -#define WRAP_UNARY_INT(OP) \ - WRAP_UNARY_INT_EXCLUDING_64(OP) \ - namespace wrap \ - { \ - XSIMD_INLINE uint64x2_t OP##_u64(uint64x2_t a) noexcept \ - { \ - return ::OP##_u64(a); \ - } \ - XSIMD_INLINE int64x2_t OP##_s64(int64x2_t a) noexcept \ - { \ - return ::OP##_s64(a); \ - } \ - } - -#define WRAP_UNARY_FLOAT(OP) \ - namespace wrap \ - { \ - XSIMD_INLINE float32x4_t OP##_f32(float32x4_t a) noexcept \ - { \ - return ::OP##_f32(a); \ - } \ - } - -// Dummy identity caster to ease coding -XSIMD_INLINE uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; } -XSIMD_INLINE int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; } -XSIMD_INLINE uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; } -XSIMD_INLINE int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; } -XSIMD_INLINE uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; } -XSIMD_INLINE int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; } -XSIMD_INLINE uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; } -XSIMD_INLINE int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; } -XSIMD_INLINE float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; } +#include +#include +#include +#include +#include namespace xsimd { - template - struct batch_bool_constant; - namespace kernel { using namespace types; namespace detail { - template