diff --git a/.clang-format b/.clang-format
index d0da08381..08011e6bc 100644
--- a/.clang-format
+++ b/.clang-format
@@ -4,5 +4,24 @@ AlignAfterOpenBracket: Align
 AlignConsecutiveDeclarations: 'false'
 BreakBeforeBraces: Allman
 NamespaceIndentation: All
+IncludeBlocks: Regroup
+# The first regex to match classifies the header into a group, so the most
+# specific pattern is listed first. Groups are emitted by increasing priority.
+IncludeCategories:
+  # xsimd absolute headers (e.g. in tests); must precede the generic <...> rules
+  - Regex: '["<]xsimd/.+[">]'
+    Priority: 2
+  # Standard headers
+  - Regex: <[^\.]+>
+    Priority: 4
+  # Third party dependencies (prefer angle bracket over quotes)
+  - Regex: <.+\..+>
+    Priority: 3
+  # Relative header from project
+  - Regex: '"\.+.*"'
+    Priority: 1
+  # Any other quoted header (anything else, avoid)
+  - Regex: '".*"'
+    Priority: 0
 
 ...
diff --git a/.clang-tidy b/.clang-tidy
new file mode 100644
index 000000000..2d173401f
--- /dev/null
+++ b/.clang-tidy
@@ -0,0 +1,5 @@
+---
+Checks:          '-*,modernize-type-traits'
+# WarningsAsErrors is a glob list of check names, not a boolean.
+WarningsAsErrors: '*'
+HeaderFilterRegex: '.*'
diff --git a/.github/toolchains/clang-powerpc64-linux-gnu.cmake b/.github/toolchains/clang-powerpc64-linux-gnu.cmake
new file mode 100644
index 000000000..771702fdf
--- /dev/null
+++ b/.github/toolchains/clang-powerpc64-linux-gnu.cmake
@@ -0,0 +1,5 @@
+set(CMAKE_SYSTEM_PROCESSOR powerpc64)
+set(triple powerpc64-linux-gnu)
+
+include(${CMAKE_CURRENT_LIST_DIR}/clang.cmake)
+
diff --git a/.github/toolchains/clang-powerpc64le-linux-gnu.cmake b/.github/toolchains/clang-powerpc64le-linux-gnu.cmake
new file mode 100644
index 000000000..b4fa02506
--- /dev/null
+++ b/.github/toolchains/clang-powerpc64le-linux-gnu.cmake
@@ -0,0 +1,5 @@
+set(CMAKE_SYSTEM_PROCESSOR powerpc64le)
+set(triple powerpc64le-linux-gnu)
+
+include(${CMAKE_CURRENT_LIST_DIR}/clang.cmake)
+
diff --git a/.github/toolchains/gcc-s390x-linux-gnu.cmake b/.github/toolchains/gcc-s390x-linux-gnu.cmake
new file mode 100644
index 000000000..05fba0b53
--- /dev/null
+++ b/.github/toolchains/gcc-s390x-linux-gnu.cmake
@@ -0,0 +1,4 @@
+set(CMAKE_SYSTEM_PROCESSOR s390x)
+set(triple s390x-linux-gnu)
+
+include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake)
diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml
index 3efa4de96..c1c5ca5a0 100644
--- a/.github/workflows/android.yml
+++ b/.github/workflows/android.yml
@@ -18,19 +18,17 @@ jobs:
           - 18
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v6
 
       - name: Build script
         env:
           TARGET: ${{ matrix.target }}
           API: ${{ matrix.api }}
         run: |
-          mkdir _build
           NDK="$($ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager --list_installed | sed -E 's/( +[|] +)/|/g;s/ +$//' | grep '^  ndk' | cut -d '|' -f 4 | sort | head -n1)"
-          cd _build && \
-            cmake .. -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/$NDK/build/cmake/android.toolchain.cmake \
-                     -DANDROID_ABI=$ABI \
-                     -DANDROID_PLATFORM=android-$API \
-                     -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release
-
-          cmake --build . --verbose
+          cmake -B _build \
+                -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/$NDK/build/cmake/android.toolchain.cmake \
+                -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON \
+                -DANDROID_ABI=$TARGET -DANDROID_PLATFORM=android-$API \
+                -DCMAKE_BUILD_TYPE=Release
+          cmake --build _build --verbose
diff --git a/.github/workflows/arch-consistency-check.yml b/.github/workflows/arch-consistency-check.yml
index dc57879dc..83839e136 100644
--- a/.github/workflows/arch-consistency-check.yml
+++ b/.github/workflows/arch-consistency-check.yml
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout xsimd
-        uses: actions/checkout@v3
+        uses: actions/checkout@v6
       - name: Install dependencies
         run: sudo apt install g++
       - name: Check architecture consistency
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 679c93c5a..cccb01a53 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -7,14 +7,9 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v3
-    - name: Install dependencies
-      run: |
-        sudo apt install g++
+    - uses: actions/checkout@v6
     - name: Setup
-      run: |
-        mkdir _build
-        cd _build && cmake .. -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release
+      run: cmake -B _build -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release
     - name: Build
       run: cmake --build _build
     - name: Testing sequential
diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml
index 1df779b8e..0ff2cd0a4 100644
--- a/.github/workflows/cmake.yml
+++ b/.github/workflows/cmake.yml
@@ -11,19 +11,16 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - name: Checkout xsimd
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Configure build
-      run: |
-        mkdir _build && cd _build
-        cmake  .. -DCMAKE_BUILD_TYPE=Release \
-                  -DCMAKE_INSTALL_PREFIX=_install
+      run: cmake -B _build -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=_install
     - name: Build
       run: cmake --build _build --target install
     - name: Check install
       run: |
         mkdir _install_build && cd _install_build
         cp ${{ github.workspace }}/.github/cmake-test/* .
-        ls $PWD/../_build/_install/share/cmake/xsimd
-        cmake . -DCMAKE_PREFIX_PATH=$PWD/../_build/_install/share/cmake/xsimd
+        ls $PWD/../_install/share/cmake/xsimd
+        cmake . -DCMAKE_PREFIX_PATH=$PWD/../_install/share/cmake/xsimd
         cmake --build .
 
diff --git a/.github/workflows/cross-arm.yml b/.github/workflows/cross-arm.yml
index 071e85f25..79a83492a 100644
--- a/.github/workflows/cross-arm.yml
+++ b/.github/workflows/cross-arm.yml
@@ -15,6 +15,7 @@ jobs:
           - { platform: 'aarch64', arch: 'armv8-a', dir: 'aarch64-linux-gnu', flags: '', full: 'ON' }
         sys:
           - { compiler: 'gcc',   version: '10' }
+          - { compiler: 'gcc',   version: '14' }
     steps:
     - name: Setup compiler
       if: ${{ matrix.sys.compiler == 'gcc' }}
@@ -32,14 +33,18 @@ jobs:
       run: |
         sudo apt-get install ninja-build
     - name: Checkout xsimd
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Setup
       run: |
-        mkdir _build
-        cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} -DCMAKE_BUILD_TYPE=Release -DTARGET_ARCH=generic -DCMAKE_C_FLAGS="-march=${{ matrix.target.arch }} ${{ matrix.target.flags }}" -DCMAKE_CXX_FLAGS="-march=${{ matrix.target.arch }} ${{ matrix.target.flags }}" -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake
+        cmake -B _build \
+              -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=${{ matrix.target.full }} \
+              -DBUILD_EXAMPLES=${{ matrix.target.full }} -DCMAKE_BUILD_TYPE=Release \
+              -DTARGET_ARCH=generic \
+              -DCMAKE_C_FLAGS="-march=${{ matrix.target.arch }} ${{ matrix.target.flags }}" \
+              -DCMAKE_CXX_FLAGS="-march=${{ matrix.target.arch }} ${{ matrix.target.flags }}" \
+              -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake
     - name: Build
       run: cmake --build _build
     - name: Testing xsimd
-      run: |
-        qemu-${{ matrix.target.platform }} -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd
+      run: qemu-${{ matrix.target.platform }} -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd
       working-directory: ${{ github.workspace }}/_build
diff --git a/.github/workflows/cross-ppc.yml b/.github/workflows/cross-ppc.yml
index 92ffae333..6f07a763c 100644
--- a/.github/workflows/cross-ppc.yml
+++ b/.github/workflows/cross-ppc.yml
@@ -3,10 +3,11 @@ on: [push, pull_request]
 concurrency:
   group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
   cancel-in-progress: true
+
 jobs:
   build:
     runs-on: ubuntu-latest
-    name: '${{ matrix.target.arch }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}'
+    name: '${{ matrix.target.platform }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}'
     strategy:
       matrix:
         target:
@@ -14,16 +15,27 @@ jobs:
           - { platform: 'ppc64',     dir: 'powerpc64-linux-gnu',   flags: '-maltivec -mvsx -mcpu=power10', full: 'OFF' }
         sys:
           - { compiler: 'gcc',   version: '12' }
+          - { compiler: 'clang', version: '20', gcc_runtime: '12' }
     steps:
-    - name: Setup compiler
+    - name: Setup GCC
       if: ${{ matrix.sys.compiler == 'gcc' }}
       run: |
         sudo apt-get update || exit 1
-        sudo apt-get --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.version }}-${{ matrix.target.dir }} g++-${{ matrix.sys.version }}-multilib || exit 1
+        sudo apt-get -y --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.version }}-${{ matrix.target.dir }} g++-${{ matrix.sys.version }}-multilib cmake || exit 1
         sudo update-alternatives --remove-all ${{ matrix.target.dir }}-gcc || true
         sudo update-alternatives --remove-all ${{ matrix.target.dir }}-g++ || true
         sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-gcc ${{ matrix.target.dir }}-gcc /usr/bin/${{ matrix.target.dir }}-gcc-${{ matrix.sys.version }} 20
         sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-g++ ${{ matrix.target.dir }}-g++ /usr/bin/${{ matrix.target.dir }}-g++-${{ matrix.sys.version }} 20
+    - name: Setup LLVM
+      if: ${{ matrix.sys.compiler == 'clang' }}
+      run: |
+        sudo apt-get update || exit 1
+        sudo apt-get -y --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.gcc_runtime }}-${{ matrix.target.dir }} g++-${{ matrix.sys.gcc_runtime }}-multilib cmake || exit 1
+        sudo apt-get -y --no-install-suggests --no-install-recommends install clang-${{ matrix.sys.version }} || exit 1
+        sudo update-alternatives --remove-all clang || true
+        sudo update-alternatives --remove-all clang++ || true
+        sudo update-alternatives --install /usr/bin/clang clang /usr/bin/clang-${{ matrix.sys.version }} 20
+        sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${{ matrix.sys.version }} 20
     - name: Setup QEMU
       run: |
         sudo apt-get --no-install-suggests --no-install-recommends install qemu-user
@@ -31,14 +43,27 @@ jobs:
       run: |
         sudo apt-get install ninja-build
     - name: Checkout xsimd
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Setup
       run: |
-        mkdir _build
-        cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="${{ matrix.target.flags }}" -DCMAKE_CXX_FLAGS="${{ matrix.target.flags }}" -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake
+        cmake -B build/ \
+              -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON \
+              -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DCMAKE_C_FLAGS="${{ matrix.target.flags }}" \
+              -DCMAKE_CXX_FLAGS="${{ matrix.target.flags }}" \
+              -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake
     - name: Build
-      run: cmake --build _build --verbose -j1
+      run: cmake --build build/ --verbose -j1
+    - name: Set CPU feature test expectations
+      run: /bin/true
     - name: Testing xsimd
       run: |
-        qemu-${{ matrix.target.platform }} -cpu power10 -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd
-      working-directory: ${{ github.workspace }}/_build
+        # Set CPU feature test expectations, 0 is explicit absence of the feature
+        export XSIMD_TEST_CPU_ASSUME_SSE4_2="0"
+        export XSIMD_TEST_CPU_ASSUME_NEON64="0"
+        export XSIMD_TEST_CPU_ASSUME_RVV="0"
+        export XSIMD_TEST_CPU_ASSUME_VXE="0"
+        export XSIMD_TEST_CPU_ASSUME_VSX="1"
+
+        qemu-${{ matrix.target.platform }} -cpu power10 -L /usr/${{ matrix.target.dir}}/ ./build/test/test_xsimd
diff --git a/.github/workflows/cross-rvv-arch.yml b/.github/workflows/cross-rvv-arch.yml
new file mode 100644
index 000000000..ae482cc08
--- /dev/null
+++ b/.github/workflows/cross-rvv-arch.yml
@@ -0,0 +1,85 @@
+# RISC-V RVV cross-compilation build using qemu 11 + gcc 15 (Arch Linux).
+#
+# Why this workflow exists alongside cross-rvv.yml:
+#
+# QEMU's RISC-V Vector emulation is dramatically slower than scalar in
+# qemu < 11 (see QEMU issue #2137 for documented 100x+ slowdowns of
+# auto-vectorised RVV loops under TCG). At vlen=128 the slowdown is large
+# enough that gcc's RVV codegen for our test suite causes the qemu-user
+# emulator to make no observable progress within the 6h GHA timeout —
+# i.e. the apt-shipped qemu-user-static (8.2.x in noble, 9.x in plucky)
+# can't run xsimd's full test_xsimd at vlen=128.
+#
+# Empirically:
+#   qemu 8.2.2 (Ubuntu 24.04 apt)    : test_xsimd at vlen=128 times out
+#   qemu 9.2.1 (Ubuntu 25.04 plucky) : ditto
+#   qemu 10.0.8 (Debian trixie)      : ditto
+#   qemu 11.0.0 (Arch) + gcc 15.1    : 367 cases / 5664 asserts in <10 min
+#
+# So vlen=128 RVV coverage lives in this workflow, which runs the build
+# and test inside an `archlinux:latest` container (qemu 11 + gcc 15.1).
+# The matching ubuntu-runner workflow `cross-rvv.yml` keeps multi-compiler
+# matrix coverage (gcc-14, clang-17/18) for vlens >= 256, where the apt
+# qemu is fast enough.
+#
+# References:
+#   QEMU 11.0.0 release notes:  https://www.qemu.org/2026/04/22/qemu-11-0-0/
+#   QEMU RVV slowdowns issue:   https://gitlab.com/qemu-project/qemu/-/issues/2137
+#   Ubuntu RVV vstart bug:      https://bugs.launchpad.net/ubuntu/+source/qemu/+bug/2095169
+name: RISC-V RVV cross-compilation build (qemu 11)
+on: [push, pull_request]
+concurrency:
+  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
+  cancel-in-progress: true
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    container: archlinux:latest
+    name: 'RISC-V RVV${{ matrix.vector_bits }} (qemu 11)'
+    strategy:
+      fail-fast: false
+      matrix:
+        vector_bits:
+          - 128
+          - 256
+          - 512
+    steps:
+    - name: Setup toolchain and qemu
+      run: |
+        pacman -Syu --noconfirm
+        pacman -S --noconfirm --needed \
+          qemu-user-static riscv64-linux-gnu-gcc riscv64-linux-gnu-glibc \
+          cmake ninja git ca-certificates
+        qemu-riscv64-static --version
+        riscv64-linux-gnu-gcc --version | head -1
+    - name: Checkout xsimd
+      uses: actions/checkout@v6
+    - name: Setup
+      run: >
+        cmake -S . -B _build
+        -GNinja
+        -DBUILD_TESTS=ON
+        -DDOWNLOAD_DOCTEST=ON
+        -DCMAKE_BUILD_TYPE=Release
+        -DTARGET_ARCH=generic
+        -DCMAKE_C_FLAGS="-march=rv64gcv_zvl${{ matrix.vector_bits }}b_zba_zbb_zbs -mrvv-vector-bits=zvl"
+        -DCMAKE_CXX_FLAGS="-march=rv64gcv_zvl${{ matrix.vector_bits }}b_zba_zbb_zbs -mrvv-vector-bits=zvl"
+        -DCMAKE_TOOLCHAIN_FILE=.github/toolchains/gcc-riscv64-linux-gnu.cmake
+    - name: Build
+      run: cmake --build _build
+    - name: Set CPU feature test expectations
+      run: |
+        echo "XSIMD_TEST_CPU_ASSUME_SSE4_2=0" >> "$GITHUB_ENV"
+        echo "XSIMD_TEST_CPU_ASSUME_SVE=0"    >> "$GITHUB_ENV"
+        echo "XSIMD_TEST_CPU_ASSUME_RVV=1" >> "$GITHUB_ENV"
+    - name: Testing xsimd
+      timeout-minutes: 15
+      # Invoke qemu-riscv64-static explicitly. Inside the archlinux:latest
+      # container we don't have permission to register binfmt_misc with the
+      # host kernel, so exec'ing the riscv64 ELF directly fails with
+      # "Exec format error".
+      run: >
+        QEMU_CPU="rv64,zba=true,zbb=true,zbs=true,v=true,vlen=${{ matrix.vector_bits }},elen=64,vext_spec=v1.0"
+        QEMU_LD_PREFIX="/usr/riscv64-linux-gnu"
+        qemu-riscv64-static ./test/test_xsimd
+      working-directory: _build
diff --git a/.github/workflows/cross-rvv.yml b/.github/workflows/cross-rvv.yml
index e7274627c..215823ee5 100644
--- a/.github/workflows/cross-rvv.yml
+++ b/.github/workflows/cross-rvv.yml
@@ -1,3 +1,17 @@
+# RISC-V RVV cross-compilation build (Ubuntu apt qemu, multi-compiler matrix).
+#
+# vlen=128 is intentionally NOT covered here. Ubuntu's qemu-user-static
+# (8.2.x in noble, 9.x in plucky) hangs on the xsimd test_xsimd binary at
+# vlen=128 — see QEMU issue #2137 (RVV TCG slowdowns) for the underlying
+# emulator behaviour. Until ubuntu-latest ships qemu 11+, vlen=128 coverage
+# lives in cross-rvv-arch.yml, which runs inside an archlinux:latest
+# container with qemu 11. Vlens >= 256 run fast enough under the apt qemu
+# to stay within the test step's timeout.
+#
+# References:
+#   QEMU 11.0.0 release notes:  https://www.qemu.org/2026/04/22/qemu-11-0-0/
+#   QEMU RVV slowdowns issue:   https://gitlab.com/qemu-project/qemu/-/issues/2137
+#   Ubuntu RVV vstart bug:      https://bugs.launchpad.net/ubuntu/+source/qemu/+bug/2095169
 name: RISC-V RVV cross-compilation build
 on: [push, pull_request]
 concurrency:
@@ -8,13 +22,13 @@ jobs:
     runs-on: ubuntu-latest
     name: 'RISC-V RVV${{ matrix.vector_bits }}'
     strategy:
+      fail-fast: false
       matrix:
         sys:
           - { compiler: 'gcc', gcc_runtime: '14'}
           - { compiler: 'clang', version: '17', gcc_runtime: '14'}
           - { compiler: 'clang', version: '18', gcc_runtime: '14'}
         vector_bits:
-          - 128
           - 256
           - 512
     steps:
@@ -35,14 +49,19 @@ jobs:
         sudo ln -srf $(which clang++-${{ matrix.sys.version }}) /usr/bin/clang++
         rm llvm.sh
     - name: Setup QEMU
-      uses: docker/setup-qemu-action@v3.0.0
-      with:
-        platforms: riscv64
+      # Use the qemu-user-static package shipped by the runner image rather
+      # than docker/setup-qemu-action: tonistiigi/binfmt pins an even older
+      # qemu (~6.x/7.x) whose RVV implementation miscompiles vmulh* and is
+      # known to hang test_xsimd until the 6h GHA timeout.
+      run: |
+        sudo apt-get -y -qq update
+        sudo apt-get -y -qq --no-install-suggests --no-install-recommends install qemu-user-static
+        qemu-riscv64-static --version
     - name: Setup Ninja
       run: |
         sudo apt-get -y -qq install ninja-build
     - name: Checkout xsimd
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Setup
       run: >
         cmake -S . -B _build
@@ -56,7 +75,13 @@ jobs:
         -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-riscv64-linux-gnu.cmake
     - name: Build
       run: cmake --build _build
+    - name: Set CPU feature test expectations
+      run: |
+        echo "XSIMD_TEST_CPU_ASSUME_SSE4_2=0" >> "$GITHUB_ENV"
+        echo "XSIMD_TEST_CPU_ASSUME_SVE=0"    >> "$GITHUB_ENV"
+        echo "XSIMD_TEST_CPU_ASSUME_RVV=1" >> "$GITHUB_ENV"
     - name: Testing xsimd
+      timeout-minutes: 15
       run: >
         QEMU_CPU="rv64,zba=true,zbb=true,zbs=true,v=true,vlen=${{ matrix.vector_bits }},elen=64,vext_spec=v1.0"
         QEMU_LD_PREFIX="/usr/riscv64-linux-gnu"
diff --git a/.github/workflows/cross-s390x.yml b/.github/workflows/cross-s390x.yml
new file mode 100644
index 000000000..b748d328e
--- /dev/null
+++ b/.github/workflows/cross-s390x.yml
@@ -0,0 +1,55 @@
+name: IBM Z cross-compilation build
+on: [push, pull_request]
+concurrency:
+  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    name: '${{ matrix.target.platform }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}'
+    strategy:
+      matrix:
+        target:
+          - { platform: 's390x',     dir: 's390x-linux-gnu',   full: 'OFF' }
+        sys:
+          - { compiler: 'gcc',   version: '14' }
+    steps:
+    - name: Setup compiler
+      if: ${{ matrix.sys.compiler == 'gcc' }}
+      run: |
+        sudo apt-get update || exit 1
+        sudo apt-get -y --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.version }}-${{ matrix.target.dir }} g++-${{ matrix.sys.version }}-multilib cmake || exit 1
+        sudo update-alternatives --remove-all ${{ matrix.target.dir }}-gcc || true
+        sudo update-alternatives --remove-all ${{ matrix.target.dir }}-g++ || true
+        sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-gcc ${{ matrix.target.dir }}-gcc /usr/bin/${{ matrix.target.dir }}-gcc-${{ matrix.sys.version }} 20
+        sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-g++ ${{ matrix.target.dir }}-g++ /usr/bin/${{ matrix.target.dir }}-g++-${{ matrix.sys.version }} 20
+    - name: Setup QEMU
+      run: |
+        sudo apt-get --no-install-suggests --no-install-recommends install qemu-user
+    - name: Setup Ninja
+      run: |
+        sudo apt-get install ninja-build
+    - name: Checkout xsimd
+      uses: actions/checkout@v6
+    - name: Setup
+      run: |
+        cmake -B build/ \
+              -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON \
+              -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DCMAKE_C_FLAGS="${{ matrix.target.flags }}" \
+              -DCMAKE_CXX_FLAGS="${{ matrix.target.flags }}" \
+              -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake
+    - name: Build
+      run: cmake --build build/ --verbose -j1
+    - name: Testing xsimd
+      run: |
+        # Set CPU feature test expectations, 0 is explicit absence of the feature
+        export XSIMD_TEST_CPU_ASSUME_SSE4_2="0"
+        export XSIMD_TEST_CPU_ASSUME_NEON64="0"
+        export XSIMD_TEST_CPU_ASSUME_RVV="0"
+        export XSIMD_TEST_CPU_ASSUME_VSX="0"
+        export XSIMD_TEST_CPU_ASSUME_VXE="1"
+
+        qemu-${{ matrix.target.platform }} -L /usr/${{ matrix.target.dir}}/ ./build/test/test_xsimd
diff --git a/.github/workflows/cross-sve.yml b/.github/workflows/cross-sve.yml
index 4cd292f27..8a2ffe102 100644
--- a/.github/workflows/cross-sve.yml
+++ b/.github/workflows/cross-sve.yml
@@ -27,14 +27,26 @@ jobs:
       run: |
         sudo apt-get install ninja-build
     - name: Checkout xsimd
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Setup
       run: |
-        mkdir _build
-        cd _build && cmake .. -GNinja -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DTARGET_ARCH=generic -DCMAKE_C_FLAGS="-march=armv8-a+sve -msve-vector-bits=${{ matrix.vector_bits }}" -DCMAKE_CXX_FLAGS="-march=armv8-a+sve -msve-vector-bits=${{ matrix.vector_bits }}" -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/gcc-aarch64-linux-gnu.cmake
+        cmake -B _build \
+              -GNinja \
+              -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DTARGET_ARCH=generic \
+              -DCMAKE_C_FLAGS="-march=armv8-a+sve -msve-vector-bits=${{ matrix.vector_bits }}" \
+              -DCMAKE_CXX_FLAGS="-march=armv8-a+sve -msve-vector-bits=${{ matrix.vector_bits }}" \
+              -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/gcc-aarch64-linux-gnu.cmake
     - name: Build
       run: cmake --build _build
-    - name: Testing xsimd
+    - name: Set CPU feature test expectations
       run: |
-        qemu-aarch64 --cpu max,sve${{ matrix.vector_bits }}=on -L /usr/aarch64-linux-gnu/ ./test/test_xsimd
+        echo "XSIMD_TEST_CPU_ASSUME_SSE4_2=0" >> "$GITHUB_ENV"
+        echo "XSIMD_TEST_CPU_ASSUME_RVV=0" >> "$GITHUB_ENV"
+        echo "XSIMD_TEST_CPU_ASSUME_NEON64=1" >> "$GITHUB_ENV"
+        echo "XSIMD_TEST_CPU_ASSUME_SVE=1"    >> "$GITHUB_ENV"
+        echo "XSIMD_TEST_CPU_ASSUME_MANUFACTURER=unknown" >> "$GITHUB_ENV"
+    - name: Testing xsimd
+      run: qemu-aarch64 --cpu max,sve${{ matrix.vector_bits }}=on -L /usr/aarch64-linux-gnu/ ./test/test_xsimd
       working-directory: ${{ github.workspace }}/_build
diff --git a/.github/workflows/cxx-no-exceptions.yml b/.github/workflows/cxx-no-exceptions.yml
index add1c693b..e48ae31ca 100644
--- a/.github/workflows/cxx-no-exceptions.yml
+++ b/.github/workflows/cxx-no-exceptions.yml
@@ -4,14 +4,9 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v3
-    - name: Install dependencies
-      run: |
-          sudo apt install g++
+    - uses: actions/checkout@v6
     - name: Setup
-      run: |
-        mkdir _build
-        cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS=-fno-exceptions
+      run: cmake -B _build -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS=-fno-exceptions
     - name: Build
       run: cmake --build _build
 
diff --git a/.github/workflows/cxx-versions.yml b/.github/workflows/cxx-versions.yml
index 5ec2f0768..7bfcac2f8 100644
--- a/.github/workflows/cxx-versions.yml
+++ b/.github/workflows/cxx-versions.yml
@@ -3,21 +3,46 @@ on: [push, pull_request]
 concurrency:
   group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
   cancel-in-progress: true
+
 jobs:
-  build:
+  build-unix:
+    name: 'Unix C++${{ matrix.cxx-version }}'
     runs-on: ubuntu-latest
     strategy:
       matrix:
         cxx-version: [14, 17, 20]
     steps:
-    - uses: actions/checkout@v3
-    - name: Install dependencies
-      run: |
-          sudo apt install g++
+    - uses: actions/checkout@v6
     - name: Setup
+      run: cmake -B build/ -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=${{matrix.cxx-version}}
+    - name: Build
+      run: cmake --build build/
+    - name: Test
+      run: ./build/test/test_xsimd
+
+  build-msvc:
+    name: 'MSVC C++${{ matrix.cxx-version }}'
+    defaults:
+      run:
+        shell: bash {0}
+    runs-on: windows-2022
+    strategy:
+      matrix:
+        cxx-version: [14, 17, 20]
+    steps:
+    - name: Setup compiler
+      uses: ilammy/msvc-dev-cmd@v1
+      with:
+        arch: amd64
+    - name: Setup Ninja
       run: |
-        mkdir _build
-        cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=${{matrix.cxx-version}}
+        python3 -m pip install --upgrade pip setuptools wheel
+        python3 -m pip install ninja
+    - uses: actions/checkout@v6
+    - name: Setup
+      run: cmake -B build/ -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=${{matrix.cxx-version}} -G Ninja
     - name: Build
-      run: cmake --build _build
+      run: cmake --build build/
+    - name: Test
+      run: ./build/test/test_xsimd
 
diff --git a/.github/workflows/doxygen.yml b/.github/workflows/doxygen.yml
index 910206321..00826b921 100644
--- a/.github/workflows/doxygen.yml
+++ b/.github/workflows/doxygen.yml
@@ -7,7 +7,7 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v6
     - name: Install dependencies
       run: sudo apt install doxygen python3-breathe python3-sphinx-rtd-theme
     - name: Render
diff --git a/.github/workflows/emscripten.yml b/.github/workflows/emscripten.yml
index 619690589..988c348df 100644
--- a/.github/workflows/emscripten.yml
+++ b/.github/workflows/emscripten.yml
@@ -9,7 +9,7 @@ jobs:
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v6
 
       - uses: mamba-org/setup-micromamba@v2
         with:
diff --git a/.github/workflows/emulated.yml b/.github/workflows/emulated.yml
index 0a5411d27..bc4781cbb 100644
--- a/.github/workflows/emulated.yml
+++ b/.github/workflows/emulated.yml
@@ -9,63 +9,36 @@ defaults:
 jobs:
   build:
     runs-on: ubuntu-latest
-    name: '${{ matrix.sys.compiler }} ${{ matrix.sys.version }} - emulated'
+    name: '${{ matrix.sys.compiler }} - emulated<${{ matrix.sys.size }}>'
     strategy:
       matrix:
         sys:
-          - { compiler: 'gcc',   version: '14'}
-          - { compiler: 'clang', version: '18'}
+          - { compiler: 'g++',   size: '128'}
+          - { compiler: 'g++',   size: '256'}
+          - { compiler: 'g++',   size: '512'}
     steps:
-    - name: Setup compiler
-      if: ${{ matrix.sys.compiler == 'gcc' }}
-      run: |
-        GCC_VERSION=${{ matrix.sys.version }}
-        sudo apt-get update
-        sudo apt-get --no-install-suggests --no-install-recommends install g++-$GCC_VERSION
-        CC=gcc-$GCC_VERSION
-        echo "CC=$CC" >> $GITHUB_ENV
-        CXX=g++-$GCC_VERSION
-        echo "CXX=$CXX" >> $GITHUB_ENV
-        CXXFLAGS="-Wno-noexcept-type -Wno-stringop-overflow -Wno-maybe-uninitialized"
-        echo "CXXFLAGS=$CXXFLAGS" >> $GITHUB_ENV
-    - name: Setup compiler
-      if: ${{ matrix.sys.compiler == 'clang' }}
-      run: |
-        LLVM_VERSION=${{ matrix.sys.version }}
-        sudo apt-get update || exit 1
-        sudo apt-get --no-install-suggests --no-install-recommends install clang-$LLVM_VERSION || exit 1
-        sudo apt-get --no-install-suggests --no-install-recommends install g++ g++-multilib || exit 1
-        sudo ln -s /usr/include/asm-generic /usr/include/asm
-        CC=clang-$LLVM_VERSION
-        echo "CC=$CC" >> $GITHUB_ENV
-        CXX=clang++-$LLVM_VERSION
-        echo "CXX=$CXX" >> $GITHUB_ENV
     - name: Checkout xsimd
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Install mamba
       uses: mamba-org/setup-micromamba@v1
       with:
         environment-file: environment.yml
+    - name: Setup GCC compiler
+      if: ${{ matrix.sys.compiler == 'g++' }}
+      run: echo "CXXFLAGS=-Wno-noexcept-type -Wno-stringop-overflow -Wno-maybe-uninitialized" >> $GITHUB_ENV
     - name: Configure build
-      env:
-        CC: ${{ env.CC }}
-        CXX: ${{ env.CXX }}
       run: |
-
-        mkdir _build
-        cd _build
-        cmake  .. -DBUILD_TESTS=ON \
-                  -DBUILD_BENCHMARK=ON \
-                  -DBUILD_EXAMPLES=ON \
-                  -DCMAKE_BUILD_TYPE=Release \
-                  -DCMAKE_C_COMPILER=$CC \
-                  -DCMAKE_CXX_COMPILER=$CXX \
-                  -DXSIMD_ENABLE_WERROR=ON \
-                  -DCMAKE_CXX_FLAGS="-DXSIMD_DEFAULT_ARCH=emulated\<128\> -DXSIMD_WITH_EMULATED=1 ${CXXFLAGS}" \
-                  -G Ninja
+        cmake -B _build \
+          -DBUILD_TESTS=ON \
+          -DBUILD_BENCHMARK=ON \
+          -DBUILD_EXAMPLES=ON \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DCMAKE_CXX_COMPILER=${{ matrix.sys.compiler }} \
+          -DXSIMD_ENABLE_WERROR=ON \
+          -DTARGET_ARCH="emulated<${{ matrix.sys.size }}>" \
+          -DCMAKE_CXX_FLAGS="${CXXFLAGS}" \
+          -GNinja
     - name: Build
       run: ninja -C _build
     - name: Test
-      run: |
-        cd _build/test
-        ./test_xsimd
+      run: ninja -C _build xtest
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 193038e75..03a914bda 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -1,4 +1,4 @@
-name: Linux build
+name: Linux x86 build
 on: [push, pull_request]
 concurrency:
   group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
@@ -6,6 +6,7 @@ concurrency:
 defaults:
   run:
     shell: bash -l {0}
+
 jobs:
   build:
     runs-on: ubuntu-latest
@@ -13,9 +14,10 @@ jobs:
     strategy:
       matrix:
         sys:
-          - { compiler: 'gcc',   version: '12',  flags: 'force_no_instr_set' }
-          - { compiler: 'gcc',   version: '13',  flags: 'enable_xtl_complex' }
-          - { compiler: 'gcc',   version: '14',  flags: 'avx' }
+          - { compiler: 'gcc',   version: '12', flags: 'force_no_instr_set' }
+          - { compiler: 'gcc',   version: '13', flags: 'enable_xtl_complex' }
+          - { compiler: 'gcc',   version: '14', flags: 'avx' }
+          - { compiler: 'gcc',   version: '14', flags: 'avx2' }
           - { compiler: 'gcc',   version: '13', flags: 'avx512' }
           - { compiler: 'gcc',   version: '10', flags: 'avx512' }
           - { compiler: 'gcc',   version: '12', flags: 'i386' }
@@ -23,11 +25,15 @@ jobs:
           - { compiler: 'gcc',   version: '13', flags: 'avx512vbmi' }
           - { compiler: 'gcc',   version: '14', flags: 'avx512vbmi2' }
           - { compiler: 'gcc',   version: '13', flags: 'avx512vnni' }
-          - { compiler: 'clang', version: '16',  flags: 'force_no_instr_set' }
+          - { compiler: 'clang', version: '16', flags: 'force_no_instr_set' }
           - { compiler: 'clang', version: '16', flags: 'enable_xtl_complex' }
           - { compiler: 'clang', version: '17', flags: 'avx' }
           - { compiler: 'clang', version: '17', flags: 'sse3' }
           - { compiler: 'clang', version: '18', flags: 'avx512' }
+          - { compiler: 'clang', version: '18', flags: 'avx_128' }
+          - { compiler: 'clang', version: '18', flags: 'avx2_128' }
+          - { compiler: 'clang', version: '18', flags: 'avx512vl_128' }
+          - { compiler: 'clang', version: '18', flags: 'avx512vl_256' }
     steps:
     - name: Setup compiler
       if: ${{ matrix.sys.compiler == 'gcc' }}
@@ -56,7 +62,7 @@ jobs:
         CXX=clang++-$LLVM_VERSION
         echo "CXX=$CXX" >> $GITHUB_ENV
     - name: Checkout xsimd
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Install mamba
       uses: mamba-org/setup-micromamba@v2
       with:
@@ -75,12 +81,31 @@ jobs:
         if [[ '${{ matrix.sys.flags }}' == 'avx' ]]; then
           CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge"
         fi
+        if [[ '${{ matrix.sys.flags }}' == 'avx_128' ]]; then
+          CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge"
+          CXX_FLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx_128"
+        fi
+        if [[ '${{ matrix.sys.flags }}' == 'avx2' ]]; then
+          CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=haswell"
+        fi
+        if [[ '${{ matrix.sys.flags }}' == 'avx2_128' ]]; then
+          CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=haswell"
+          CXX_FLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx2_128"
+        fi
         if [[ '${{ matrix.sys.flags }}' == 'sse3' ]]; then
           CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=nocona"
         fi
         if [[ '${{ matrix.sys.flags }}' == 'avx512' ]]; then
           CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512"
         fi
+        if [[ '${{ matrix.sys.flags }}' == 'avx512vl_128' ]]; then
+          CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512"
+          CXX_FLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_128"
+        fi
+        if [[ '${{ matrix.sys.flags }}' == 'avx512vl_256' ]]; then
+          CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512"
+          CXX_FLAGS="$CXX_FLAGS -DXSIMD_DEFAULT_ARCH=avx512vl_256"
+        fi
         if [[ '${{ matrix.sys.flags }}' == 'avx512pf' ]]; then
           CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=knl"
         fi
@@ -105,25 +130,34 @@ jobs:
         # Cheap way of spotting uninitialized read
         CXX_FLAGS="$CXX_FLAGS -ftrivial-auto-var-init=pattern"
 
-        mkdir _build
-        cd _build
-        cmake  .. -DBUILD_TESTS=ON \
-                  -DBUILD_BENCHMARK=ON \
-                  -DBUILD_EXAMPLES=ON \
-                  -DCMAKE_BUILD_TYPE=Release \
-                  -DCMAKE_C_COMPILER=$CC \
-                  -DCMAKE_CXX_COMPILER=$CXX \
-                  $CMAKE_EXTRA_ARGS \
-                  -DCMAKE_CXX_FLAGS='$CXX_FLAGS' \
-                  -G Ninja
+        cmake -B _build \
+              -DBUILD_TESTS=ON \
+              -DBUILD_BENCHMARK=ON \
+              -DBUILD_EXAMPLES=ON \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DCMAKE_C_COMPILER=$CC \
+              -DCMAKE_CXX_COMPILER=$CXX \
+              $CMAKE_EXTRA_ARGS \
+              -DCMAKE_CXX_FLAGS="$CXX_FLAGS" \
+              -G Ninja
     - name: Build
-      run: ninja -C _build
+      run: cmake --build _build
     - name: Test
       run: |
-        cd _build
-        cd test
+        # Set CPU feature test expectations, 0 is explicit absence of the feature
+        export XSIMD_TEST_CPU_ASSUME_NEON64="0"
+        export XSIMD_TEST_CPU_ASSUME_RVV="0"
+        export XSIMD_TEST_CPU_ASSUME_VSX="0"
+        export XSIMD_TEST_CPU_ASSUME_VXE="0"
+        cd _build/test
         if echo '${{ matrix.sys.flags }}' | grep -q 'avx512' ; then
+          # Running with emulation, must have AVX512, lower tier are checked by implications in tests
+          export XSIMD_TEST_CPU_ASSUME_AVX512F="1"
           ../../sde-external-9.48.0-2024-11-25-lin/sde64 -tgl -- ./test_xsimd
         else
+          export XSIMD_TEST_CPU_ASSUME_SSE4_2=$(grep -q 'sse4_2' /proc/cpuinfo && echo "1" || echo "0")
+          export XSIMD_TEST_CPU_ASSUME_AVX=$(grep -q 'avx' /proc/cpuinfo && echo "1" || echo "0")
+          export XSIMD_TEST_CPU_ASSUME_AVX512F=$(grep -q 'avx512f' /proc/cpuinfo && echo "1" || echo "0")
+          export XSIMD_TEST_CPU_ASSUME_MANUFACTURER="intel,amd"
           ./test_xsimd
         fi
diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
index f2b17faa9..14132f5db 100644
--- a/.github/workflows/macos.yml
+++ b/.github/workflows/macos.yml
@@ -3,6 +3,7 @@ on: [push, pull_request]
 concurrency:
   group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
   cancel-in-progress: true
+
 jobs:
   build:
     strategy:
@@ -14,14 +15,24 @@ jobs:
     runs-on: macos-${{ matrix.os }}
     name: 'macos-${{ matrix.os }}'
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v6
     - name: Setup
-      run: |
-        mkdir _build
-        cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
+      run: cmake -B _build -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
     - name: Build
       run: cmake --build _build --verbose
     - name: Testing sequential
       run: cmake --build _build --target xbenchmark --verbose
+    - name: Set CPU feature test expectations
+      run: |
+        echo "XSIMD_TEST_CPU_ASSUME_RVV=0" >> "$GITHUB_ENV"
+        if echo '${{ matrix.os }}' | grep -q intel; then
+          echo "XSIMD_TEST_CPU_ASSUME_NEON64=0" >> "$GITHUB_ENV"
+          echo "XSIMD_TEST_CPU_ASSUME_SSE4_2=1" >> "$GITHUB_ENV"
+          echo "XSIMD_TEST_CPU_ASSUME_MANUFACTURER=intel" >> "$GITHUB_ENV"
+        else
+          echo "XSIMD_TEST_CPU_ASSUME_NEON64=1" >> "$GITHUB_ENV"
+          echo "XSIMD_TEST_CPU_ASSUME_SSE4_2=0" >> "$GITHUB_ENV"
+          echo "XSIMD_TEST_CPU_ASSUME_MANUFACTURER=unknown" >> "$GITHUB_ENV"
+        fi
     - name: Testing xsimd
       run: ${{github.workspace}}/_build/test/test_xsimd
diff --git a/.github/workflows/sanitizer.yml b/.github/workflows/sanitizer.yml
index 7eceeadc9..520be087f 100644
--- a/.github/workflows/sanitizer.yml
+++ b/.github/workflows/sanitizer.yml
@@ -22,7 +22,7 @@ jobs:
       CXX: clang++-${{ matrix.llvm-version }}
     steps:
     - name: Checkout xsimd
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Setup compiler
       run: |
           wget https://apt.llvm.org/llvm.sh
@@ -30,18 +30,15 @@ jobs:
           sudo ./llvm.sh ${{ matrix.llvm-version }}
     - name: Configure build
       run: |
-        mkdir _build
-        cd _build
-        cmake  .. -DBUILD_TESTS=ON \
-                  -DBUILD_BENCHMARK=ON \
-                  -DBUILD_EXAMPLES=ON \
-                  -DDOWNLOAD_DOCTEST=ON \
-                  -DCMAKE_BUILD_TYPE=Debug \
-                  -DCMAKE_CXX_FLAGS='-f${{ matrix.flags }} -O0 -g -fno-inline' \
-                  -G Ninja
+        cmake -B_build \
+              -DBUILD_TESTS=ON \
+              -DBUILD_BENCHMARK=ON \
+              -DBUILD_EXAMPLES=ON \
+              -DDOWNLOAD_DOCTEST=ON \
+              -DCMAKE_BUILD_TYPE=Debug \
+              -DCMAKE_CXX_FLAGS='-f${{ matrix.flags }} -O0 -g -fno-inline' \
+              -G Ninja
     - name: Build
-      run: ninja -C _build
+      run: cmake --build _build
     - name: Test
-      run: |
-        cd _build/test
-        ./test_xsimd
+      run: ./_build/test/test_xsimd
diff --git a/.github/workflows/style-check.yml b/.github/workflows/style-check.yml
index 81f0e8383..f308b99eb 100644
--- a/.github/workflows/style-check.yml
+++ b/.github/workflows/style-check.yml
@@ -8,17 +8,43 @@ jobs:
     name: Format check
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
-    - name: Run clang-format style check for C/C++ programs.
-      uses: jidicula/clang-format-action@v4.11.0
-      with:
-        clang-format-version: '17'
-        exclude-regex: 'doctest.h'
+    - uses: actions/checkout@v6
+    - run: sudo apt install clang-format
+    - run: |
+        git fetch origin ${{ github.event.pull_request.base.sha }}
+        git clang-format --diff ${{ github.event.pull_request.base.sha }} | tee diff.patch
+        ! grep -q '^diff ' diff.patch
+
   inlining-check:
     runs-on: ubuntu-latest
     name: Check inline keyword usage
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v6
     - run: sudo apt install clang-tools
     - run: sh ./test/check_inline_specifier.sh .
 
+  include-check:
+    runs-on: ubuntu-latest
+    name: Check unused standard includes
+    steps:
+    - uses: actions/checkout@v6
+    - run: pip install diskarzhan
+    - run: diskarzhan `find -name '*.[ch]pp'`
+
+  clang-tidy-check:
+    name: Clang-tidy check (x86_64)
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v6
+    - run: sudo apt install clang-tidy
+    - name: Configure
+      run: cmake -B _build
+              -DCMAKE_CXX_COMPILER=clang++
+              -DBUILD_TESTS=ON
+              -DDOWNLOAD_DOCTEST=ON
+              -DCMAKE_BUILD_TYPE=Debug
+              -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+              -DCMAKE_CXX_FLAGS='-march=tigerlake'
+              .
+    - name: Check
+      run: run-clang-tidy -p _build
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index e9e782aed..10ce6e982 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -3,8 +3,9 @@ on: [push, pull_request]
 concurrency:
   group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
   cancel-in-progress: true
+
 jobs:
-  build:
+  build-windows-x86:
     name: 'MSVC ${{ matrix.os }}, ${{ matrix.target }} ${{ matrix.sys.set }}'
     defaults:
       run:
@@ -41,18 +42,21 @@ jobs:
         python3 -m pip install --upgrade pip setuptools wheel
         python3 -m pip install ninja
     - name: Checkout xsimd
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Setup
-      run: |
-        mkdir _build
-        cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="${{ matrix.sys.flags }}" -G Ninja
+      run: cmake -B _build -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="${{ matrix.sys.flags }}" -G Ninja
     - name: Build
-      run: |
-        cd _build && cmake --build .
+      run: cmake --build _build
     - name: Testing xsimd
       if: ${{ !startsWith(matrix.sys.set, 'AVX512') }}
-      run: |
-        cd _build && ./test/test_xsimd
+      env:
+        # Set CPU feature test expectations
+        # Assuming the runner always has AVX2 (independent of compilation option)
+        XSIMD_TEST_CPU_ASSUME_NEON64: "0"
+        XSIMD_TEST_CPU_ASSUME_SSE4_2: "1"
+        XSIMD_TEST_CPU_ASSUME_AVX2: "1"
+        XSIMD_TEST_CPU_ASSUME_MANUFACTURER: "intel,amd"
+      run: ./_build/test/test_xsimd
 
   build-windows-mingw:
     name: 'MSYS2 ${{ matrix.msystem }}'
@@ -80,29 +84,34 @@ jobs:
             cmake:p
             ninja:p
       - name: Checkout xsimd
-        uses: actions/checkout@v2
+        uses: actions/checkout@v6
       - name: Configure
-        run: |
-          mkdir _build
-          cd _build
-          cmake .. -DBUILD_TESTS=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DDOWNLOAD_DOCTEST=ON -G Ninja
+        run: cmake -B _build -DBUILD_TESTS=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DDOWNLOAD_DOCTEST=ON -G Ninja
       - name: Build
-        run: ninja -C _build
+        run: cmake --build _build
       - name: Test
-        run: |
-          cd _build && ./test/test_xsimd
+        run: ./_build/test/test_xsimd
 
-  build-windows-arm64:
-    name: 'MSVC arm64'
+  build-windows-clang-cl:
+    name: 'clang-cl x64 ${{ matrix.config.name }}'
     defaults:
       run:
         shell: bash {0}
-    runs-on: windows-11-arm
+    strategy:
+      matrix:
+        config:
+          - { name: "AVX2", flags: "/arch:AVX2", benchmark: "ON", examples: "ON" }
+          - { name: "/fp:fast", flags: "/fp:fast", benchmark: "OFF", examples: "OFF" }
+    runs-on: windows-2025
     steps:
     - name: Setup compiler
       uses: ilammy/msvc-dev-cmd@v1
       with:
         arch: amd64
+    - name: Check clang-cl
+      run: |
+        command -v clang-cl
+        clang-cl --version
     - name: Setup Ninja
       run: |
         python3 -m pip install --upgrade pip setuptools wheel
@@ -111,11 +120,41 @@ jobs:
       uses: actions/checkout@v3
     - name: Setup
       run: |
-        mkdir _build
-        cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -G Ninja
+        cmake -B _build \
+              -DBUILD_TESTS=ON \
+              -DDOWNLOAD_DOCTEST=ON \
+              -DBUILD_BENCHMARK=${{ matrix.config.benchmark }} \
+              -DBUILD_EXAMPLES=${{ matrix.config.examples }} \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DCMAKE_C_COMPILER=clang-cl \
+              -DCMAKE_CXX_COMPILER=clang-cl \
+              -DCMAKE_CXX_FLAGS="${{ matrix.config.flags }} -DXSIMD_REASSOCIATIVE_MATH=1" \
+              -G Ninja
     - name: Build
-      run: |
-        cd _build && cmake --build .
+      run: cmake --build _build
     - name: Testing xsimd
+      run: ./_build/test/test_xsimd
+
+  build-windows-arm64:
+    name: 'MSVC arm64'
+    defaults:
+      run:
+        shell: bash {0}
+    runs-on: windows-11-arm
+    steps:
+    - name: Setup compiler
+      uses: ilammy/msvc-dev-cmd@v1
+      with:
+        arch: arm64
+    - name: Setup Ninja
       run: |
-        cd _build && ./test/test_xsimd
+        python3 -m pip install --upgrade pip setuptools wheel
+        python3 -m pip install ninja
+    - name: Checkout xsimd
+      uses: actions/checkout@v6
+    - name: Setup
+      run: cmake -B _build -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -G Ninja
+    - name: Build
+      run: cmake --build _build
+    - name: Testing xsimd
+      run: ./_build/test/test_xsimd
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f817b6b51..66c01f281 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,16 +9,17 @@
 # The full license is in the file LICENSE, distributed with this software. #
 ############################################################################
 
-cmake_minimum_required(VERSION 3.10)
+cmake_minimum_required(VERSION 3.13)
+
 project(xsimd)
-option(XSIMD_REFACTORING ON)
 
-set(XSIMD_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
+option(ENABLE_XTL_COMPLEX "enables support for xcomplex defined in xtl" OFF)
+option(BUILD_TESTS "xsimd test suite" OFF)
 
 # Versioning
 # ==========
 
-file(STRINGS "${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_config.hpp" xsimd_version_defines
+file(STRINGS "include/xsimd/config/xsimd_config.hpp" xsimd_version_defines
      REGEX "#define XSIMD_VERSION_(MAJOR|MINOR|PATCH)")
 foreach(ver ${xsimd_version_defines})
     if(ver MATCHES "#define XSIMD_VERSION_(MAJOR|MINOR|PATCH) +([^ ]+)$")
@@ -32,74 +33,25 @@ message(STATUS "xsimd v${${PROJECT_NAME}_VERSION}")
 # Build
 # =====
 
-set(XSIMD_HEADERS
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_constants.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_avx.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_avx2.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_sse.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma4.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_common.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_isa.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_neon.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_neon64.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_rvv.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_scalar.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse2.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse3.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse4_1.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse4_2.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_ssse3.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_vsx.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sve.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_arch.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_config.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_cpuid.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/memory/xsimd_aligned_allocator.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/memory/xsimd_alignment.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_all_registers.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_api.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon64_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx2_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx512f_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_batch.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_batch_constant.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma3_avx_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma3_avx2_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma3_sse_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma4_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_common_arch.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_vsx_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_rvv_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse2_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse3_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse4_1_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse4_2_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_ssse3_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sve_register.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_traits.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_utils.hpp
-${XSIMD_INCLUDE_DIR}/xsimd/xsimd.hpp
-)
-
 add_library(xsimd INTERFACE)
+add_library(xsimd::xsimd ALIAS xsimd)
 
 target_include_directories(xsimd INTERFACE
-    $<BUILD_INTERFACE:${XSIMD_INCLUDE_DIR}>
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
     $<INSTALL_INTERFACE:include>)
 
-OPTION(ENABLE_XTL_COMPLEX "enables support for xcomplex defined in xtl" OFF)
-OPTION(BUILD_TESTS "xsimd test suite" OFF)
+target_compile_features(xsimd INTERFACE cxx_std_14)
 
+# Only add xtl build option to the build tree, that is, if xsimd being locally
+# developed or is vendored.
+# Otherwise (if an install is performed), this will be handled in the user
+# cmake script (xsimdConfig.cmake).
 if(ENABLE_XTL_COMPLEX)
     find_package(xtl 0.8.0 REQUIRED)
-    target_compile_features(xsimd INTERFACE cxx_std_14)
-    target_compile_definitions(xsimd INTERFACE XSIMD_ENABLE_XTL_COMPLEX=1)
-    target_link_libraries(xsimd INTERFACE xtl)
-else()
-    target_compile_features(xsimd INTERFACE cxx_std_11)
+    target_link_libraries(xsimd INTERFACE $<BUILD_INTERFACE:xtl>)
+    target_compile_definitions(xsimd INTERFACE
+        $<BUILD_INTERFACE:XSIMD_ENABLE_XTL_COMPLEX=1>
+    )
 endif()
 
 if(BUILD_TESTS)
@@ -125,8 +77,6 @@ if(${XSIMD_SKIP_INSTALL})
     return() # skip installation
 endif ()
 
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
-include(JoinPaths)
 include(GNUInstallDirs)
 include(CMakePackageConfigHelpers)
 
@@ -137,7 +87,7 @@ install(TARGETS xsimd
 export(EXPORT ${PROJECT_NAME}-targets
        FILE "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Targets.cmake")
 
-install(DIRECTORY ${XSIMD_INCLUDE_DIR}/xsimd
+install(DIRECTORY include/xsimd
         DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 
 # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
diff --git a/Changelog.rst b/Changelog.rst
index cc5d373cb..2a600241b 100644
--- a/Changelog.rst
+++ b/Changelog.rst
@@ -9,6 +9,102 @@
 Changelog
 =========
 
+14.2.0
+------
+
+    * **New architecture**: IBM Z (s390x) support
+
+    * [API] New cross-platform ``cpu_features`` API for querying CPU features available at runtime
+
+    * [API] Add ``xsimd::get<I>()`` for compile-time lane extraction
+
+    * [API] Add ``xsimd::stream_load``, ``xsimd::stream_store``, and ``xsimd::fence`` for non-temporal memory transfers
+
+    * [VSX] Fix dynamic dispatch support with runtime cpu feature inspection
+
+    * [VSX] Fix rounding
+
+    * [SVE/RVV] Fix dynamic dispatch by inspecting available vector length
+
+    * [AVX2] Add native ``uint64``/``int64`` multiplication kernel
+
+    * [NEON] Add support for Windows ARM
+
+    * [NEON] Simplify static dispatch of intrinsics
+
+    * [NEON] Fix ``batch_bool`` store on ARM by replacing ``vst1_lane_u32`` with a full
+      lane store followed by a memcpy
+
+    * [SVE] Fix dynamic dispatch ODR violation
+
+    * [ci] Fix emulated architecture interaction with AVX512 leading to CI failures.
+      Provide a cmake-level configuration switch for emulated build
+
+    * Fix build with compilers that do not support C++20 (even though we only require C++14)
+
+    * Fix ``xsimd::signbit`` scalar overload leaking into non-scalar overload resolution
+
+    * Fix complex batch load
+
+    * Harden fast-math reassociation barriers
+
+    * Publish the C++14 requirement through the CMake interface
+
+14.1.0
+------
+
+    * Add popcnt and bmi
+
+    * [API] Add bitwise-shift batch constant api
+
+    * Refactor x86 CPU features
+
+    * [NEON] Unsigned bitwise shifts are never called
+
+    * Improve coverage of emulated architectures
+
+    * Introduce ``count{l,r}_{zero,one}`` for ``batch_bool``
+
+    * Fix emulated mask()
+
+    * [neon] Implement bitwise_rshift for 64 bit integers on arm32
+
+    * Fix fast_cast int64/uint64→double under -ffast-math
+
+    * Small complexity reduction
+
+    * Add make_batch_constant from std::array in C++20
+
+    * [ci] Use home-baked clang-format action
+
+    * Fix apple detection
+
+    * [ci] add GCC 10 with AVX-512 to test matrix
+
+    * Slightly less pessimistic detection of neon64
+
+    * Fix runtime detection of SVE
+
+    * [ci] Setup Windows arm64 runner
+
+    * iota batch constant and a few overloads
+
+    * [test] Improve testing logging and accuracy
+
+    * Fix default values for AVX and AVX512 OS state enabled flags
+
+    * Implement batch_bool::mask() for riscv
+
+    * [ci] Revert emscripten to 4.0.21
+
+    * Restore RISCV support
+
+    * Implement optimized movemasks for NEON
+
+    * Fix limit behavior of atan2 under -ffast-math
+
+    * Move to C++14
+
 14.0.0
 ------
 
diff --git a/README.md b/README.md
index 9b7861210..2c2e459b1 100644
--- a/README.md
+++ b/README.md
@@ -50,12 +50,13 @@ The following SIMD instruction set extensions are supported:
 Architecture | Instruction set extensions
 -------------|-----------------------------------------------------
 x86          | SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, FMA3+SSE, FMA3+AVX, FMA3+AVX2
-x86          | AVX512BW, AVX512CD, AVX512DQ, AVX512F (gcc7 and higher)
+x86          | AVX512BW, AVX512CD, AVX512DQ, AVX512F, AVX512VL (gcc7 and higher)
 x86 AMD      | FMA4
 ARM          | NEON, NEON64, SVE128/256/512 (fixed vector size)
 WebAssembly  | WASM
 powerpc64    | VSX
 RISC-V       | RISC-V128/256/512 (fixed vector size)
+IBM Z (s390x)| VXE (IBM z14)
 
 ## Installation
 
@@ -104,7 +105,7 @@ http://xsimd.readthedocs.io/
 |   9.x   |     ^0.7.0       |
 |   8.x   |     ^0.7.0       |
 
-The dependency on `xtl` is required if you want to support vectorization for `xtl::xcomplex`. In this case, you must build your project with C++14 support enabled.
+The dependency on `xtl` is required if you want to support vectorization for `xtl::xcomplex`.
 
 ## Usage
 
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 732d8d7b7..dc9011f3f 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -9,13 +9,12 @@
 # The full license is in the file LICENSE, distributed with this software. #
 ############################################################################
 
-cmake_minimum_required(VERSION 3.10)
+cmake_minimum_required(VERSION 3.13)
 
 if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
     project(xsimd-benchmark)
 
     find_package(xsimd REQUIRED CONFIG)
-    set(XSIMD_INCLUDE_DIR ${xsimd_INCLUDE_DIRS})
 endif ()
 
 if(NOT CMAKE_BUILD_TYPE)
@@ -63,18 +62,16 @@ if(MSVC)
     endforeach()
 endif()
 
-include_directories(${XSIMD_INCLUDE_DIR})
-
-set(XSIMD_BENCHMARK
+set(XSIMD_BENCHMARK_SRC
     main.cpp
     xsimd_benchmark.hpp
 )
 
-set(XSIMD_BENCHMARK_TARGET benchmark_xsimd)
-add_executable(${XSIMD_BENCHMARK_TARGET} ${XSIMD_BENCHMARK} ${XSIMD_HEADERS})
+add_executable(benchmark_xsimd ${XSIMD_BENCHMARK_SRC})
+target_link_libraries(benchmark_xsimd PRIVATE xsimd)
 
 if(ENABLE_XTL_COMPLEX)
     target_link_libraries(benchmark_xsimd PRIVATE xtl)
 endif()
 
-add_custom_target(xbenchmark COMMAND benchmark_xsimd DEPENDS ${XSIMD_BENCHMARK_TARGET})
+add_custom_target(xbenchmark COMMAND benchmark_xsimd DEPENDS benchmark_xsimd)
diff --git a/benchmark/main.cpp b/benchmark/main.cpp
index 7a630e461..e5ef24365 100644
--- a/benchmark/main.cpp
+++ b/benchmark/main.cpp
@@ -10,7 +10,10 @@
  ****************************************************************************/
 
 #include "xsimd_benchmark.hpp"
+
+#include <iostream>
 #include <map>
+#include <string>
 
 void benchmark_operation()
 {
diff --git a/benchmark/xsimd_benchmark.hpp b/benchmark/xsimd_benchmark.hpp
index 6f6b91bf2..3f48cff40 100644
--- a/benchmark/xsimd_benchmark.hpp
+++ b/benchmark/xsimd_benchmark.hpp
@@ -14,8 +14,8 @@
 
 #include "xsimd/arch/xsimd_scalar.hpp"
 #include "xsimd/xsimd.hpp"
+
 #include <chrono>
-#include <iostream>
 #include <string>
 #include <vector>
 
diff --git a/cmake/JoinPaths.cmake b/cmake/JoinPaths.cmake
deleted file mode 100644
index 32d6d6685..000000000
--- a/cmake/JoinPaths.cmake
+++ /dev/null
@@ -1,26 +0,0 @@
-# This module provides function for joining paths
-# known from from most languages
-#
-# Original license:
-# SPDX-License-Identifier: (MIT OR CC0-1.0)
-# Explicit permission given to distribute this module under
-# the terms of the project as described in /LICENSE.rst.
-# Copyright 2020 Jan Tojnar
-# https://github.com/jtojnar/cmake-snips
-#
-# Modelled after Python’s os.path.join
-# https://docs.python.org/3.7/library/os.path.html#os.path.join
-# Windows not supported
-function(join_paths joined_path first_path_segment)
-    set(temp_path "${first_path_segment}")
-    foreach(current_segment IN LISTS ARGN)
-        if(NOT ("${current_segment}" STREQUAL ""))
-            if(IS_ABSOLUTE "${current_segment}")
-                set(temp_path "${current_segment}")
-            else()
-                set(temp_path "${temp_path}/${current_segment}")
-            endif()
-        endif()
-    endforeach()
-    set(${joined_path} "${temp_path}" PARENT_SCOPE)
-endfunction()
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 72cd9c32e..c574a8579 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -15,6 +15,7 @@ INPUT             = ../include/xsimd/types/xsimd_api.hpp \
                     ../include/xsimd/types/xsimd_avx512cd_register.hpp \
                     ../include/xsimd/types/xsimd_avx512dq_register.hpp \
                     ../include/xsimd/types/xsimd_avx512f_register.hpp \
+                    ../include/xsimd/types/xsimd_avx512vl_register.hpp \
                     ../include/xsimd/types/xsimd_avx_register.hpp \
                     ../include/xsimd/types/xsimd_fma3_avx_register.hpp \
                     ../include/xsimd/types/xsimd_fma3_avx2_register.hpp \
diff --git a/docs/source/api/arithmetic_index.rst b/docs/source/api/arithmetic_index.rst
index 429600cb3..d4f5deb19 100644
--- a/docs/source/api/arithmetic_index.rst
+++ b/docs/source/api/arithmetic_index.rst
@@ -40,6 +40,12 @@ Binary operations:
 +---------------------------------------+----------------------------------------------------+
 | :cpp:func:`mul`                       | per slot multiply                                  |
 +---------------------------------------+----------------------------------------------------+
+| :cpp:func:`mul_lo`                    | low N bits of the 2N-bit integer product           |
++---------------------------------------+----------------------------------------------------+
+| :cpp:func:`mul_hi`                    | high N bits of the 2N-bit integer product          |
++---------------------------------------+----------------------------------------------------+
+| :cpp:func:`mul_hilo`                  | pair {hi, lo} of the 2N-bit integer product        |
++---------------------------------------+----------------------------------------------------+
 | :cpp:func:`div`                       | per slot division                                  |
 +---------------------------------------+----------------------------------------------------+
 | :cpp:func:`mod`                       | per slot modulo                                    |
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d26751525..464892f87 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -9,16 +9,14 @@
 # The full license is in the file LICENSE, distributed with this software. #
 ############################################################################
 
-cmake_minimum_required(VERSION 3.10)
+cmake_minimum_required(VERSION 3.13)
+
 if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
     project(xsimd-examples)
 
     find_package(xsimd REQUIRED CONFIG)
-    set(XSIMD_INCLUDE_DIR ${xsimd_INCLUDE_DIR})
 endif ()
 
-include_directories(${XSIMD_INCLUDE_DIR})
-
 if(NOT CMAKE_BUILD_TYPE)
     message(STATUS "Setting examples build type to Release")
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
@@ -35,7 +33,8 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU"
     endif()
 endif()
 
-add_executable(mandelbrot mandelbrot.cpp ${XSIMD_HEADERS})
+add_executable(mandelbrot mandelbrot.cpp)
+target_link_libraries(mandelbrot PRIVATE xsimd)
 set_property(TARGET mandelbrot PROPERTY CXX_STANDARD 14)
 if(ENABLE_XTL_COMPLEX)
     target_link_libraries(mandelbrot PRIVATE xtl)
diff --git a/examples/mandelbrot.cpp b/examples/mandelbrot.cpp
index 9a0d80e7a..0e001689f 100644
--- a/examples/mandelbrot.cpp
+++ b/examples/mandelbrot.cpp
@@ -13,15 +13,15 @@
 // https://github.com/ospray/tsimd/blob/master/benchmarks/mandelbrot.cpp
 // Author Jefferson Amstutz / intel
 
+#include "pico_bench.hpp"
+
+#include <xsimd/xsimd.hpp>
+
 #include <cstdio>
 #include <iostream>
 #include <string>
 #include <vector>
 
-#include "pico_bench.hpp"
-
-#include <xsimd/xsimd.hpp>
-
 // helper function to write the rendered image as PPM file
 inline void writePPM(const std::string& fileName,
                      const int sizeX,
diff --git a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
index ff2fb4118..27b5ef24f 100644
--- a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
+++ b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
@@ -12,12 +12,13 @@
 #ifndef XSIMD_COMMON_ARITHMETIC_HPP
 #define XSIMD_COMMON_ARITHMETIC_HPP
 
+#include "../../types/xsimd_batch_constant.hpp"
+#include "./xsimd_common_details.hpp"
+
 #include <complex>
 #include <limits>
 #include <type_traits>
-
-#include "../../types/xsimd_batch_constant.hpp"
-#include "./xsimd_common_details.hpp"
+#include <utility>
 
 namespace xsimd
 {
@@ -177,6 +178,122 @@ namespace xsimd
                                  self, other);
         }
 
+        // mul_hi
+        namespace detail
+        {
+            // High half of x * y for lanes narrower than 64 bits: widen, multiply, shift right.
+            // N only keys the dispatch: every 8-byte lane type selects the specialization below.
+            template <class T, std::size_t N = sizeof(T)>
+            struct mulhi_helper
+            {
+                using wider = std::conditional_t<
+                    std::is_signed<T>::value,
+                    std::conditional_t<N == 1, int16_t, std::conditional_t<N == 2, int32_t, int64_t>>,
+                    std::conditional_t<N == 1, uint16_t, std::conditional_t<N == 2, uint32_t, uint64_t>>>;
+
+                static XSIMD_INLINE T compute(T x, T y) noexcept
+                {
+                    constexpr int shift = 8 * sizeof(T);
+                    return static_cast<T>((static_cast<wider>(x) * static_cast<wider>(y)) >> shift);
+                }
+            };
+
+            // High 64 bits of the unsigned 128-bit product (32-bit limb splits without __int128).
+            XSIMD_INLINE uint64_t mulhi_u64(uint64_t x, uint64_t y) noexcept
+            {
+#if defined(__SIZEOF_INT128__)
+                return static_cast<uint64_t>((static_cast<unsigned __int128>(x) * static_cast<unsigned __int128>(y)) >> 64);
+#else
+                uint64_t xl = x & 0xffffffffULL;
+                uint64_t xh = x >> 32;
+                uint64_t yl = y & 0xffffffffULL;
+                uint64_t yh = y >> 32;
+                uint64_t ll = xl * yl;
+                uint64_t lh = xl * yh;
+                uint64_t hl = xh * yl;
+                uint64_t hh = xh * yh;
+                uint64_t mid = (ll >> 32) + (lh & 0xffffffffULL) + (hl & 0xffffffffULL);
+                return hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
+#endif
+            }
+
+            // Signed high 64 bits; the fallback corrects the unsigned result by y when x < 0 and by x when y < 0.
+            XSIMD_INLINE int64_t mulhi_i64(int64_t x, int64_t y) noexcept
+            {
+#if defined(__SIZEOF_INT128__)
+                return static_cast<int64_t>((static_cast<__int128>(x) * static_cast<__int128>(y)) >> 64);
+#else
+                uint64_t uhi = mulhi_u64(static_cast<uint64_t>(x), static_cast<uint64_t>(y));
+                if (x < 0)
+                    uhi -= static_cast<uint64_t>(y);
+                if (y < 0)
+                    uhi -= static_cast<uint64_t>(x);
+                return static_cast<int64_t>(uhi);
+#endif
+            }
+
+            // 8-byte lanes: chosen by size, not exact type, so long and long long never reach the 64-bit shift above.
+            template <class T>
+            struct mulhi_helper<T, 8>
+            {
+                static XSIMD_INLINE T compute(T x, T y) noexcept
+                {
+                    return std::is_signed<T>::value ? static_cast<T>(mulhi_i64(static_cast<int64_t>(x), static_cast<int64_t>(y)))
+                                                    : static_cast<T>(mulhi_u64(static_cast<uint64_t>(x), static_cast<uint64_t>(y)));
+                }
+            };
+
+            // Unsigned 64x64 -> high 64 bits from four 32x32->64 partial products; the caller
+            // supplies the widening multiply as `mul_epu32` (e.g. a wrapper over _mm*_mul_epu32).
+            template <class A, class WMul>
+            XSIMD_INLINE batch<uint64_t, A> mulhi_u64_core(batch<uint64_t, A> const& x,
+                                                           batch<uint64_t, A> const& y,
+                                                           WMul mul_epu32) noexcept
+            {
+                using vec = batch<uint64_t, A>;
+                const vec low32(uint64_t(0xffffffffULL));
+                vec x_lo = x & low32, x_hi = x >> 32;
+                vec y_lo = y & low32, y_hi = y >> 32;
+                vec lo_lo = mul_epu32(x_lo, y_lo);
+                vec lo_hi = mul_epu32(x_lo, y_hi);
+                vec hi_lo = mul_epu32(x_hi, y_lo);
+                vec hi_hi = mul_epu32(x_hi, y_hi);
+                // Carry into the high word: sum the three pieces that land in bits 32..63.
+                vec carry = ((lo_lo >> 32) + (lo_hi & low32) + (hi_lo & low32)) >> 32;
+                return hi_hi + (lo_hi >> 32) + (hi_lo >> 32) + carry;
+            }
+
+            // Signed high half: unsigned core on the raw bits, minus y where x < 0 and minus x where y < 0.
+            template <class A, class WMul>
+            XSIMD_INLINE batch<int64_t, A> mulhi_i64_core(batch<int64_t, A> const& x,
+                                                          batch<int64_t, A> const& y,
+                                                          WMul mul_epu32) noexcept
+            {
+                const auto x_bits = ::xsimd::bitwise_cast<uint64_t>(x);
+                const auto y_bits = ::xsimd::bitwise_cast<uint64_t>(y);
+                const auto x_neg = ::xsimd::bitwise_cast<uint64_t>(x >> 63); // all-ones where x < 0, assuming >> is arithmetic here
+                const auto y_neg = ::xsimd::bitwise_cast<uint64_t>(y >> 63);
+                const auto fixup = (y_bits & x_neg) + (x_bits & y_neg);
+                return ::xsimd::bitwise_cast<int64_t>(mulhi_u64_core<A>(x_bits, y_bits, mul_epu32) - fixup);
+            }
+        }
+
+        // Generic per-lane fallback: each lane's high half comes from detail::mulhi_helper<T>.
+        template <class A, class T, class /*=std::enable_if_t<std::is_integral<T>::value>*/>
+        XSIMD_INLINE batch<T, A> mul_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept
+        {
+            const auto high_half = [](T lhs, T rhs) noexcept -> T { return detail::mulhi_helper<T>::compute(lhs, rhs); };
+            return detail::apply(high_half, self, other);
+        }
+
+        // mul_hilo: pair whose first member is mul_hi(self, other) and whose second is self * other.
+        template <class A, class T, class /*=std::enable_if_t<std::is_integral<T>::value>*/>
+        XSIMD_INLINE std::pair<batch<T, A>, batch<T, A>>
+        mul_hilo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept
+        {
+            return std::make_pair(mul_hi<A>(self, other, A {}), self * other);
+        }
+
         // rotl
         template <class A, class T, class STy>
         XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, STy other, requires_arch<common>) noexcept
diff --git a/include/xsimd/arch/common/xsimd_common_bit.hpp b/include/xsimd/arch/common/xsimd_common_bit.hpp
new file mode 100644
index 000000000..5cd99c1cf
--- /dev/null
+++ b/include/xsimd/arch/common/xsimd_common_bit.hpp
@@ -0,0 +1,232 @@
+/****************************************************************
+ * Partial backport of `__cpp_lib_bitops == 201907L` from C++20 *
+ ****************************************************************/
+
+#ifndef XSIMD_BIT_HPP
+#define XSIMD_BIT_HPP
+
+#include "../../config/xsimd_config.hpp"
+
+#if XSIMD_CPP_VERSION > 202002L
+
+#include <version>
+
+#if __cpp_lib_bitops >= 201907L
+
+#include <bit>
+
+namespace xsimd
+{
+    namespace detail
+    {
+        using std::countl_one;
+        using std::countl_zero;
+        using std::countr_one;
+        using std::countr_zero;
+        using std::popcount;
+    }
+}
+
+#endif
+
+#else
+
+#include <climits>
+#include <type_traits>
+
+#ifdef __has_builtin
+#define XSIMD_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define XSIMD_HAS_BUILTIN(x) 0
+#endif
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+namespace xsimd
+{
+    namespace detail
+    {
+        // FIXME: We could do better by dispatching to the appropriate popcount instruction
+        // depending on the arch.
+
+        template <class T, class = std::enable_if_t<std::is_unsigned<T>::value>>
+        XSIMD_INLINE int popcount(T x) noexcept
+        {
+#if XSIMD_HAS_BUILTIN(__builtin_popcountg)
+            return __builtin_popcountg(x);
+#else
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+#if XSIMD_HAS_BUILTIN(__builtin_popcount)
+                return __builtin_popcount(x);
+#elif defined(_MSC_VER)
+                return __popcnt(x);
+#else
+                // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64
+                return ((uint64_t)x * 0x200040008001ULL & 0x111111111111111ULL) % 0xf;
+#endif
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+#if XSIMD_HAS_BUILTIN(__builtin_popcount)
+                return __builtin_popcount(x);
+#elif defined(_MSC_VER)
+                return __popcnt16(x);
+#else
+                // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64
+                constexpr unsigned long long msb12 = 0x1001001001001ULL;
+                constexpr unsigned long long mask5 = 0x84210842108421ULL;
+
+                unsigned int v = (unsigned int)x;
+
+                return ((v & 0xfff) * msb12 & mask5) % 0x1f
+                    + (((v & 0xfff000) >> 12) * msb12 & mask5) % 0x1f;
+#endif
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+#if XSIMD_HAS_BUILTIN(__builtin_popcount)
+                return __builtin_popcount(x);
+#elif defined(_MSC_VER)
+                return __popcnt(x);
+#else
+                // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+                x = x - ((x >> 1) & (T) ~(T)0 / 3);
+                x = (x & (T) ~(T)0 / 15 * 3) + ((x >> 2) & (T) ~(T)0 / 15 * 3);
+                x = (x + (x >> 4)) & (T) ~(T)0 / 255 * 15;
+                return (x * ((T) ~(T)0 / 255)) >> (sizeof(T) - 1) * CHAR_BIT;
+#endif
+            }
+            else
+            {
+                // sizeof(T) == 8
+#if XSIMD_HAS_BUILTIN(__builtin_popcountll)
+                return __builtin_popcountll(x);
+#elif XSIMD_HAS_BUILTIN(__builtin_popcount)
+                return __builtin_popcount((unsigned int)x) + __builtin_popcount((unsigned int)(x >> 32));
+#elif defined(_MSC_VER)
+#ifdef _M_X64
+                return (int)__popcnt64(x);
+#else
+                return (int)(__popcnt((unsigned int)x) + __popcnt((unsigned int)(x >> 32)));
+#endif
+#else
+                // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+                x = x - ((x >> 1) & (T) ~(T)0 / 3);
+                x = (x & (T) ~(T)0 / 15 * 3) + ((x >> 2) & (T) ~(T)0 / 15 * 3);
+                x = (x + (x >> 4)) & (T) ~(T)0 / 255 * 15;
+                return (x * ((T) ~(T)0 / 255)) >> (sizeof(T) - 1) * CHAR_BIT;
+#endif
+            }
+#endif
+        }
+
+        template <class T, class = std::enable_if_t<std::is_unsigned<T>::value>>
+        XSIMD_INLINE int countl_zero(T x) noexcept
+        {
+#if XSIMD_HAS_BUILTIN(__builtin_clzg)
+            return __builtin_clzg(x, (int)(sizeof(T) * CHAR_BIT));
+#else
+            if (x == 0)
+                return sizeof(T) * CHAR_BIT;
+
+            XSIMD_IF_CONSTEXPR(sizeof(T) <= 4)
+            {
+#if XSIMD_HAS_BUILTIN(__builtin_clz)
+                return __builtin_clz((unsigned int)x) - (4 - sizeof(T)) * CHAR_BIT;
+#elif defined(_MSC_VER)
+                unsigned long index;
+                _BitScanReverse(&index, (unsigned long)x);
+                return sizeof(T) * CHAR_BIT - index - 1;
+#else
+                x |= x >> 1;
+                x |= x >> 2;
+                x |= x >> 4;
+                XSIMD_IF_CONSTEXPR(sizeof(T) >= 2)
+                {
+                    x |= x >> 8;
+                }
+                XSIMD_IF_CONSTEXPR(sizeof(T) >= 4)
+                {
+                    x |= x >> 16;
+                }
+                return sizeof(T) * CHAR_BIT - popcount(x);
+#endif
+            }
+            else
+            {
+                // sizeof(T) == 8
+#if XSIMD_HAS_BUILTIN(__builtin_clzll)
+                return __builtin_clzll((unsigned long long)x);
+#elif defined(_MSC_VER) && defined(_M_X64)
+                unsigned long index;
+                _BitScanReverse64(&index, (unsigned long long)x);
+                return sizeof(T) * CHAR_BIT - index - 1;
+#else
+                x |= x >> 1;
+                x |= x >> 2;
+                x |= x >> 4;
+                x |= x >> 8;
+                x |= x >> 16;
+                x |= x >> 32;
+                return sizeof(T) * CHAR_BIT - popcount(x);
+#endif
+            }
+#endif
+        }
+
+        template <class T, class = std::enable_if_t<std::is_unsigned<T>::value>>
+        XSIMD_INLINE int countl_one(T x) noexcept
+        {
+            return countl_zero(T(~x)); // leading ones of x are the leading zeros of ~x (cast back to T after promotion)
+        }
+
+        template <class T, class = std::enable_if_t<std::is_unsigned<T>::value>>
+        XSIMD_INLINE int countr_zero(T x) noexcept
+        {
+#if XSIMD_HAS_BUILTIN(__builtin_ctzg)
+            return __builtin_ctzg(x, (int)(sizeof(T) * CHAR_BIT));
+#else
+            if (x == 0)
+                return sizeof(T) * CHAR_BIT;
+
+            XSIMD_IF_CONSTEXPR(sizeof(T) <= 4)
+            {
+#if XSIMD_HAS_BUILTIN(__builtin_ctz)
+                return __builtin_ctz((unsigned int)x);
+#elif defined(_MSC_VER)
+                unsigned long index;
+                _BitScanForward(&index, (unsigned long)x);
+                return index;
+#endif
+            }
+            else
+            {
+                // sizeof(T) == 8
+#if XSIMD_HAS_BUILTIN(__builtin_ctzll)
+                return __builtin_ctzll((unsigned long long)x);
+#elif defined(_MSC_VER) && defined(_M_X64)
+                unsigned long index;
+                _BitScanForward64(&index, (unsigned long long)x);
+                return index;
+#endif
+            }
+
+            // Fallback: (x & -x) isolates the lowest set bit (cf. bithacks #ZerosOnRightMultLookup); minus one, it masks the trailing zeros.
+            return popcount((T)((x & -x) - 1));
+#endif
+        }
+
+        template <class T, class = std::enable_if_t<std::is_unsigned<T>::value>>
+        XSIMD_INLINE int countr_one(T x) noexcept
+        {
+            return countr_zero(T(~x)); // trailing ones of x are the trailing zeros of ~x (cast back to T after promotion)
+        }
+
+    }
+}
+
+#endif
+#endif
diff --git a/include/xsimd/arch/common/xsimd_common_cast.hpp b/include/xsimd/arch/common/xsimd_common_cast.hpp
index 1226c887c..95753babd 100644
--- a/include/xsimd/arch/common/xsimd_common_cast.hpp
+++ b/include/xsimd/arch/common/xsimd_common_cast.hpp
@@ -12,7 +12,10 @@
 #ifndef XSIMD_COMMON_CAST_HPP
 #define XSIMD_COMMON_CAST_HPP
 
-#include "../../types/xsimd_traits.hpp"
+#include "../../config/xsimd_macros.hpp"
+#include "../../utils/xsimd_type_traits.hpp"
+
+#include <array>
 
 namespace xsimd
 {
diff --git a/include/xsimd/arch/common/xsimd_common_complex.hpp b/include/xsimd/arch/common/xsimd_common_complex.hpp
index 874825182..cb25535e1 100644
--- a/include/xsimd/arch/common/xsimd_common_complex.hpp
+++ b/include/xsimd/arch/common/xsimd_common_complex.hpp
@@ -12,10 +12,10 @@
 #ifndef XSIMD_COMMON_COMPLEX_HPP
 #define XSIMD_COMMON_COMPLEX_HPP
 
-#include <complex>
-
 #include "./xsimd_common_details.hpp"
 
+#include <complex>
+
 namespace xsimd
 {
 
diff --git a/include/xsimd/arch/common/xsimd_common_details.hpp b/include/xsimd/arch/common/xsimd_common_details.hpp
index efe01806b..a99f19319 100644
--- a/include/xsimd/arch/common/xsimd_common_details.hpp
+++ b/include/xsimd/arch/common/xsimd_common_details.hpp
@@ -12,13 +12,13 @@
 #ifndef XSIMD_COMMON_DETAILS_HPP
 #define XSIMD_COMMON_DETAILS_HPP
 
-#include <complex>
-
 #include "../../math/xsimd_rem_pio2.hpp"
 #include "../../types/xsimd_common_arch.hpp"
 #include "../../types/xsimd_utils.hpp"
 #include "../xsimd_constants.hpp"
 
+#include <complex>
+
 namespace xsimd
 {
     // Forward declaration. Should we put them in a separate file?
@@ -111,6 +111,69 @@ namespace xsimd
 
         namespace detail
         {
+            // Prevent -ffast-math from reassociating floating-point
+            // arithmetic across this point.  The reason string
+            // documents *why* at each call site; unused at runtime.
+            //
+            // Zero-cost register constraints per target:
+            //   x86  "+x"  (XMM/YMM/ZMM, also scalar float/double)
+            //   ARM  "+w"  (V-reg / SVE Z-reg, also scalar float/double)
+            //   PPC  "+wa" (VS register, also scalar float/double)
+            //   RISC-V "+f" (F/D register, scalar float/double)
+            //   RISC-V RVV "+vr" (V register; GCC 15+ / Clang 20+)
+            //
+            // On unknown targets the "+m" fallback spills; it is
+            // only emitted when the compiler can actually reassociate.
+            template <class T>
+            XSIMD_INLINE void reassociation_barrier(T& x, const char*) noexcept
+            {
+#if XSIMD_REASSOCIATIVE_MATH && XSIMD_WITH_INLINE_ASM && !defined(__EMSCRIPTEN__)
+#if XSIMD_WITH_SSE2
+                __asm__ volatile("" : "+x"(x));
+#elif XSIMD_WITH_NEON || XSIMD_WITH_SVE
+                __asm__ volatile("" : "+w"(x));
+#elif XSIMD_WITH_VSX
+                __asm__ volatile("" : "+wa"(x));
+#else
+                __asm__ volatile("" : "+m"(x));
+#endif
+#else
+                (void)x;
+#endif
+            }
+
+            // RISC-V scalar float/double: use F/D registers instead of
+            // spilling through "+m".  These overloads also serve
+            // emulated batches on RISC-V via the std::array overload.
+#if XSIMD_REASSOCIATIVE_MATH && XSIMD_WITH_INLINE_ASM && defined(__riscv)
+            XSIMD_INLINE void reassociation_barrier(float& x, const char*) noexcept
+            {
+                __asm__ volatile("" : "+f"(x));
+            }
+            XSIMD_INLINE void reassociation_barrier(double& x, const char*) noexcept
+            {
+                __asm__ volatile("" : "+f"(x));
+            }
+#endif
+
+            template <class T, size_t N>
+            XSIMD_INLINE void reassociation_barrier(std::array<T, N>& arr, const char* reason) noexcept
+            {
+                for (auto& v : arr)
+                    reassociation_barrier(v, reason);
+            }
+
+            template <class T, class A>
+            XSIMD_INLINE void reassociation_barrier(batch<T, A>& b, const char* reason) noexcept
+            {
+#if XSIMD_REASSOCIATIVE_MATH && XSIMD_WITH_RVV && XSIMD_WITH_INLINE_ASM && ((__GNUC__ >= 15) || (__clang_major__ >= 20))
+                __asm__ volatile("" : "+vr"(b.data.value.value));
+                (void)reason;
+#else
+                reassociation_barrier(b.data, reason);
+#endif
+            }
+
             template <class F, class A, class T, class... Batches>
             XSIMD_INLINE batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
             {
diff --git a/include/xsimd/arch/common/xsimd_common_logical.hpp b/include/xsimd/arch/common/xsimd_common_logical.hpp
index 3716f6282..6ee5218aa 100644
--- a/include/xsimd/arch/common/xsimd_common_logical.hpp
+++ b/include/xsimd/arch/common/xsimd_common_logical.hpp
@@ -12,6 +12,7 @@
 #ifndef XSIMD_COMMON_LOGICAL_HPP
 #define XSIMD_COMMON_LOGICAL_HPP
 
+#include "./xsimd_common_bit.hpp"
 #include "./xsimd_common_details.hpp"
 
 #include <climits>
@@ -28,43 +29,37 @@ namespace xsimd
         template <class A, class T>
         XSIMD_INLINE size_t count(batch_bool<T, A> const& self, requires_arch<common>) noexcept
         {
-            uint64_t m = self.mask();
-            XSIMD_IF_CONSTEXPR(batch_bool<T, A>::size < 14)
-            {
-                // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64
-                return (m * 0x200040008001ULL & 0x111111111111111ULL) % 0xf;
-            }
-            else
-            {
-#if defined __has_builtin
-#if __has_builtin(__builtin_popcountg)
-#define builtin_popcount(v) __builtin_popcountg(v)
-#endif
-#endif
+            return xsimd::detail::popcount(self.mask());
+        }
 
-#ifdef builtin_popcount
-                return builtin_popcount(m);
-#else
-                // FIXME: we could do better by dispatching to the appropriate
-                // popcount instruction depending on the arch...
-                XSIMD_IF_CONSTEXPR(batch_bool<T, A>::size <= 32)
-                {
-                    uint32_t m32 = static_cast<uint32_t>(m);
-                    // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-                    m32 = m32 - ((m32 >> 1) & 0x55555555); // reuse input as temporary
-                    m32 = (m32 & 0x33333333) + ((m32 >> 2) & 0x33333333); // temp
-                    return (((m32 + (m32 >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; // count
-                }
-                else
-                {
-                    // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-                    m = m - ((m >> 1) & (uint64_t) ~(uint64_t)0 / 3); // temp
-                    m = (m & (uint64_t) ~(uint64_t)0 / 15 * 3) + ((m >> 2) & (uint64_t) ~(uint64_t)0 / 15 * 3); // temp
-                    m = (m + (m >> 4)) & (uint64_t) ~(uint64_t)0 / 255 * 15; // temp
-                    return (m * ((uint64_t) ~(uint64_t)0 / 255)) >> (sizeof(uint64_t) - 1) * CHAR_BIT; // count
-                }
-#endif
-            }
+        template <class A, class T>
+        XSIMD_INLINE size_t countl_zero(batch_bool<T, A> const& self, requires_arch<common>) noexcept
+        {
+            constexpr size_t unused_bits = 64 - batch_bool<T, A>::size;
+            constexpr uint64_t lower_mask = batch_bool<T, A>::size < 64 ? ((uint64_t)1 << (batch_bool<T, A>::size % 64)) - 1 : (uint64_t)-1;
+            return xsimd::detail::countl_zero(self.mask() & lower_mask) - unused_bits;
+        }
+
+        template <class A, class T>
+        XSIMD_INLINE size_t countl_one(batch_bool<T, A> const& self, requires_arch<common>) noexcept
+        {
+            constexpr size_t unused_bits = 64 - batch_bool<T, A>::size;
+            constexpr uint64_t upper_mask = batch_bool<T, A>::size < 64 ? ~(((uint64_t)1 << (batch_bool<T, A>::size % 64)) - 1) : (uint64_t)0;
+            return xsimd::detail::countl_one(self.mask() | upper_mask) - unused_bits;
+        }
+
+        template <class A, class T>
+        XSIMD_INLINE size_t countr_zero(batch_bool<T, A> const& self, requires_arch<common>) noexcept
+        {
+            constexpr uint64_t stop = batch_bool<T, A>::size < 64 ? (uint64_t)1 << (batch_bool<T, A>::size % 64) : 0;
+            return xsimd::detail::countr_zero(self.mask() | stop);
+        }
+
+        template <class A, class T>
+        XSIMD_INLINE size_t countr_one(batch_bool<T, A> const& self, requires_arch<common>) noexcept
+        {
+            constexpr uint64_t stop = batch_bool<T, A>::size < 64 ? ~((uint64_t)1 << (batch_bool<T, A>::size % 64)) : (uint64_t)-1;
+            return xsimd::detail::countr_one(self.mask() & stop);
         }
 
         // from  mask
diff --git a/include/xsimd/arch/common/xsimd_common_math.hpp b/include/xsimd/arch/common/xsimd_common_math.hpp
index f84883405..7dbb0ac05 100644
--- a/include/xsimd/arch/common/xsimd_common_math.hpp
+++ b/include/xsimd/arch/common/xsimd_common_math.hpp
@@ -743,7 +743,9 @@ namespace xsimd
                 static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept
                 {
                     batch_type k = nearbyint(constants::invlog_2<batch_type>() * a);
+                    detail::reassociation_barrier(k, "compensated exp range reduction");
                     x = fnma(k, constants::log_2hi<batch_type>(), a);
+                    detail::reassociation_barrier(x, "compensated exp range reduction");
                     x = fnma(k, constants::log_2lo<batch_type>(), x);
                     return k;
                 }
@@ -769,7 +771,9 @@ namespace xsimd
                 static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept
                 {
                     batch_type k = nearbyint(constants::invlog10_2<batch_type>() * a);
+                    detail::reassociation_barrier(k, "compensated exp10 range reduction");
                     x = fnma(k, constants::log10_2hi<batch_type>(), a);
+                    detail::reassociation_barrier(x, "compensated exp10 range reduction");
                     x -= k * constants::log10_2lo<batch_type>();
                     return k;
                 }
@@ -794,6 +798,7 @@ namespace xsimd
                 static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept
                 {
                     batch_type k = nearbyint(a);
+                    detail::reassociation_barrier(k, "compensated exp2 range reduction");
                     x = (a - k);
                     return k;
                 }
@@ -819,7 +824,9 @@ namespace xsimd
                 static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& hi, batch_type& lo, batch_type& x) noexcept
                 {
                     batch_type k = nearbyint(constants::invlog_2<batch_type>() * a);
+                    detail::reassociation_barrier(k, "compensated exp range reduction");
                     hi = fnma(k, constants::log_2hi<batch_type>(), a);
+                    detail::reassociation_barrier(hi, "compensated exp range reduction");
                     lo = k * constants::log_2lo<batch_type>();
                     x = hi - lo;
                     return k;
@@ -846,7 +853,9 @@ namespace xsimd
                 static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept
                 {
                     batch_type k = nearbyint(constants::invlog10_2<batch_type>() * a);
+                    detail::reassociation_barrier(k, "compensated exp10 range reduction");
                     x = fnma(k, constants::log10_2hi<batch_type>(), a);
+                    detail::reassociation_barrier(x, "compensated exp10 range reduction");
                     x = fnma(k, constants::log10_2lo<batch_type>(), x);
                     return k;
                 }
@@ -878,6 +887,7 @@ namespace xsimd
                 {
                     batch_type k = nearbyint(a);
                     x = (a - k) * constants::log_2<batch_type>();
+                    detail::reassociation_barrier(x, "keep reduced exponent ordered before finalize");
                     return k;
                 }
 
@@ -937,7 +947,10 @@ namespace xsimd
         template <class A, class T>
         XSIMD_INLINE batch<T, A> exp10(batch<T, A> const& self, requires_arch<common>) noexcept
         {
-            return detail::exp<detail::exp10_tag>(self);
+            using batch_type = batch<T, A>;
+            batch_type out = detail::exp<detail::exp10_tag>(self);
+            detail::reassociation_barrier(out, "prevent folding exp10 for literal inputs");
+            return out;
         }
 
         // exp2
@@ -1494,6 +1507,7 @@ namespace xsimd
             batch_type R = t2 + t1;
             batch_type hfsq = batch_type(0.5) * f * f;
             batch_type dk = to_float(k);
+            detail::reassociation_barrier(dk, "keep compensated k conversion before split log(2) scaling");
             batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, (hfsq + R), dk * constants::log_2lo<batch_type>()) - hfsq + f);
 #ifdef __FAST_MATH__
             return r;
@@ -1525,6 +1539,7 @@ namespace xsimd
             hx += 0x3ff00000 - 0x3fe6a09e;
             k += (hx >> 20) - 0x3ff;
             batch_type dk = to_float(k);
+            detail::reassociation_barrier(dk, "keep compensated k conversion before split log(2) scaling");
             hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
             x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
 
@@ -1584,6 +1599,7 @@ namespace xsimd
             batch_type R = t1 + t2;
             batch_type hfsq = batch_type(0.5) * f * f;
             batch_type dk = to_float(k);
+            detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion");
             batch_type r = fma(fms(s, hfsq + R, hfsq) + f, constants::invlog_2<batch_type>(), dk);
 #ifdef __FAST_MATH__
             return r;
@@ -1629,7 +1645,9 @@ namespace xsimd
             batch_type val_hi = hi * constants::invlog_2hi<batch_type>();
             batch_type val_lo = fma(lo + hi, constants::invlog_2lo<batch_type>(), lo * constants::invlog_2hi<batch_type>());
             batch_type dk = to_float(k);
+            detail::reassociation_barrier(dk, "Kahan compensated log2 summation");
             batch_type w1 = dk + val_hi;
+            detail::reassociation_barrier(w1, "Kahan compensated log2 summation");
             val_lo += (dk - w1) + val_hi;
             val_hi = w1;
             batch_type r = val_lo + val_hi;
@@ -1705,6 +1723,7 @@ namespace xsimd
             batch_type t2 = z * detail::horner<batch_type, 0x3f2aaaaa, 0x3e91e9ee>(w);
             batch_type R = t2 + t1;
             batch_type dk = to_float(k);
+            detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion");
             batch_type hfsq = batch_type(0.5) * f * f;
             batch_type hibits = f - hfsq;
             hibits &= ::xsimd::bitwise_cast<float>(i_type(0xfffff000));
@@ -1752,10 +1771,11 @@ namespace xsimd
 #endif
             hx += 0x3ff00000 - 0x3fe6a09e;
             k += (hx >> 20) - 0x3ff;
+            batch_type dk = to_float(k);
+            detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion");
             hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
             x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
             batch_type f = --x;
-            batch_type dk = to_float(k);
             batch_type s = f / (batch_type(2.) + f);
             batch_type z = s * s;
             batch_type w = z * z;
@@ -1818,6 +1838,7 @@ namespace xsimd
             batch_type R = t2 + t1;
             batch_type hfsq = batch_type(0.5) * f * f;
             batch_type dk = to_float(k);
+            detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion");
             /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */
             batch_type c = select(batch_bool_cast<float>(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf;
             batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, (hfsq + R), dk * constants::log_2lo<batch_type>() + c) - hfsq + f);
@@ -1853,6 +1874,7 @@ namespace xsimd
             batch_type t2 = z * detail::horner<batch_type, 0x3fe5555555555593ll, 0x3fd2492494229359ll, 0x3fc7466496cb03dell, 0x3fc2f112df3e5244ll>(w);
             batch_type R = t2 + t1;
             batch_type dk = to_float(k);
+            detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion");
             batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, hfsq + R, dk * constants::log_2lo<batch_type>() + c) - hfsq + f);
 #ifdef __FAST_MATH__
             return r;
@@ -1900,17 +1922,9 @@ namespace xsimd
                 batch_type s = bitofsign(self);
                 batch_type v = self ^ s;
                 batch_type t2n = constants::twotonmb<batch_type>();
-                // Under fast-math, reordering is possible and the compiler optimizes d
-                // to v. That's not what we want, so prevent compiler optimization here.
-                // FIXME: it may be better to emit a memory barrier here (?).
-#ifdef __FAST_MATH__
                 batch_type d0 = v + t2n;
-                asm volatile("" ::"r"(&d0) : "memory");
+                detail::reassociation_barrier(d0, "prevent collapsing (v + 2^n) - 2^n back to v");
                 batch_type d = d0 - t2n;
-#else
-                batch_type d0 = v + t2n;
-                batch_type d = d0 - t2n;
-#endif
                 return s ^ select(v < t2n, d, v);
             }
         }
@@ -2192,12 +2206,16 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch<float, A> remainder(batch<float, A> const& self, batch<float, A> const& other, requires_arch<common>) noexcept
         {
-            return fnma(nearbyint(self / other), other, self);
+            batch<float, A> q = nearbyint(self / other);
+            detail::reassociation_barrier(q, "prevent pulling multiply back through rounded quotient");
+            return fnma(q, other, self);
         }
         template <class A>
         XSIMD_INLINE batch<double, A> remainder(batch<double, A> const& self, batch<double, A> const& other, requires_arch<common>) noexcept
         {
-            return fnma(nearbyint(self / other), other, self);
+            batch<double, A> q = nearbyint(self / other);
+            detail::reassociation_barrier(q, "prevent pulling multiply back through rounded quotient");
+            return fnma(q, other, self);
         }
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> remainder(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept
diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp
index 6a301dd44..7a1ed73a3 100644
--- a/include/xsimd/arch/common/xsimd_common_memory.hpp
+++ b/include/xsimd/arch/common/xsimd_common_memory.hpp
@@ -12,13 +12,12 @@
 #ifndef XSIMD_COMMON_MEMORY_HPP
 #define XSIMD_COMMON_MEMORY_HPP
 
+#include "../../types/xsimd_batch_constant.hpp"
+#include "./xsimd_common_details.hpp"
+
 #include <algorithm>
 #include <array>
 #include <complex>
-#include <stdexcept>
-
-#include "../../types/xsimd_batch_constant.hpp"
-#include "./xsimd_common_details.hpp"
 
 namespace xsimd
 {
@@ -224,7 +223,8 @@ namespace xsimd
         template <class A, size_t I, class T>
         XSIMD_INLINE typename batch<std::complex<T>, A>::value_type get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<common>) noexcept
         {
-            alignas(A::alignment()) T buffer[batch<std::complex<T>, A>::size];
+            using value_type = typename batch<std::complex<T>, A>::value_type;
+            alignas(A::alignment()) value_type buffer[batch<std::complex<T>, A>::size];
             self.store_aligned(&buffer[0]);
             return buffer[I];
         }
@@ -292,6 +292,12 @@ namespace xsimd
             return load_unaligned(mem, b, A {});
         }
 
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> load_stream(bool const* mem, batch_bool<T, A> b, requires_arch<common>) noexcept
+        {
+            return load_aligned(mem, b, A {});
+        }
+
         // load_aligned
         namespace detail
         {
@@ -438,6 +444,12 @@ namespace xsimd
             store_masked<A>(reinterpret_cast<double*>(mem), bitwise_cast<double>(src), batch_bool_constant<double, A, Values...> {}, Mode {}, A {});
         }
 
+        template <class A, class T_in, class T_out>
+        XSIMD_INLINE batch<T_out, A> load_stream(T_in const* mem, convert<T_out> cvt, requires_arch<common>) noexcept
+        {
+            return load_aligned<A>(mem, cvt, A {});
+        }
+
         // rotate_right
         template <size_t N, class A, class T>
         XSIMD_INLINE batch<T, A> rotate_right(batch<T, A> const& self, requires_arch<common>) noexcept
@@ -679,6 +691,12 @@ namespace xsimd
                 mem[i] = bool(buffer[i]);
         }
 
+        template <class A, class T>
+        XSIMD_INLINE void store_stream(batch_bool<T, A> const& self, bool* mem, requires_arch<common>) noexcept
+        {
+            store(self, mem, A {});
+        }
+
         // store_aligned
         template <class A, class T_in, class T_out>
         XSIMD_INLINE void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<common>) noexcept
@@ -697,6 +715,12 @@ namespace xsimd
             return store_aligned<A>(mem, self, common {});
         }
 
+        template <class A, class T_in, class T_out>
+        XSIMD_INLINE void store_stream(T_out* mem, batch<T_in, A> const& self, requires_arch<common>) noexcept
+        {
+            store_aligned<A>(mem, self, A {});
+        }
+
         // swizzle
         template <class A, class T, class ITy, ITy... Vs>
         XSIMD_INLINE batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<ITy, A, Vs...> mask, requires_arch<common>) noexcept
@@ -778,6 +802,12 @@ namespace xsimd
             return detail::load_complex(hi, lo, A {});
         }
 
+        template <class A, class T_out, class T_in>
+        XSIMD_INLINE batch<std::complex<T_out>, A> load_complex_stream(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<common>) noexcept
+        {
+            return load_complex_aligned<A>(mem, kernel::convert<std::complex<T_out>> {}, A {});
+        }
+
         // store_complex_aligned
         template <class A, class T_out, class T_in>
         XSIMD_INLINE void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<common>) noexcept
@@ -802,6 +832,12 @@ namespace xsimd
             hi.store_unaligned(buffer + real_batch::size);
         }
 
+        template <class A, class T_out, class T_in>
+        XSIMD_INLINE void store_complex_stream(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<common>) noexcept
+        {
+            store_complex_aligned<A>(dst, src, A {});
+        }
+
         // transpose
         template <class A, class T>
         XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<common>) noexcept
diff --git a/include/xsimd/arch/common/xsimd_common_swizzle.hpp b/include/xsimd/arch/common/xsimd_common_swizzle.hpp
index 4af2225cd..326340f92 100644
--- a/include/xsimd/arch/common/xsimd_common_swizzle.hpp
+++ b/include/xsimd/arch/common/xsimd_common_swizzle.hpp
@@ -12,12 +12,12 @@
 #ifndef XSIMD_COMMON_SWIZZLE_HPP
 #define XSIMD_COMMON_SWIZZLE_HPP
 
+#include "../../config/xsimd_macros.hpp"
+
 #include <cstddef>
 #include <cstdint>
 #include <type_traits>
 
-#include "../../config/xsimd_inline.hpp"
-
 namespace xsimd
 {
     template <typename T, class A, T... Values>
diff --git a/include/xsimd/arch/common/xsimd_common_trigo.hpp b/include/xsimd/arch/common/xsimd_common_trigo.hpp
index 78c1ea30e..d85511d2e 100644
--- a/include/xsimd/arch/common/xsimd_common_trigo.hpp
+++ b/include/xsimd/arch/common/xsimd_common_trigo.hpp
@@ -551,33 +551,45 @@ namespace xsimd
                     {
                         auto test = x > constants::pio4<B>();
                         xr = x - constants::pio2_1<B>();
+                        detail::reassociation_barrier(xr, "ordered pio2 subtraction");
                         xr -= constants::pio2_2<B>();
+                        detail::reassociation_barrier(xr, "ordered pio2 subtraction");
                         xr -= constants::pio2_3<B>();
+                        detail::reassociation_barrier(xr, "ordered pio2 subtraction");
                         xr = select(test, xr, x);
                         return select(test, B(1.), B(0.));
                     }
                     else if (all(x <= constants::twentypi<B>()))
                     {
                         B xi = nearbyint(x * constants::twoopi<B>());
+                        detail::reassociation_barrier(xi, "preserve quadrant selection");
                         xr = fnma(xi, constants::pio2_1<B>(), x);
+                        detail::reassociation_barrier(xr, "compensated range reduction");
                         xr -= xi * constants::pio2_2<B>();
+                        detail::reassociation_barrier(xr, "compensated range reduction");
                         xr -= xi * constants::pio2_3<B>();
+                        detail::reassociation_barrier(xr, "compensated range reduction");
                         return quadrant(xi);
                     }
                     else if (all(x <= constants::mediumpi<B>()))
                     {
                         B fn = nearbyint(x * constants::twoopi<B>());
+                        detail::reassociation_barrier(fn, "multi-term range reduction");
                         B r = x - fn * constants::pio2_1<B>();
+                        detail::reassociation_barrier(r, "multi-term range reduction");
                         B w = fn * constants::pio2_1t<B>();
                         B t = r;
                         w = fn * constants::pio2_2<B>();
                         r = t - w;
+                        detail::reassociation_barrier(r, "multi-term range reduction");
                         w = fn * constants::pio2_2t<B>() - ((t - r) - w);
                         t = r;
                         w = fn * constants::pio2_3<B>();
                         r = t - w;
+                        detail::reassociation_barrier(r, "multi-term range reduction");
                         w = fn * constants::pio2_3t<B>() - ((t - r) - w);
                         xr = r - w;
+                        detail::reassociation_barrier(xr, "multi-term range reduction");
                         return quadrant(fn);
                     }
                     else
diff --git a/include/xsimd/arch/utils/shifts.hpp b/include/xsimd/arch/utils/shifts.hpp
new file mode 100644
index 000000000..719ecfb7a
--- /dev/null
+++ b/include/xsimd/arch/utils/shifts.hpp
@@ -0,0 +1,82 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ * Copyright (c) Marco Barbone                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_UTILS_SHIFTS_HPP
+#define XSIMD_UTILS_SHIFTS_HPP
+
+#include "../../config/xsimd_macros.hpp"
+#include "../../types/xsimd_batch.hpp"
+#include "../../types/xsimd_batch_constant.hpp"
+#include "../../types/xsimd_traits.hpp"
+
+namespace xsimd
+{
+    namespace kernel
+    {
+        namespace utils
+        {
+            template <typename I, I offset, I length, I... Vs>
+            struct select_stride
+            {
+                template <typename K>
+                static constexpr K get(K i, K)
+                {
+                    constexpr I values_array[] = { Vs... };
+                    return static_cast<K>(values_array[length * i + offset]);
+                }
+            };
+
+            template <typename I>
+            constexpr I lsb_mask(I bit_index)
+            {
+                if (bit_index == 8 * sizeof(I))
+                {
+                    return ~I { 0 };
+                }
+                return static_cast<I>((I { 1 } << bit_index) - I { 1 });
+            }
+
+            template <class T, class A, T V0, T... Vs>
+            constexpr bool all_equals(batch_constant<T, A, V0, Vs...> c)
+            {
+                return (c == std::integral_constant<T, V0> {}).all();
+            }
+
+            template <class T, class A, T... Vs>
+            XSIMD_INLINE batch<T, A> bitwise_lshift_as_twice_larger(
+                batch<T, A> const& self, batch_constant<T, A, Vs...>) noexcept
+            {
+                using T2 = widen_t<T>;
+
+                const auto self2 = bitwise_cast<T2>(self);
+
+                // Lower half of each widened lane: shift at twice the width, then mask off the bits that spilled into the upper half.
+                constexpr auto shifts_lo = make_batch_constant<T2, select_stride<T, 0, 2, Vs...>, A>();
+                constexpr auto mask_lo = lsb_mask<T2>(8 * sizeof(T));
+                const auto shifted_lo = bitwise_lshift(self2, shifts_lo);
+                constexpr auto batch_mask_lo = make_batch_constant<T2, mask_lo, A>();
+                const auto masked_lo = bitwise_and(shifted_lo, batch_mask_lo.as_batch());
+
+                // Upper half of each widened lane: first mask out the lower half so its bits cannot shift in, then shift at twice the width.
+                constexpr auto shifts_hi = make_batch_constant<T2, select_stride<T, 1, 2, Vs...>, A>();
+                constexpr auto mask_hi = mask_lo << (8 * sizeof(T));
+                constexpr auto batch_mask_hi = make_batch_constant<T2, mask_hi, A>();
+                const auto masked_hi = bitwise_and(self2, batch_mask_hi.as_batch());
+                const auto shifted_hi = bitwise_lshift(masked_hi, shifts_hi);
+
+                return bitwise_cast<T>(bitwise_or(masked_lo, shifted_hi));
+            }
+        }
+    }
+}
+
+#endif
diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
index 4af728e07..1ee0c5b89 100644
--- a/include/xsimd/arch/xsimd_avx.hpp
+++ b/include/xsimd/arch/xsimd_avx.hpp
@@ -13,13 +13,13 @@
 #ifndef XSIMD_AVX_HPP
 #define XSIMD_AVX_HPP
 
+#include "../types/xsimd_avx_register.hpp"
+#include "../types/xsimd_batch_constant.hpp"
+
 #include <complex>
 #include <limits>
 #include <type_traits>
 
-#include "../types/xsimd_avx_register.hpp"
-#include "../types/xsimd_batch_constant.hpp"
-
 namespace xsimd
 {
     namespace kernel
@@ -748,6 +748,80 @@ namespace xsimd
             return self - batch<T, A>(mask.data);
         }
 
+        // first (must precede get for two-phase lookup)
+        template <class A>
+        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_cvtss_f32(self);
+        }
+
+        template <class A>
+        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            return _mm256_cvtsd_f64(self);
+        }
+
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)) & 0xFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)) & 0xFFFF);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)));
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                batch<T, sse4_2> low = _mm256_castsi256_si128(self);
+                return first(low, sse4_2 {});
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
+        // get
+        template <class A, size_t I>
+        XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); }
+            constexpr size_t elements_per_lane = batch<float, sse4_1>::size;
+            constexpr size_t lane = I / elements_per_lane;
+            constexpr size_t sub_index = I % elements_per_lane;
+            const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
+            return kernel::get(batch<float, sse4_1>(half), ::xsimd::index<sub_index> {}, sse4_1 {});
+        }
+
+        template <class A, size_t I>
+        XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); }
+            constexpr size_t elements_per_lane = batch<double, sse4_1>::size;
+            constexpr size_t lane = I / elements_per_lane;
+            constexpr size_t sub_index = I % elements_per_lane;
+            const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
+            return kernel::get(batch<double, sse4_1>(half), ::xsimd::index<sub_index> {}, sse4_1 {});
+        }
+
+        template <class A, size_t I, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<avx>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); }
+            constexpr size_t elements_per_lane = batch<T, sse4_1>::size;
+            constexpr size_t lane = I / elements_per_lane;
+            constexpr size_t sub_index = I % elements_per_lane;
+            const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
+            return kernel::get(batch<T, sse4_1>(half), ::xsimd::index<sub_index> {}, sse4_1 {});
+        }
+
         // insert
         template <class A, class T, size_t I, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx>) noexcept
@@ -920,18 +994,18 @@ namespace xsimd
             using int_t = as_integer_t<T>;
             constexpr size_t half_size = batch<T, A>::size / 2;
 
-            // confined to lower 128-bit half → forward to SSE2
+            // confined to lower 128-bit half → forward to 128 bit
             XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
             {
                 constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(batch_bool_constant<int_t, A, Values...> {});
-                const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, sse4_2 {});
+                const auto lo = load_masked(reinterpret_cast<int_t const*>(mem), mlo, convert<int_t> {}, Mode {}, avx_128 {});
                 return bitwise_cast<T>(batch<int_t, A>(_mm256_zextsi128_si256(lo)));
             }
-            // confined to upper 128-bit half → forward to SSE2
+            // confined to upper 128-bit half → forward to 128 bit
             else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
             {
                 constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
-                const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, sse4_2 {});
+                const auto hi = load_masked(mem + half_size, mhi, convert<T> {}, Mode {}, avx_128 {});
                 return detail::zero_extend<A>(hi);
             }
             else
@@ -962,19 +1036,19 @@ namespace xsimd
         {
             constexpr size_t half_size = batch<T, A>::size / 2;
 
-            // confined to lower 128-bit half → forward to SSE2
+            // confined to lower 128-bit half → forward to 128 bit
             XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size)
             {
                 constexpr auto mlo = ::xsimd::detail::lower_half<sse4_2>(mask);
                 const auto lo = detail::lower_half(src);
-                store_masked<sse4_2>(mem, lo, mlo, Mode {}, sse4_2 {});
+                store_masked<avx_128>(mem, lo, mlo, Mode {}, sse4_2 {});
             }
-            // confined to upper 128-bit half → forward to SSE2
+            // confined to upper 128-bit half → forward to 128 bit
             else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size)
             {
                 constexpr auto mhi = ::xsimd::detail::upper_half<sse4_2>(mask);
                 const auto hi = detail::upper_half(src);
-                store_masked<sse4_2>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
+                store_masked<avx_128>(mem + half_size, hi, mhi, Mode {}, sse4_2 {});
             }
             else
             {
@@ -1515,6 +1589,23 @@ namespace xsimd
             return _mm256_storeu_pd(mem, self);
         }
 
+        // store_stream
+        template <class A>
+        XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
+        {
+            _mm256_stream_ps(mem, self);
+        }
+        template <class A>
+        XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
+        {
+            _mm256_stream_pd(mem, self);
+        }
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value, void>>
+        XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
+        {
+            _mm256_stream_si256((__m256i*)mem, self);
+        }
+
         // sub
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
@@ -1998,46 +2089,6 @@ namespace xsimd
             return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1);
         }
 
-        // first
-        template <class A>
-        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx>) noexcept
-        {
-            return _mm256_cvtss_f32(self);
-        }
-
-        template <class A>
-        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx>) noexcept
-        {
-            return _mm256_cvtsd_f64(self);
-        }
-
-        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
-        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx>) noexcept
-        {
-            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
-            {
-                return static_cast<T>(_mm256_cvtsi256_si32(self) & 0xFF);
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
-            {
-                return static_cast<T>(_mm256_cvtsi256_si32(self) & 0xFFFF);
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
-            {
-                return static_cast<T>(_mm256_cvtsi256_si32(self));
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
-            {
-                batch<T, sse4_2> low = _mm256_castsi256_si128(self);
-                return first(low, sse4_2 {});
-            }
-            else
-            {
-                assert(false && "unsupported arch/op combination");
-                return {};
-            }
-        }
-
         // widen
         template <class A, class T>
         XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
index bf6d9e7de..e2c223cc7 100644
--- a/include/xsimd/arch/xsimd_avx2.hpp
+++ b/include/xsimd/arch/xsimd_avx2.hpp
@@ -12,13 +12,13 @@
 #ifndef XSIMD_AVX2_HPP
 #define XSIMD_AVX2_HPP
 
-#include <complex>
-#include <type_traits>
-
 #include "../types/xsimd_avx2_register.hpp"
 #include "../types/xsimd_batch_constant.hpp"
+#include "./utils/shifts.hpp"
 
+#include <complex>
 #include <limits>
+#include <type_traits>
 
 namespace xsimd
 {
@@ -229,6 +229,23 @@ namespace xsimd
             store_masked<A>(reinterpret_cast<int64_t*>(mem), s64, batch_bool_constant<int64_t, A, Values...> {}, Mode {}, avx2 {});
         }
 
+        // load_stream
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value, void>>
+        XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx2>) noexcept
+        {
+            return _mm256_stream_load_si256((__m256i const*)mem);
+        }
+        template <class A>
+        XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<avx2>) noexcept
+        {
+            return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i const*)mem));
+        }
+        template <class A>
+        XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<avx2>) noexcept
+        {
+            return _mm256_castsi256_pd(_mm256_stream_load_si256((__m256i const*)mem));
+        }
+
         // bitwise_and
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
@@ -332,6 +349,29 @@ namespace xsimd
             }
         }
 
+        // bitwise_lshift with per-element compile-time shift counts (batch_constant overloads).
+        // Cases without a dedicated overload here are dispatched to the `batch` overload in xsimd_api.
+        // The 1-byte constant overload calls the 2-byte constant overload, which in turn calls the
+        // 4-byte version; that one resolves to the dynamic (runtime shift) overload above.
+        template <class A, class T, T... Vs,
+                  std::enable_if_t<std::is_integral<T>::value && (sizeof(T) <= 2), int> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_lshift(
+            batch<T, A> const& self, batch_constant<T, A, Vs...> shifts, requires_arch<avx2> req) noexcept
+        {
+            using uint_t = std::make_unsigned_t<T>;
+
+            // AVX2 has no per-element variable shift for 8/16-bit lanes: when every shift count is
+            // equal use the uniform shift, otherwise emulate with shifts on lanes twice as wide.
+            XSIMD_IF_CONSTEXPR(utils::all_equals(shifts))
+            {
+                return bitwise_lshift<shifts.get(0), A>(self, req);
+            }
+            return bitwise_cast<T>(
+                utils::bitwise_lshift_as_twice_larger<uint_t>(
+                    bitwise_cast<uint_t>(self),
+                    batch_constant<uint_t, A, static_cast<uint_t>(Vs)...> {}));
+        }
+
         // bitwise_or
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
@@ -528,6 +568,7 @@ namespace xsimd
                                                  0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
                 __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); //  2^52
                 __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); //  2^84 + 2^52
+                detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)");
                 return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
             }
 
@@ -543,6 +584,7 @@ namespace xsimd
                                                  0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
                 __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); //  2^52
                 __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); //  3*2^67 + 2^52
+                detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)");
                 return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
             }
         }
@@ -869,12 +911,95 @@ namespace xsimd
             {
                 return _mm256_mullo_epi32(self, other);
             }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_add_epi64(
+                    _mm256_mul_epu32(self, other),
+                    _mm256_slli_epi64(
+                        _mm256_add_epi64(
+                            _mm256_mul_epu32(other, _mm256_shuffle_epi32(self, _MM_SHUFFLE(2, 3, 0, 1))),
+                            _mm256_mul_epu32(self, _mm256_shuffle_epi32(other, _MM_SHUFFLE(2, 3, 0, 1)))),
+                        32));
+            }
             else
             {
                 return mul(self, other, avx {});
             }
         }
 
+        // mul_hi
+        template <class A>
+        XSIMD_INLINE batch<int8_t, A> mul_hi(batch<int8_t, A> const& self, batch<int8_t, A> const& other, requires_arch<avx2>) noexcept
+        {
+            // Sign-extend bytes to 16-bit (unpack-with-self followed by srai 8
+            // duplicates the byte then arithmetic-shifts the sign in), do the
+            // 16x16->16 multiply, then take the high byte of each product.
+            // unpacklo/unpackhi and packs are all per-128-bit-lane, so the
+            // round trip preserves byte ordering and no vpermq is needed.
+            __m256i a_lo = _mm256_srai_epi16(_mm256_unpacklo_epi8(self, self), 8);
+            __m256i a_hi = _mm256_srai_epi16(_mm256_unpackhi_epi8(self, self), 8);
+            __m256i b_lo = _mm256_srai_epi16(_mm256_unpacklo_epi8(other, other), 8);
+            __m256i b_hi = _mm256_srai_epi16(_mm256_unpackhi_epi8(other, other), 8);
+            __m256i p_lo = _mm256_srai_epi16(_mm256_mullo_epi16(a_lo, b_lo), 8);
+            __m256i p_hi = _mm256_srai_epi16(_mm256_mullo_epi16(a_hi, b_hi), 8);
+            // results already lie in [-128, 127], so packs is exact (no saturation kicks in).
+            return _mm256_packs_epi16(p_lo, p_hi);
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint8_t, A> mul_hi(batch<uint8_t, A> const& self, batch<uint8_t, A> const& other, requires_arch<avx2>) noexcept
+        {
+            __m256i zero = _mm256_setzero_si256();
+            __m256i a_lo = _mm256_unpacklo_epi8(self, zero);
+            __m256i a_hi = _mm256_unpackhi_epi8(self, zero);
+            __m256i b_lo = _mm256_unpacklo_epi8(other, zero);
+            __m256i b_hi = _mm256_unpackhi_epi8(other, zero);
+            __m256i p_lo = _mm256_srli_epi16(_mm256_mullo_epi16(a_lo, b_lo), 8);
+            __m256i p_hi = _mm256_srli_epi16(_mm256_mullo_epi16(a_hi, b_hi), 8);
+            return _mm256_packus_epi16(p_lo, p_hi);
+        }
+        template <class A>
+        XSIMD_INLINE batch<int16_t, A> mul_hi(batch<int16_t, A> const& self, batch<int16_t, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_mulhi_epi16(self, other);
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint16_t, A> mul_hi(batch<uint16_t, A> const& self, batch<uint16_t, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return _mm256_mulhi_epu16(self, other);
+        }
+        template <class A>
+        XSIMD_INLINE batch<int32_t, A> mul_hi(batch<int32_t, A> const& self, batch<int32_t, A> const& other, requires_arch<avx2>) noexcept
+        {
+            __m256i even = _mm256_mul_epi32(self, other);
+            __m256i odd = _mm256_mul_epi32(_mm256_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)),
+                                           _mm256_shuffle_epi32(other, _MM_SHUFFLE(3, 3, 1, 1)));
+            __m256i even_hi = _mm256_srli_epi64(even, 32);
+            return _mm256_blend_epi16(even_hi, odd, 0xCC);
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint32_t, A> mul_hi(batch<uint32_t, A> const& self, batch<uint32_t, A> const& other, requires_arch<avx2>) noexcept
+        {
+            __m256i even = _mm256_mul_epu32(self, other);
+            __m256i odd = _mm256_mul_epu32(_mm256_srli_epi64(self, 32), _mm256_srli_epi64(other, 32));
+            __m256i even_hi = _mm256_srli_epi64(even, 32);
+            return _mm256_blend_epi16(even_hi, odd, 0xCC);
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<uint64_t, A> mul_hi(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return detail::mulhi_u64_core<A>(self, other,
+                                             [](batch<uint64_t, A> a, batch<uint64_t, A> b)
+                                             { return batch<uint64_t, A>(_mm256_mul_epu32(a, b)); });
+        }
+        template <class A>
+        XSIMD_INLINE batch<int64_t, A> mul_hi(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx2>) noexcept
+        {
+            return detail::mulhi_i64_core<A>(self, other,
+                                             [](batch<uint64_t, A> a, batch<uint64_t, A> b)
+                                             { return batch<uint64_t, A>(_mm256_mul_epu32(a, b)); });
+        }
+
         // reduce_add
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
@@ -1225,11 +1350,9 @@ namespace xsimd
             __m256i r0 = _mm256_shuffle_epi8(self, half_mask);
             __m256i r1 = _mm256_shuffle_epi8(swapped, half_mask);
 
-            // select lane by the mask index divided by 16
-            constexpr auto lane = batch_constant<
-                uint8_t, A,
-                00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00,
-                16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16> {};
+            // select lane by the mask index divided by 16, first lane is 0, second is 16.
+            constexpr auto lane_size = make_batch_constant<uint8_t, 16, A>();
+            constexpr auto lane = (make_iota_batch_constant<uint8_t, A>() / lane_size) * lane_size;
             batch_bool<uint8_t, A> blend_mask = (mask & 0b10000u) != lane;
             return _mm256_blendv_epi8(r0, r1, blend_mask);
         }
@@ -1259,66 +1382,32 @@ namespace xsimd
 
         namespace detail
         {
-            template <typename T>
-            constexpr T swizzle_val_none()
+            template <bool cross_batch, typename T, T... Vals>
+            struct swizzle_mask
             {
-                // Most significant bit of the byte must be 1
-                return 0x80;
-            }
-
-            template <typename T>
-            constexpr bool swizzle_val_is_cross_lane(T val, T idx, T size)
-            {
-                return (idx < (size / 2)) != (val < (size / 2));
-            }
+                static constexpr auto values = std::array<T, sizeof...(Vals)> { Vals... };
 
-            template <typename T>
-            constexpr bool swizzle_val_is_defined(T val, T size)
-            {
-                return (0 <= val) && (val < size);
-            }
-
-            template <typename T>
-            constexpr T swizzle_self_val(T val, T idx, T size)
-            {
-                return (swizzle_val_is_defined(val, size) && !swizzle_val_is_cross_lane(val, idx, size))
-                    ? val % (size / 2)
-                    : swizzle_val_none<T>();
-            }
-
-            template <typename T, typename A, T... Vals, std::size_t... Ids>
-            constexpr batch_constant<T, A, swizzle_self_val(Vals, T(Ids), static_cast<T>(sizeof...(Vals)))...>
-            swizzle_make_self_batch_impl(std::index_sequence<Ids...>)
-            {
-                return {};
-            }
-
-            template <typename T, typename A, T... Vals>
-            constexpr auto swizzle_make_self_batch()
-            {
-                return swizzle_make_self_batch_impl<T, A, Vals...>(std::make_index_sequence<sizeof...(Vals)>());
-            }
-
-            template <typename T>
-            constexpr T swizzle_cross_val(T val, T idx, T size)
-            {
-                return (swizzle_val_is_defined(val, size) && swizzle_val_is_cross_lane(val, idx, size))
-                    ? val % (size / 2)
-                    : swizzle_val_none<T>();
-            }
+                static constexpr T get(std::size_t idx_, std::size_t size_) noexcept
+                {
+                    const T size = static_cast<T>(size_);
+                    const T idx = static_cast<T>(idx_);
+                    const T val = values[idx_];
 
-            template <typename T, typename A, T... Vals, std::size_t... Ids>
-            constexpr batch_constant<T, A, swizzle_cross_val(Vals, T(Ids), static_cast<T>(sizeof...(Vals)))...>
-            swizzle_make_cross_batch_impl(std::index_sequence<Ids...>)
-            {
-                return {};
-            }
+                    // Check if value in bounds
+                    if ((T(0) <= val) && (val < size))
+                    {
+                        // Whether we need to access the value from the other lane
+                        const bool val_is_cross_lane = (idx < (size / 2)) != (val < (size / 2));
+                        if (val_is_cross_lane == cross_batch)
+                        {
+                            return val % (size / 2);
+                        }
+                    }
 
-            template <typename T, typename A, T... Vals>
-            constexpr auto swizzle_make_cross_batch()
-            {
-                return swizzle_make_cross_batch_impl<T, A, Vals...>(std::make_index_sequence<sizeof...(Vals)>());
-            }
+                    // Out of bounds with most significant bit set to 1 will set the swizzle target to 0
+                    return ~T {};
+                }
+            };
         }
 
         // swizzle (constant mask)
@@ -1354,8 +1443,8 @@ namespace xsimd
 
             // We can outsmart the dynamic version by creating a compile-time mask that leaves zeros
             // where it does not need to select data, resulting in a simple OR merge of the two batches.
-            constexpr auto self_mask = detail::swizzle_make_self_batch<uint8_t, A, Vals...>();
-            constexpr auto cross_mask = detail::swizzle_make_cross_batch<uint8_t, A, Vals...>();
+            constexpr auto self_mask = make_batch_constant<uint8_t, detail::swizzle_mask<false, uint8_t, Vals...>, A>();
+            constexpr auto cross_mask = make_batch_constant<uint8_t, detail::swizzle_mask<true, uint8_t, Vals...>, A>();
 
             // permute bytes within each lane (AVX2 only)
             __m256i r0 = _mm256_shuffle_epi8(self, self_mask.as_batch());
diff --git a/include/xsimd/arch/xsimd_avx2_128.hpp b/include/xsimd/arch/xsimd_avx2_128.hpp
new file mode 100644
index 000000000..7a590c74f
--- /dev/null
+++ b/include/xsimd/arch/xsimd_avx2_128.hpp
@@ -0,0 +1,170 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ * Copyright (c) Marco Barbone                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX2_128_HPP
+#define XSIMD_AVX2_128_HPP
+
+#include "../types/xsimd_avx2_register.hpp"
+#include "../types/xsimd_batch_constant.hpp"
+
+#include <type_traits>
+
+namespace xsimd
+{
+    namespace kernel
+    {
+        using namespace types;
+
+        // select (compile-time mask): 32-bit integers use one immediate blend, every other width defers to avx_128
+        template <class A, class T, bool... Values, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2_128>) noexcept
+        {
+            constexpr int mask = batch_bool_constant<T, A, Values...>::mask(); // NOTE(review): presumably one bit per element — confirm in batch_bool_constant
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_blend_epi32(false_br, true_br, mask); // a set bit picks the element from true_br (second operand)
+            }
+            else
+            {
+                return select(batch_bool_constant<T, A, Values...>(), true_br, false_br, avx_128 {}); // re-dispatch with the parent arch tag
+            }
+        }
+
+        // bitwise_lshift (per-element counts): AVX2 variable shifts for 32/64-bit integers, narrower types defer to avx_128
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2_128>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_sllv_epi32(self, other); // each 32-bit element shifted by its own count
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm_sllv_epi64(self, other); // each 64-bit element shifted by its own count
+            }
+            else
+            {
+                return bitwise_lshift(self, other, avx_128 {}); // 8/16-bit: same 128-bit fallback tag as select/bitwise_rshift in this header (was the 256-bit `avx` tag)
+            }
+        }
+
+        // bitwise_rshift (per-element counts): arithmetic shift for signed T, logical shift for unsigned T
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2_128>) noexcept
+        {
+            if (std::is_signed<T>::value) // plain `if` on a compile-time constant; both arms are instantiated
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm_srav_epi32(self, other); // sign-propagating variable shift
+                }
+                else
+                {
+                    return bitwise_rshift(self, other, avx_128 {}); // every signed width other than 32-bit (including 64-bit) defers to avx_128
+                }
+            }
+            else
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm_srlv_epi32(self, other); // zero-filling variable shift, 32-bit
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm_srlv_epi64(self, other); // zero-filling variable shift, 64-bit
+                }
+                else
+                {
+                    return bitwise_rshift(self, other, avx_128 {}); // 8/16-bit unsigned defer to avx_128
+                }
+            }
+        }
+
+        // load_masked (compile-time mask): AVX2 masked loads for 32/64-bit integers; the vector mask comes from mask.as_batch()
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<avx2_128>) noexcept
+        {
+            return _mm_maskload_epi32(mem, mask.as_batch());
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...> mask, convert<uint32_t>, Mode, requires_arch<avx2_128>) noexcept
+        {
+            return _mm_maskload_epi32(reinterpret_cast<int32_t const*>(mem), mask.as_batch()); // reinterpret signedness only, const is kept
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<int64_t, A> load_masked(int64_t const* mem, batch_bool_constant<int64_t, A, Values...> mask, convert<int64_t>, Mode, requires_arch<avx2_128>) noexcept
+        {
+            return _mm_maskload_epi64(reinterpret_cast<long long const*>(mem), mask.as_batch()); // intrinsic takes long long; int64_t may be plain long
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<uint64_t, A> load_masked(uint64_t const* mem, batch_bool_constant<uint64_t, A, Values...> mask, convert<uint64_t>, Mode, requires_arch<avx2_128>) noexcept
+        {
+            return _mm_maskload_epi64(reinterpret_cast<long long const*>(mem), mask.as_batch()); // same 64-bit load, signedness reinterpreted
+        }
+
+        // store_masked (compile-time mask): AVX2 masked stores for 32/64-bit integers; the vector mask comes from mask.as_batch()
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
+        {
+            _mm_maskstore_epi32(mem, mask.as_batch(), src);
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
+        {
+            _mm_maskstore_epi32(reinterpret_cast<int32_t*>(mem), mask.as_batch(), src); // reinterpret signedness only
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
+        {
+            _mm_maskstore_epi64(reinterpret_cast<long long*>(mem), mask.as_batch(), src); // intrinsic takes long long*; int64_t may be plain long
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...> mask, Mode, requires_arch<avx2_128>) noexcept
+        {
+            _mm_maskstore_epi64(reinterpret_cast<long long*>(mem), mask.as_batch(), src); // same 64-bit store, signedness reinterpreted
+        }
+
+        // gather: AVX2 hardware gathers; the index width must equal the element width, and the scale is the element size so indices count elements
+        template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
+        XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+                                        kernel::requires_arch<avx2_128>) noexcept
+        {
+            return _mm_i32gather_epi32(reinterpret_cast<const int*>(src), index, sizeof(T)); // 32-bit integers, 32-bit indices
+        }
+
+        template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
+        XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+                                        kernel::requires_arch<avx2_128>) noexcept
+        {
+            return _mm_i64gather_epi64(reinterpret_cast<const long long int*>(src), index, sizeof(T)); // 64-bit integers, 64-bit indices
+        }
+
+        template <class A, class U,
+                  detail::enable_sized_integral_t<U, 4> = 0>
+        XSIMD_INLINE batch<float, A> gather(batch<float, A> const&, float const* src,
+                                            batch<U, A> const& index,
+                                            kernel::requires_arch<avx2_128>) noexcept
+        {
+            return _mm_i32gather_ps(src, index, sizeof(float)); // floats, 32-bit indices
+        }
+
+        template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
+        XSIMD_INLINE batch<double, A> gather(batch<double, A> const&, double const* src,
+                                             batch<U, A> const& index,
+                                             requires_arch<avx2_128>) noexcept
+        {
+            return _mm_i64gather_pd(src, index, sizeof(double)); // doubles, 64-bit indices
+        }
+    }
+}
+
+#endif
diff --git a/include/xsimd/arch/xsimd_avx512bw.hpp b/include/xsimd/arch/xsimd_avx512bw.hpp
index 28e2e98d6..57894a831 100644
--- a/include/xsimd/arch/xsimd_avx512bw.hpp
+++ b/include/xsimd/arch/xsimd_avx512bw.hpp
@@ -12,11 +12,11 @@
 #ifndef XSIMD_AVX512BW_HPP
 #define XSIMD_AVX512BW_HPP
 
+#include "../types/xsimd_avx512bw_register.hpp"
+
 #include <array>
 #include <type_traits>
 
-#include "../types/xsimd_avx512bw_register.hpp"
-
 namespace xsimd
 {
 
@@ -470,6 +470,43 @@ namespace xsimd
             }
         }
 
+        // mul_hi: high half of the widened element-wise product (8-bit via 16-bit lanes, 16-bit via native mulhi)
+        template <class A>
+        XSIMD_INLINE batch<int8_t, A> mul_hi(batch<int8_t, A> const& self, batch<int8_t, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            // Per-128-bit-lane unpack/pack pair preserves byte ordering across
+            // the four 128-bit lanes of a ZMM, so no inter-lane permute needed.
+            __m512i a_lo = _mm512_srai_epi16(_mm512_unpacklo_epi8(self, self), 8); // byte duplicated into a word, then >>8 arithmetic = sign-extended byte
+            __m512i a_hi = _mm512_srai_epi16(_mm512_unpackhi_epi8(self, self), 8);
+            __m512i b_lo = _mm512_srai_epi16(_mm512_unpacklo_epi8(other, other), 8);
+            __m512i b_hi = _mm512_srai_epi16(_mm512_unpackhi_epi8(other, other), 8);
+            __m512i p_lo = _mm512_srai_epi16(_mm512_mullo_epi16(a_lo, b_lo), 8); // int8*int8 fits in 16 bits; keep its high byte, sign-extended
+            __m512i p_hi = _mm512_srai_epi16(_mm512_mullo_epi16(a_hi, b_hi), 8);
+            return _mm512_packs_epi16(p_lo, p_hi); // values already lie in int8 range, so the saturating pack only narrows
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint8_t, A> mul_hi(batch<uint8_t, A> const& self, batch<uint8_t, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            __m512i zero = _mm512_setzero_si512();
+            __m512i a_lo = _mm512_unpacklo_epi8(self, zero); // interleaving with zero zero-extends each byte to 16 bits
+            __m512i a_hi = _mm512_unpackhi_epi8(self, zero);
+            __m512i b_lo = _mm512_unpacklo_epi8(other, zero);
+            __m512i b_hi = _mm512_unpackhi_epi8(other, zero);
+            __m512i p_lo = _mm512_srli_epi16(_mm512_mullo_epi16(a_lo, b_lo), 8); // uint8*uint8 fits in 16 bits; logical >>8 keeps the high byte
+            __m512i p_hi = _mm512_srli_epi16(_mm512_mullo_epi16(a_hi, b_hi), 8);
+            return _mm512_packus_epi16(p_lo, p_hi); // per-128-bit-lane pack mirrors the per-lane unpack, restoring element order
+        }
+        template <class A>
+        XSIMD_INLINE batch<int16_t, A> mul_hi(batch<int16_t, A> const& self, batch<int16_t, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            return _mm512_mulhi_epi16(self, other); // native signed 16-bit high-half multiply
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint16_t, A> mul_hi(batch<uint16_t, A> const& self, batch<uint16_t, A> const& other, requires_arch<avx512bw>) noexcept
+        {
+            return _mm512_mulhi_epu16(self, other); // native unsigned 16-bit high-half multiply
+        }
+
         // neq
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx512er.hpp b/include/xsimd/arch/xsimd_avx512er.hpp
index be02f9850..ee69ef9f8 100644
--- a/include/xsimd/arch/xsimd_avx512er.hpp
+++ b/include/xsimd/arch/xsimd_avx512er.hpp
@@ -12,9 +12,6 @@
 #ifndef XSIMD_AVX512ER_HPP
 #define XSIMD_AVX512ER_HPP
 
-#include <array>
-#include <type_traits>
-
 #include "../types/xsimd_avx512er_register.hpp"
 
 #endif
diff --git a/include/xsimd/arch/xsimd_avx512f.hpp b/include/xsimd/arch/xsimd_avx512f.hpp
index 5ccf165f1..6a7316722 100644
--- a/include/xsimd/arch/xsimd_avx512f.hpp
+++ b/include/xsimd/arch/xsimd_avx512f.hpp
@@ -12,13 +12,13 @@
 #ifndef XSIMD_AVX512F_HPP
 #define XSIMD_AVX512F_HPP
 
+#include "../types/xsimd_avx512f_register.hpp"
+#include "../types/xsimd_batch_constant.hpp"
+
 #include <complex>
 #include <limits>
 #include <type_traits>
 
-#include "../types/xsimd_avx512f_register.hpp"
-#include "../types/xsimd_batch_constant.hpp"
-
 namespace xsimd
 {
 
@@ -1346,6 +1346,97 @@ namespace xsimd
             }
         }
 
+        // first: extract element 0 as a scalar (declared before get, which calls it, for two-phase lookup)
+        template <class A>
+        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_cvtss_f32(self);
+        }
+
+        template <class A>
+        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_cvtsd_f64(self);
+        }
+
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF); // low dword of the low 128 bits, masked to one byte
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF); // masked to one 16-bit word
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self))); // the low dword is the element itself
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                batch<T, sse4_2> low = _mm512_castsi512_si128(self); // reuse the 128-bit kernel on the low quarter
+                return first(low, sse4_2 {});
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
+        // get: use valignd/valignq to rotate lane I into position 0 in a single op, then read element 0.
+        template <class A, size_t I>
+        XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0)
+            {
+                return first(self, avx512f {}); // no rotate needed for element 0
+            }
+            const auto rotated = _mm512_alignr_epi32(_mm512_castps_si512(self), _mm512_castps_si512(self), I); // same vector as both operands = rotate by I dwords
+            return _mm_cvtss_f32(_mm512_castps512_ps128(_mm512_castsi512_ps(rotated)));
+        }
+
+        template <class A, size_t I>
+        XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0)
+            {
+                return first(self, avx512f {}); // no rotate needed for element 0
+            }
+            const auto rotated = _mm512_alignr_epi64(_mm512_castpd_si512(self), _mm512_castpd_si512(self), I); // rotate by I qwords
+            return _mm_cvtsd_f64(_mm512_castpd512_pd128(_mm512_castsi512_pd(rotated)));
+        }
+
+        template <class A, size_t I, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<avx512f>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0)
+            {
+                return first(self, avx512f {});
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                const auto rotated = _mm512_alignr_epi32(self, self, I); // element I rotated down to slot 0
+                return first(batch<T, sse4_2>(_mm512_castsi512_si128(rotated)), sse4_2 {});
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                const auto rotated = _mm512_alignr_epi64(self, self, I); // element I rotated down to slot 0
+                return first(batch<T, sse4_2>(_mm512_castsi512_si128(rotated)), sse4_2 {});
+            }
+            else
+            {
+                // 8/16-bit lanes have no sub-dword rotate in AVX-512F; delegate to AVX halves.
+                constexpr size_t elements_per_lane = batch<T, avx>::size; // element count of one 256-bit half
+                constexpr size_t lane = I / elements_per_lane; // which half holds element I
+                constexpr size_t sub_index = I % elements_per_lane; // position inside that half
+                const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self);
+                return kernel::get(batch<T, avx>(half), ::xsimd::index<sub_index> {}, avx {});
+            }
+        }
+
         // insert
         template <class A, size_t I>
         XSIMD_INLINE batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<avx512f>) noexcept
@@ -1427,15 +1518,40 @@ namespace xsimd
         {
             // Adapted from https://github.com/serge-sans-paille/fast-bitset-from-bool-array
             // Generate a bitset from an array of boolean.
-            XSIMD_INLINE unsigned char tobitset(unsigned char unpacked[8])
+            template <size_t N>
+            XSIMD_INLINE unsigned char tobitset(unsigned char unpacked[N])
             {
-                uint64_t data;
-                memcpy(&data, unpacked, sizeof(uint64_t));
+                static_assert(N == 8 || N == 4 || N == 2, "valid pack size"); // only these pack widths have a branch below
+                XSIMD_IF_CONSTEXPR(N == 8)
+                {
+                    uint64_t data;
+                    memcpy(&data, unpacked, sizeof(uint64_t)); // read the 8 bool bytes as one integer
 
-                const uint64_t magic = (0x80 + 0x4000 + 0x200000 + 0x10000000 + 0x0800000000 + 0x040000000000 + 0x02000000000000 + 0x0100000000000000);
+                    const uint64_t magic = (0x80 + 0x4000 + 0x200000 + 0x10000000 + 0x0800000000 + 0x040000000000 + 0x02000000000000 + 0x0100000000000000); // weight 2^(56-7i) lifts the low bit of byte i to bit 56+i
 
-                unsigned char res = ((data * magic) >> 56) & 0xFF;
-                return res;
+                    unsigned char res = ((data * magic) >> 56) & 0xFF; // top byte: bit i mirrors unpacked[i] (for 0/1 inputs)
+                    return res;
+                }
+                else XSIMD_IF_CONSTEXPR(N == 4)
+                {
+                    uint32_t data;
+                    memcpy(&data, unpacked, sizeof(uint32_t));
+
+                    const uint32_t magic = (0x80 + 0x4000 + 0x200000 + 0x10000000); // NOTE(review): weights 2^(28-7i) put byte i on bit 28+i, i.e. bits 4..7 of res, not 0..3 — confirm callers expect the high nibble
+
+                    unsigned char res = ((data * magic) >> 24) & 0xFF;
+                    return res;
+                }
+                else XSIMD_IF_CONSTEXPR(N == 2)
+                {
+                    uint16_t data;
+                    memcpy(&data, unpacked, sizeof(uint16_t));
+
+                    const uint16_t magic = (0x80 + 0x4000); // NOTE(review): bytes 0/1 land on bits 14/15, i.e. bits 6..7 of res, not 0..1 — confirm intended
+
+                    unsigned char res = ((data * magic) >> 8) & 0xFF; // operands promote to int before the multiply
+                    return res;
+                }
             }
         }
 
@@ -1450,7 +1566,7 @@ namespace xsimd
             register_type mask = 0;
             for (std::size_t i = 0; i < iter; ++i)
             {
-                unsigned char block = detail::tobitset((unsigned char*)mem + i * 8);
+                unsigned char block = detail::tobitset<8>((unsigned char*)mem + i * 8);
                 mask |= (register_type(block) << (i * 8));
             }
             return mask;
@@ -1513,6 +1629,23 @@ namespace xsimd
             return _mm512_loadu_pd(mem);
         }
 
+        // load_stream: non-temporal 512-bit load; float/double reuse the integer load and bit-cast the result
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value, void>>
+        XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_stream_load_si512((__m512i*)mem); // C-style cast also drops const; the intrinsic is documented to need a 64-byte aligned address
+        }
+        template <class A>
+        XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_castsi512_ps(_mm512_stream_load_si512((__m512i*)mem)); // cast is a reinterpretation, no value conversion
+        }
+        template <class A>
+        XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
+        {
+            return _mm512_castsi512_pd(_mm512_stream_load_si512((__m512i*)mem)); // cast is a reinterpretation, no value conversion
+        }
+
         // lt
         template <class A>
         XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
@@ -1664,6 +1797,41 @@ namespace xsimd
             }
         }
 
+        // mul_hi: high half of the widened product; 32-bit via even/odd 32x32->64 multiplies, 64-bit via shared detail helpers
+        template <class A>
+        XSIMD_INLINE batch<int32_t, A> mul_hi(batch<int32_t, A> const& self, batch<int32_t, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            __m512i even = _mm512_mul_epi32(self, other); // signed 64-bit products of the even-indexed elements
+            __m512i odd = _mm512_mul_epi32(_mm512_shuffle_epi32(self, _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 1, 1))), // shuffle copies elements 1 and 3 of each 128-bit lane into the even slots
+                                           _mm512_shuffle_epi32(other, _MM_PERM_ENUM(_MM_SHUFFLE(3, 3, 1, 1))));
+            __m512i even_hi = _mm512_srli_epi64(even, 32);
+            // merge: even_hi has hi in low-32 of each 64, odd has hi in high-32 of each 64
+            return _mm512_mask_blend_epi32(static_cast<__mmask16>(0xAAAA), even_hi, odd); // 0xAAAA = odd element positions come from `odd`
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint32_t, A> mul_hi(batch<uint32_t, A> const& self, batch<uint32_t, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            __m512i even = _mm512_mul_epu32(self, other); // unsigned 64-bit products of the even-indexed elements
+            __m512i odd = _mm512_mul_epu32(_mm512_srli_epi64(self, 32), _mm512_srli_epi64(other, 32)); // odd elements shifted down into the even slots first
+            __m512i even_hi = _mm512_srli_epi64(even, 32);
+            return _mm512_mask_blend_epi32(static_cast<__mmask16>(0xAAAA), even_hi, odd); // same merge as the signed overload
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<uint64_t, A> mul_hi(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            return detail::mulhi_u64_core<A>(self, other, // helper is defined outside this view; it receives the multiply primitive below
+                                             [](batch<uint64_t, A> a, batch<uint64_t, A> b)
+                                             { return batch<uint64_t, A>(_mm512_mul_epu32(a, b)); }); // unsigned 32x32->64 multiply of the low dwords
+        }
+        template <class A>
+        XSIMD_INLINE batch<int64_t, A> mul_hi(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512f>) noexcept
+        {
+            return detail::mulhi_i64_core<A>(self, other, // signed helper is still fed the unsigned multiply primitive
+                                             [](batch<uint64_t, A> a, batch<uint64_t, A> b)
+                                             { return batch<uint64_t, A>(_mm512_mul_epu32(a, b)); });
+        }
+
         // nearbyint
         template <class A>
         XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx512f>) noexcept
@@ -2285,6 +2453,23 @@ namespace xsimd
             return _mm512_storeu_pd(mem, self);
         }
 
+        // store_stream: non-temporal 512-bit store, one overload per register flavour (integer, float, double)
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value, void>>
+        XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            _mm512_stream_si512((__m512i*)mem, self); // all integer widths share the same 512-bit store
+        }
+        template <class A>
+        XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            _mm512_stream_ps(mem, self);
+        }
+        template <class A>
+        XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
+        {
+            _mm512_stream_pd(mem, self);
+        }
+
         // sub
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
@@ -2449,7 +2634,7 @@ namespace xsimd
             };
 
             template <class T, class A, T Idx0, T Idx1, T... Idx>
-            struct is_pair_of_contiguous_indices<T, A, Idx0, Idx1, Idx...> : std::conditional<(Idx0 % 2 == 0) && (Idx0 + 1 == Idx1), is_pair_of_contiguous_indices<T, A, Idx...>, std::false_type>::type
+            struct is_pair_of_contiguous_indices<T, A, Idx0, Idx1, Idx...> : std::conditional_t<(Idx0 % 2 == 0) && (Idx0 + 1 == Idx1), is_pair_of_contiguous_indices<T, A, Idx...>, std::false_type>
             {
             };
 
@@ -2463,30 +2648,50 @@ namespace xsimd
                                             I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>;
             };
 
+            template <class A, uint16_t... Is>
+            constexpr bool is_reduce_pattern() // true only when Is... is exactly the index pattern described below
+            {
+                // The actual pattern is {1, 1, 0, 1, 0, 1, ..., 0, 1}
+                if (sizeof...(Is) != batch<uint16_t, A>::size) // must cover every uint16_t element of the batch
+                    return false;
+                uint16_t pattern[] = { Is... };
+                if (pattern[0] != 1) // element 0 is the only slot that breaks the alternation
+                    return false;
+                for (size_t i = 1; i < sizeof...(Is); i += 1)
+                {
+                    if (pattern[i] != (i & 1)) // odd positions hold 1, even positions hold 0
+                        return false;
+                }
+                return true;
+            }
         }
 
-        template <class A, uint16_t... Idx, class = std::enable_if_t<detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value>>
-        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...>, requires_arch<avx512f>) noexcept
-        {
-            constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
-            return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
-        }
-
-        template <class A>
-        XSIMD_INLINE batch<uint16_t, A>
-        swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept
+        template <class A, uint16_t... Idx>
+        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...> mask, requires_arch<avx512f>) noexcept
         {
-            // FIXME: this sequence is very inefficient, but it's here to catch
-            // a pattern generated by detail::reduce from xsimd_common_math.hpp.
-            // The whole pattern is actually decently folded by GCC and Clang,
-            // so bare with it.
-            constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
-            auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
+            XSIMD_IF_CONSTEXPR(detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value) // indices come in (2k, 2k+1) pairs: permute whole dwords
+            {
+                constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32; // every even index halved gives the dword index
+                return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
+            }
+            else XSIMD_IF_CONSTEXPR(detail::is_reduce_pattern<A, Idx...>())
+            {
+                // FIXME: this sequence is very inefficient, but it's here to catch
+                // a pattern generated by detail::reduce from xsimd_common_math.hpp.
+                // The whole pattern is actually decently folded by GCC and Clang,
+                // so bear with it.
+                constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
+                auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self); // broadcast dword 0, giving words {0, 1, 0, 1, ...}
 
-            alignas(A::alignment()) uint16_t buffer[32];
-            _mm512_store_si512((__m512i*)&buffer[0], tmp);
-            buffer[0] = buffer[1];
-            return _mm512_load_si512(&buffer[0]);
+                alignas(A::alignment()) uint16_t buffer[32];
+                _mm512_store_si512((__m512i*)&buffer[0], tmp);
+                buffer[0] = buffer[1]; // patch word 0 so the result reads {1, 1, 0, 1, ...}
+                return _mm512_load_si512(&buffer[0]);
+            }
+            else
+            {
+                return swizzle(self, mask, common {}); // any other index pattern goes to the generic kernel
+            }
         }
 
         template <class A, uint16_t... Vs>
@@ -2719,46 +2924,6 @@ namespace xsimd
                 2));
         }
 
-        // first
-        template <class A>
-        XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<avx512f>) noexcept
-        {
-            return _mm512_cvtss_f32(self);
-        }
-
-        template <class A>
-        XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<avx512f>) noexcept
-        {
-            return _mm512_cvtsd_f64(self);
-        }
-
-        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
-        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<avx512f>) noexcept
-        {
-            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
-            {
-                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF);
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
-            {
-                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF);
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
-            {
-                return static_cast<T>(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)));
-            }
-            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
-            {
-                batch<T, sse4_2> low = _mm512_castsi512_si128(self);
-                return first(low, sse4_2 {});
-            }
-            else
-            {
-                assert(false && "unsupported arch/op combination");
-                return {};
-            }
-        }
-
         // widen
         template <class A, class T>
         XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx512f>) noexcept
diff --git a/include/xsimd/arch/xsimd_avx512ifma.hpp b/include/xsimd/arch/xsimd_avx512ifma.hpp
index 206319348..13e55de8f 100644
--- a/include/xsimd/arch/xsimd_avx512ifma.hpp
+++ b/include/xsimd/arch/xsimd_avx512ifma.hpp
@@ -12,9 +12,6 @@
 #ifndef XSIMD_AVX512IFMA_HPP
 #define XSIMD_AVX512IFMA_HPP
 
-#include <array>
-#include <type_traits>
-
 #include "../types/xsimd_avx512ifma_register.hpp"
 
 #endif
diff --git a/include/xsimd/arch/xsimd_avx512pf.hpp b/include/xsimd/arch/xsimd_avx512pf.hpp
index 6265c9171..5c21d6787 100644
--- a/include/xsimd/arch/xsimd_avx512pf.hpp
+++ b/include/xsimd/arch/xsimd_avx512pf.hpp
@@ -12,9 +12,6 @@
 #ifndef XSIMD_AVX512PF_HPP
 #define XSIMD_AVX512PF_HPP
 
-#include <array>
-#include <type_traits>
-
 #include "../types/xsimd_avx512pf_register.hpp"
 
 #endif
diff --git a/include/xsimd/arch/xsimd_avx512vbmi.hpp b/include/xsimd/arch/xsimd_avx512vbmi.hpp
index 7c00d94ea..099adb7e0 100644
--- a/include/xsimd/arch/xsimd_avx512vbmi.hpp
+++ b/include/xsimd/arch/xsimd_avx512vbmi.hpp
@@ -12,11 +12,10 @@
 #ifndef XSIMD_AVX512VBMI_HPP
 #define XSIMD_AVX512VBMI_HPP
 
-#include <array>
-#include <type_traits>
-
 #include "../types/xsimd_avx512vbmi_register.hpp"
 
+#include <type_traits>
+
 namespace xsimd
 {
 
diff --git a/include/xsimd/arch/xsimd_avx512vbmi2.hpp b/include/xsimd/arch/xsimd_avx512vbmi2.hpp
index 8852b2a1e..705b8beaf 100644
--- a/include/xsimd/arch/xsimd_avx512vbmi2.hpp
+++ b/include/xsimd/arch/xsimd_avx512vbmi2.hpp
@@ -12,11 +12,10 @@
 #ifndef XSIMD_AVX512VBMI2_HPP
 #define XSIMD_AVX512VBMI2_HPP
 
-#include <array>
-#include <type_traits>
-
 #include "../types/xsimd_avx512vbmi2_register.hpp"
 
+#include <type_traits>
+
 namespace xsimd
 {
 
diff --git a/include/xsimd/config/xsimd_inline.hpp b/include/xsimd/arch/xsimd_avx512vl.hpp
similarity index 68%
rename from include/xsimd/config/xsimd_inline.hpp
rename to include/xsimd/arch/xsimd_avx512vl.hpp
index f3becaf12..d47b0df40 100644
--- a/include/xsimd/config/xsimd_inline.hpp
+++ b/include/xsimd/arch/xsimd_avx512vl.hpp
@@ -9,21 +9,11 @@
  * The full license is in the file LICENSE, distributed with this software. *
  ****************************************************************************/
 
-#ifndef XSIMD_INLINE_HPP
-#define XSIMD_INLINE_HPP
+#ifndef XSIMD_AVX512VL_HPP
+#define XSIMD_AVX512VL_HPP
 
-#if defined(__VEC__)
-#define XSIMD_INLINE inline
-#elif defined __has_attribute
-#if __has_attribute(always_inline)
-#define XSIMD_INLINE inline __attribute__((always_inline))
-#else
-#define XSIMD_INLINE inline
-#endif
-#elif defined(_MSC_VER)
-#define XSIMD_INLINE inline __forceinline
-#else
-#define XSIMD_INLINE inline
-#endif
+#include "../types/xsimd_avx512vl_register.hpp"
+
+// No 512-bit operations are provided by avx512-vl; it only provides 128-bit and 256-bit ones.
 
 #endif
diff --git a/include/xsimd/arch/xsimd_avx512vl_128.hpp b/include/xsimd/arch/xsimd_avx512vl_128.hpp
new file mode 100644
index 000000000..155338425
--- /dev/null
+++ b/include/xsimd/arch/xsimd_avx512vl_128.hpp
@@ -0,0 +1,647 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ * Copyright (c) Marco Barbone                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512VL_128_HPP
+#define XSIMD_AVX512VL_128_HPP
+
+#include "../types/xsimd_avx512vl_register.hpp"
+#include "../types/xsimd_batch_constant.hpp"
+
+#include <type_traits>
+
+namespace xsimd
+{
+    namespace kernel
+    {
+        using namespace types;
+
+        namespace detail
+        {
+            template <class A, class T, int Cmp> // lane-wise integer comparison returning a bitmask; callers pass an _MM_CMPINT_* predicate as Cmp
+            XSIMD_INLINE batch_bool<T, A> compare_int_avx512vl_128(batch<T, A> const& self, batch<T, A> const& other) noexcept
+            {
+                using register_type = typename batch_bool<T, A>::register_type;
+                if (std::is_signed<T>::value) // signed element types go through the epi32/epi64 compares
+                {
+                    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                    {
+                        // each byte of a 32-bit lane is isolated and shifted into the top byte so the signed 32-bit compare sees its sign bit
+                        uint64_t mask_low0 = _mm_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x000000FF)) << 24,
+                                                                (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x000000FF)) << 24,
+                                                                Cmp);
+                        uint64_t mask_low1 = _mm_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x0000FF00)) << 16,
+                                                                (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x0000FF00)) << 16,
+                                                                Cmp);
+                        uint64_t mask_high0 = _mm_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x00FF0000)) << 8,
+                                                                 (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x00FF0000)) << 8,
+                                                                 Cmp);
+                        uint64_t mask_high1 = _mm_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0xFF000000)),
+                                                                 (batch<int32_t, A>(other.data) & batch<int32_t, A>(0xFF000000)),
+                                                                 Cmp);
+                        uint64_t mask = 0;
+                        for (unsigned i = 0; i < 8; ++i) // bit i of the k-th partial mask lands on bit 4 * i + k of the result
+                        {
+                            mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0);
+                            mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1);
+                            mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2);
+                            mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3);
+                        }
+                        return (register_type)mask;
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                    {
+                        // the low 16-bit half of each 32-bit lane is shifted into the top half so the signed 32-bit compare sees its sign bit
+                        uint16_t mask_low = _mm_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x0000FFFF)) << 16,
+                                                               (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x0000FFFF)) << 16,
+                                                               Cmp);
+                        uint16_t mask_high = _mm_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0xFFFF0000)),
+                                                                (batch<int32_t, A>(other.data) & batch<int32_t, A>(0xFFFF0000)),
+                                                                Cmp);
+                        return static_cast<register_type>(morton(mask_low, mask_high)); // presumably interleaves both masks bit by bit - confirm against morton()
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                    {
+                        return (register_type)_mm_cmp_epi32_mask(self, other, Cmp);
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                    {
+                        return (register_type)_mm_cmp_epi64_mask(self, other, Cmp);
+                    }
+                }
+                else // unsigned element types: the masked sub-lanes are compared in place, no shift is applied
+                {
+                    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                    {
+                        uint64_t mask_low0 = _mm_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x000000FF)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x000000FF)), Cmp);
+                        uint64_t mask_low1 = _mm_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x0000FF00)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x0000FF00)), Cmp);
+                        uint64_t mask_high0 = _mm_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x00FF0000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x00FF0000)), Cmp);
+                        uint64_t mask_high1 = _mm_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0xFF000000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0xFF000000)), Cmp);
+                        uint64_t mask = 0;
+                        for (unsigned i = 0; i < 8; ++i) // same interleaving as the signed case: bit i of the k-th partial mask goes to bit 4 * i + k
+                        {
+                            mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0);
+                            mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1);
+                            mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2);
+                            mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3);
+                        }
+                        return (register_type)mask;
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                    {
+                        uint16_t mask_low = _mm_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x0000FFFF)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x0000FFFF)), Cmp);
+                        uint16_t mask_high = _mm_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0xFFFF0000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0xFFFF0000)), Cmp);
+                        return static_cast<register_type>(morton(mask_low, mask_high)); // presumably interleaves both masks bit by bit - confirm against morton()
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                    {
+                        return (register_type)_mm_cmp_epu32_mask(self, other, Cmp);
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                    {
+                        return (register_type)_mm_cmp_epu64_mask(self, other, Cmp);
+                    }
+                }
+            }
+        }
+
+        // load mask: pack an array of bool into the bitmask register, one bit per lane, a few bools at a time
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx512vl_128>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            constexpr auto lanes = batch_bool<T, A>::size;
+            constexpr auto stride = lanes >= 8 ? 8 : (lanes >= 4 ? 4 : 2); // number of bools packed per step
+            constexpr auto steps = lanes / stride;
+            static_assert((lanes % stride) == 0, "incorrect size of bool batch");
+            mask_type packed = 0;
+            for (std::size_t step = 0; step < steps; ++step)
+            {
+                unsigned char bits = detail::tobitset<stride>((unsigned char*)mem + step * stride);
+                packed |= (mask_type(bits) << (step * stride));
+            }
+            return packed;
+        }
+
+        // from_bool: numeric batch holding 1 in the lanes where the boolean batch is set and 0 elsewhere
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<avx512vl_128>) noexcept
+        {
+            return select(self, batch<T, A>(1), batch<T, A>(0));
+        }
+
+        // from_mask: build a batch_bool from an integer bitmask; bits at or above the lane count are asserted clear, then masked off
+        template <class T, class A>
+        XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx512vl_128>) noexcept
+        {
+            assert(mask == (mask & ((uint64_t(1) << batch_bool<T, A>::size) - 1)) && "inbound mask");
+            return static_cast<typename batch_bool<T, A>::register_type>(mask & ((uint64_t(1) << batch_bool<T, A>::size) - 1));
+        }
+
+        // mask: return the bitmask register as an integer, keeping only the low batch_bool<T, A>::size bits
+        template <class A, class T>
+        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx512vl_128>) noexcept
+        {
+            return self.data & ((uint64_t(1) << batch_bool<T, A>::size) - 1);
+        }
+
+        // batch_bool_cast: the bitmask register is handed unchanged to the destination batch_bool type
+        template <class A, class T_out, class T_in>
+        XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx512vl_128>) noexcept
+        {
+            return self.data;
+        }
+
+        // set: build the bitmask from one value per lane; the first argument drives bit 0, the next one bit 1, and so on
+        template <class A, class T, class... Values>
+        XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx512vl_128>, Values... values) noexcept
+        {
+            static_assert(sizeof...(Values) == batch_bool<T, A>::size, "consistent init");
+            using mask_type = typename batch_bool<T, A>::register_type;
+            mask_type bits = 0;
+            unsigned lane = 0;
+            (void)std::initializer_list<mask_type> { (bits |= mask_type(values ? 1 : 0) << (lane++))... }; // braced list keeps left-to-right evaluation
+            return bits;
+        }
+
+        // store: expand the bitmask into an array of bool, element i taken from bit i
+        template <class T, class A>
+        XSIMD_INLINE void store(batch_bool<T, A> const& self, bool* mem, requires_arch<avx512vl_128>) noexcept
+        {
+            constexpr auto lanes = batch_bool<T, A>::size;
+            for (std::size_t lane = 0; lane != lanes; ++lane)
+                mem[lane] = ((self.data >> lane) & 0x1) != 0;
+        }
+
+        // abs: lane-wise absolute value of 64-bit signed integers through _mm_abs_epi64
+        template <class A>
+        XSIMD_INLINE batch<int64_t, A> abs(batch<int64_t, A> const& self, requires_arch<avx512vl_128>) noexcept
+        {
+            return _mm_abs_epi64(self);
+        }
+
+        // load masked: load the 32-bit lanes selected by the compile-time mask; the other lanes come from the all-zero source vector
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<avx512vl_128>) noexcept
+        {
+            constexpr auto imm_mask = mask.mask();
+            return _mm_mask_loadu_epi32(_mm_setzero_si128(), imm_mask, mem); // unaligned form, used whatever Mode is
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...> mask, convert<uint32_t>, Mode, requires_arch<avx512vl_128>) noexcept
+        {
+            constexpr auto imm_mask = mask.mask();
+            return _mm_mask_loadu_epi32(_mm_setzero_si128(), imm_mask, mem); // unaligned form, used whatever Mode is
+        }
+
+        // store masked: write only the lanes selected by the compile-time mask; the mask is typed on the batch element type, like load_masked
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...> mask, Mode, requires_arch<avx512vl_128>) noexcept
+        {
+            _mm_mask_storeu_epi32(mem, mask.mask(), src); // unaligned form, used whatever Mode is
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...> mask, Mode, requires_arch<avx512vl_128>) noexcept
+        {
+            _mm_mask_storeu_epi32(mem, mask.mask(), src);
+        }
+
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...> mask, Mode, requires_arch<avx512vl_128>) noexcept
+        {
+            _mm_mask_storeu_epi64(mem, mask.mask(), src);
+        }
+
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...> mask, Mode, requires_arch<avx512vl_128>) noexcept
+        {
+            _mm_mask_storeu_epi64(mem, mask.mask(), src);
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(float* mem, batch<float, A> const& src, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<avx512vl_128>) noexcept
+        {
+            _mm_mask_storeu_ps(mem, mask.mask(), src);
+        }
+
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(double* mem, batch<double, A> const& src, batch_bool_constant<double, A, Values...> mask, Mode, requires_arch<avx512vl_128>) noexcept
+        {
+            _mm_mask_storeu_pd(mem, mask.mask(), src);
+        }
+
+        // max: lane-wise maximum of 64-bit integers, with one overload for signed and one for unsigned lanes
+        template <class A>
+        XSIMD_INLINE batch<int64_t, A> max(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return _mm_max_epi64(self, other); // signed comparison
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint64_t, A> max(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return _mm_max_epu64(self, other); // unsigned comparison
+        }
+
+        // min: lane-wise minimum of 64-bit integers, with one overload for signed and one for unsigned lanes
+        template <class A>
+        XSIMD_INLINE batch<int64_t, A> min(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return _mm_min_epi64(self, other); // signed comparison
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint64_t, A> min(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return _mm_min_epu64(self, other); // unsigned comparison
+        }
+
+        // insert: overwrite lane I with val by a masked broadcast whose mask has the single bit I set; other lanes keep self's value
+        template <class A, size_t I>
+        XSIMD_INLINE batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<avx512vl_128>) noexcept
+        {
+            // the float bits are reinterpreted as an integer so the masked integer broadcast can write them
+            int32_t tmp = bit_cast<int32_t>(val);
+            return _mm_castsi128_ps(_mm_mask_set1_epi32(_mm_castps_si128(self), __mmask8(1 << (I & 7)), tmp));
+        }
+
+        template <class A, size_t I>
+        XSIMD_INLINE batch<double, A> insert(batch<double, A> const& self, double val, index<I>, requires_arch<avx512vl_128>) noexcept
+        {
+            int64_t tmp = bit_cast<int64_t>(val); // same reinterpretation trick, on 64 bits
+            return _mm_castsi128_pd(_mm_mask_set1_epi64(_mm_castpd_si128(self), __mmask8(1 << (I & 3)), tmp));
+        }
+
+        template <class A, class T, size_t I, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx512vl_128>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_mask_set1_epi32(self, __mmask8(1 << (I & 7)), val);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm_mask_set1_epi64(self, __mmask8(1 << (I & 3)), val);
+            }
+            else // 8-bit and 16-bit lanes are delegated to the common implementation
+            {
+                return insert(self, val, pos, common {});
+            }
+        }
+
+        // isnan: each lane is compared with itself using the unordered predicate _CMP_UNORD_Q, yielding a bitmask
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx512vl_128>) noexcept
+        {
+            return (typename batch_bool<float, A>::register_type)_mm_cmp_ps_mask(self, self, _CMP_UNORD_Q);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx512vl_128>) noexcept
+        {
+            return (typename batch_bool<double, A>::register_type)_mm_cmp_pd_mask(self, self, _CMP_UNORD_Q);
+        }
+
+        // rotl: rotate lanes left; 32-bit and 64-bit lanes use the rol intrinsics, narrower lanes are forwarded to avx2_128
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_rolv_epi32(self, other); // per-lane rotation counts taken from other
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm_rolv_epi64(self, other);
+            }
+            else
+            {
+                return rotl(self, other, avx2_128 {});
+            }
+        }
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, int32_t other, requires_arch<avx512vl_128>) noexcept
+        {
+            return rotl(self, batch<T, A>(other), A {}); // scalar count: broadcast it and reuse the batch overload
+        }
+        template <size_t count, class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, requires_arch<avx512vl_128>) noexcept
+        {
+            constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed; // digits excludes the sign bit, so add it back
+            static_assert(count < bits, "Count must be less than the number of bits in T");
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_rol_epi32(self, count); // compile-time count passed as the immediate operand
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm_rol_epi64(self, count);
+            }
+            else
+            {
+                return rotl<count>(self, avx2_128 {});
+            }
+        }
+
+        // rotr: rotate lanes right; only unsigned 32-bit and 64-bit lanes use the ror intrinsics, everything else is forwarded to avx2_128
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(std::is_unsigned<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm_rorv_epi32(self, other); // per-lane rotation counts taken from other
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm_rorv_epi64(self, other);
+                }
+            }
+            return rotr(self, other, avx2_128 {}); // signed types and 8/16-bit lanes end up here
+        }
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, int32_t other, requires_arch<avx512vl_128>) noexcept
+        {
+            return rotr(self, batch<T, A>(other), A {}); // scalar count: broadcast it and reuse the batch overload
+        }
+
+        template <size_t count, class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, requires_arch<avx512vl_128>) noexcept
+        {
+            constexpr auto bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed; // digits excludes the sign bit, so add it back
+            static_assert(count < bits, "Count must be less than the number of bits in T");
+            XSIMD_IF_CONSTEXPR(std::is_unsigned<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm_ror_epi32(self, count); // compile-time count passed as the immediate operand
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm_ror_epi64(self, count);
+                }
+            }
+            return rotr<count>(self, avx2_128 {}); // signed types and 8/16-bit lanes end up here
+        }
+
+        // all: true when every one of the low batch_bool<T, A>::size mask bits is set
+        template <class A, class T>
+        XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<avx512vl_128>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            constexpr mask_type lane_bits = (mask_type(1) << batch_bool<T, A>::size) - 1;
+            return lane_bits == (self.data & lane_bits);
+        }
+
+        // any: true when at least one of the low batch_bool<T, A>::size mask bits is set
+        template <class A, class T>
+        XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<avx512vl_128>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            constexpr mask_type lane_bits = (mask_type(1) << batch_bool<T, A>::size) - 1;
+            return 0 != (self.data & lane_bits);
+        }
+
+        // eq: lane-wise equality returning a bitmask; floating point uses the ordered predicate _CMP_EQ_OQ
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return (typename batch_bool<float, A>::register_type)_mm_cmp_ps_mask(self, other, _CMP_EQ_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return (typename batch_bool<double, A>::register_type)_mm_cmp_pd_mask(self, other, _CMP_EQ_OQ);
+        }
+
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return detail::compare_int_avx512vl_128<A, T, _MM_CMPINT_EQ>(self, other);
+        }
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            return register_type(~self.data ^ other.data); // two mask bits are equal exactly when their XNOR is 1
+        }
+
+        // neq: lane-wise inequality returning a bitmask; floating point uses the unordered predicate so a NaN operand compares not-equal
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return (typename batch_bool<float, A>::register_type)_mm_cmp_ps_mask(self, other, _CMP_NEQ_UQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return (typename batch_bool<double, A>::register_type)_mm_cmp_pd_mask(self, other, _CMP_NEQ_UQ);
+        }
+
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return (~(self == other)); // integers: complement of the equality mask
+        }
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            return register_type(self.data ^ other.data); // two mask bits differ exactly when their XOR is 1
+        }
+
+        // gt: lane-wise greater-than returning a bitmask; floating point uses the ordered predicate _CMP_GT_OQ
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return (typename batch_bool<float, A>::register_type)_mm_cmp_ps_mask(self, other, _CMP_GT_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return (typename batch_bool<double, A>::register_type)_mm_cmp_pd_mask(self, other, _CMP_GT_OQ);
+        }
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return detail::compare_int_avx512vl_128<A, T, _MM_CMPINT_GT>(self, other);
+        }
+
+        // ge: lane-wise greater-or-equal returning a bitmask; floating point uses the ordered predicate _CMP_GE_OQ
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return (typename batch_bool<float, A>::register_type)_mm_cmp_ps_mask(self, other, _CMP_GE_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return (typename batch_bool<double, A>::register_type)_mm_cmp_pd_mask(self, other, _CMP_GE_OQ);
+        }
+
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return detail::compare_int_avx512vl_128<A, T, _MM_CMPINT_GE>(self, other);
+        }
+
+        // lt: lane-wise less-than returning a bitmask; floating point uses the ordered predicate _CMP_LT_OQ
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return (typename batch_bool<float, A>::register_type)_mm_cmp_ps_mask(self, other, _CMP_LT_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return (typename batch_bool<double, A>::register_type)_mm_cmp_pd_mask(self, other, _CMP_LT_OQ);
+        }
+
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return detail::compare_int_avx512vl_128<A, T, _MM_CMPINT_LT>(self, other);
+        }
+
+        // le: lane-wise less-or-equal returning a bitmask; floating point uses the ordered predicate _CMP_LE_OQ
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return (typename batch_bool<float, A>::register_type)_mm_cmp_ps_mask(self, other, _CMP_LE_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return (typename batch_bool<double, A>::register_type)_mm_cmp_pd_mask(self, other, _CMP_LE_OQ);
+        }
+
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            return detail::compare_int_avx512vl_128<A, T, _MM_CMPINT_LE>(self, other);
+        }
+
+        // select: per-lane choice, true_br where the bit of cond is set and false_br elsewhere.
+        // The blend intrinsics pick their second vector operand for set bits, hence the (false_br, true_br) order.
+        template <class A>
+        XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx512vl_128>) noexcept
+        {
+            return _mm_mask_blend_ps(cond, false_br, true_br);
+        }
+        template <class A>
+        XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx512vl_128>) noexcept
+        {
+            return _mm_mask_blend_pd(cond, false_br, true_br);
+        }
+        // integral lanes: dispatch on the element width
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512vl_128>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) <= 2)
+            {
+                // 8- and 16-bit lanes: rebuild an avx2_128 vector mask from the bit mask, then blend byte-wise
+                batch_bool<T, avx2_128> vector_cond = batch_bool<T, avx2_128>::from_mask(cond.mask());
+                return _mm_blendv_epi8(false_br, true_br, vector_cond);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm_mask_blend_epi32(cond, false_br, true_br);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm_mask_blend_epi64(cond, false_br, true_br);
+            }
+        }
+        // compile-time mask: materialize the constant as a runtime batch_bool and reuse the overloads above
+        template <class A, class T, bool... Values>
+        XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512vl_128>) noexcept
+        {
+            const batch_bool<T, A> runtime_cond { Values... };
+            return select(runtime_cond, true_br, false_br, avx512vl_128 {});
+        }
+
+        // reciprocal: approximate 1/x per lane with the rcp14 intrinsics
+        // NOTE: this is an estimate, not an exact division (Intel documents a relative error below 2^-14)
+        template <class A>
+        XSIMD_INLINE batch<float, A> reciprocal(batch<float, A> const& self, kernel::requires_arch<avx512vl_128>) noexcept
+        {
+            const __m128 approx = _mm_rcp14_ps(self);
+            return approx;
+        }
+
+        // same estimate for packed doubles
+        template <class A>
+        XSIMD_INLINE batch<double, A> reciprocal(batch<double, A> const& self, kernel::requires_arch<avx512vl_128>) noexcept
+        {
+            const __m128d approx = _mm_rcp14_pd(self);
+            return approx;
+        }
+
+        // bitwise_and: lane-wise AND of two masks, computed directly on the mask registers
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            return static_cast<mask_type>(self.data & other.data);
+        }
+
+        // bitwise_andnot: lanes set in self and clear in other (self AND NOT other)
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            return static_cast<mask_type>(self.data & ~other.data);
+        }
+
+        // bitwise_not: complement of a mask; every bit of the register is flipped, not only the lane bits
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx512vl_128>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            return static_cast<mask_type>(~self.data);
+        }
+
+        // bitwise_or: lane-wise OR of two masks, computed directly on the mask registers
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            return static_cast<mask_type>(self.data | other.data);
+        }
+
+        // bitwise_xor: lane-wise XOR of two masks, set where exactly one of the inputs is set
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            return static_cast<mask_type>(self.data ^ other.data);
+        }
+
+        // sadd: saturating addition, the result is clamped to the range of T instead of wrapping around
+        // (std::is_signed<T>::value is a compile-time constant, so each instantiation only ever takes one path)
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_128>) noexcept
+        {
+            if (!std::is_signed<T>::value)
+            {
+                // unsigned: add no more than the headroom left above self
+                const auto headroom = std::numeric_limits<T>::max() - self;
+                const auto capped = min(headroom, other);
+                return self + capped;
+            }
+            // signed: clamp self so that adding other can neither overflow (other >= 0) nor underflow (other < 0)
+            const auto other_is_negative = other < 0;
+            const auto clamped_from_above = min(std::numeric_limits<T>::max() - other, self);
+            const auto clamped_from_below = max(std::numeric_limits<T>::min() - other, self);
+            return other + select(other_is_negative, clamped_from_below, clamped_from_above);
+        }
+
+    }
+}
+
+#endif
diff --git a/include/xsimd/arch/xsimd_avx512vl_256.hpp b/include/xsimd/arch/xsimd_avx512vl_256.hpp
new file mode 100644
index 000000000..a5ea546bc
--- /dev/null
+++ b/include/xsimd/arch/xsimd_avx512vl_256.hpp
@@ -0,0 +1,729 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ * Copyright (c) Marco Barbone                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512VL_256_HPP
+#define XSIMD_AVX512VL_256_HPP
+
+#include "../types/xsimd_avx512vl_register.hpp"
+#include "../types/xsimd_batch_constant.hpp"
+
+#include <type_traits>
+
+namespace xsimd
+{
+    namespace kernel
+    {
+        using namespace types;
+
+        namespace detail
+        {
+            template <class A, class T, int Cmp> // integer compare of two 256-bit batches, returned as one mask bit per lane
+            XSIMD_INLINE batch_bool<T, A> compare_int_avx512vl_256(batch<T, A> const& self, batch<T, A> const& other) noexcept
+            { // Cmp is forwarded unchanged to every mask-compare intrinsic below (callers pass _MM_CMPINT_* values)
+                using register_type = typename batch_bool<T, A>::register_type;
+                if (std::is_signed<T>::value) // signed lanes use the epi32/epi64 compares, unsigned lanes the epu32/epu64 ones
+                {
+                    XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+                    {
+                        // 8-bit lanes: compare the k-th byte of every 32-bit group, moved to the top byte so its sign bit drives the signed compare
+                        uint64_t mask_low0 = _mm256_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x000000FF)) << 24,
+                                                                   (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x000000FF)) << 24,
+                                                                   Cmp);
+                        uint64_t mask_low1 = _mm256_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x0000FF00)) << 16,
+                                                                   (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x0000FF00)) << 16,
+                                                                   Cmp);
+                        uint64_t mask_high0 = _mm256_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x00FF0000)) << 8,
+                                                                    (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x00FF0000)) << 8,
+                                                                    Cmp);
+                        uint64_t mask_high1 = _mm256_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0xFF000000)),
+                                                                    (batch<int32_t, A>(other.data) & batch<int32_t, A>(0xFF000000)),
+                                                                    Cmp);
+                        uint64_t mask = 0; // the four 8-bit partial masks are interleaved back into lane order
+                        for (unsigned i = 0; i < 8; ++i) // bit i of the k-th partial mask lands on bit 4 * i + k
+                        {
+                            mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0);
+                            mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1);
+                            mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2);
+                            mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3);
+                        }
+                        return (register_type)mask;
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                    {
+                        // 16-bit lanes: same idea, the low half of every 32-bit group is moved to the top half before the signed compare
+                        uint16_t mask_low = _mm256_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x0000FFFF)) << 16,
+                                                                  (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x0000FFFF)) << 16,
+                                                                  Cmp);
+                        uint16_t mask_high = _mm256_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0xFFFF0000)),
+                                                                   (batch<int32_t, A>(other.data) & batch<int32_t, A>(0xFFFF0000)),
+                                                                   Cmp);
+                        return static_cast<register_type>(morton(mask_low, mask_high)); // morton presumably interleaves both masks bit by bit — TODO confirm
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                    {
+                        return (register_type)_mm256_cmp_epi32_mask(self, other, Cmp);
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                    {
+                        return (register_type)_mm256_cmp_epi64_mask(self, other, Cmp);
+                    }
+                }
+                else
+                {
+                    XSIMD_IF_CONSTEXPR(sizeof(T) == 1) // unsigned: each byte is isolated in place, no shift is applied
+                    {
+                        uint64_t mask_low0 = _mm256_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x000000FF)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x000000FF)), Cmp);
+                        uint64_t mask_low1 = _mm256_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x0000FF00)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x0000FF00)), Cmp);
+                        uint64_t mask_high0 = _mm256_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x00FF0000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x00FF0000)), Cmp);
+                        uint64_t mask_high1 = _mm256_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0xFF000000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0xFF000000)), Cmp);
+                        uint64_t mask = 0;
+                        for (unsigned i = 0; i < 8; ++i) // same interleaving as the signed case: bit i of partial mask k goes to bit 4 * i + k
+                        {
+                            mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0);
+                            mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1);
+                            mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2);
+                            mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3);
+                        }
+                        return (register_type)mask;
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                    {
+                        uint16_t mask_low = _mm256_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x0000FFFF)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x0000FFFF)), Cmp);
+                        uint16_t mask_high = _mm256_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0xFFFF0000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0xFFFF0000)), Cmp);
+                        return static_cast<register_type>(morton(mask_low, mask_high));
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                    {
+                        return (register_type)_mm256_cmp_epu32_mask(self, other, Cmp);
+                    }
+                    else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                    {
+                        return (register_type)_mm256_cmp_epu64_mask(self, other, Cmp);
+                    }
+                }
+            }
+        }
+
+        // load mask: pack `size` consecutive bools read from mem into a bit mask, one chunk of bools at a time
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<avx512vl_256>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            constexpr auto lanes = batch_bool<T, A>::size;
+            constexpr auto step = lanes >= 8 ? 8 : 4; // bools handed to each tobitset call
+            static_assert((lanes % step) == 0, "incorrect size of bool batch");
+            mask_type packed = 0;
+            for (std::size_t offset = 0; offset < lanes; offset += step)
+            {
+                // every chunk lands at the bit position of its first bool
+                const unsigned char bits = detail::tobitset<step>((unsigned char*)mem + offset);
+                packed |= (mask_type(bits) << offset);
+            }
+            return packed;
+        }
+
+        // from bool: convert a mask to a numeric batch, 1 in the selected lanes and 0 in the others
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<avx512vl_256>) noexcept
+        {
+            return select(self, batch<T, A>(1), batch<T, A>(0)); // broadcast 1 and 0, then let the mask choose per lane
+        }
+
+        // from_mask: build a batch_bool from an integer whose bit i is the value of lane i
+        template <class T, class A>
+        XSIMD_INLINE batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx512vl_256>) noexcept
+        {
+            assert(mask == (mask & ((uint64_t(1) << batch_bool<T, A>::size) - 1)) && "inbound mask"); // no bit at or above `size` may be set
+            return static_cast<typename batch_bool<T, A>::register_type>(mask & ((uint64_t(1) << batch_bool<T, A>::size) - 1)); // such bits are dropped when asserts are disabled
+        }
+
+        // mask: export the batch_bool as an integer, bit i holding the value of lane i
+        template <class A, class T>
+        XSIMD_INLINE uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx512vl_256>) noexcept
+        {
+            return self.data & ((uint64_t(1) << batch_bool<T, A>::size) - 1); // bits at or above `size` are cleared
+        }
+
+        // batch_bool_cast: reuse the mask of a batch_bool<T_in> for element type T_out on the same arch
+        template <class A, class T_out, class T_in>
+        XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx512vl_256>) noexcept
+        {
+            return self.data; // the raw mask is handed over as-is; any width change is left to the implicit conversion on return
+        }
+
+        // set: build a mask from one boolean per lane, the first value ending up in bit 0
+        template <class A, class T, class... Values>
+        XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx512vl_256>, Values... values) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            static_assert(sizeof...(Values) == batch_bool<T, A>::size, "consistent init");
+            mask_type bits = 0;
+            unsigned lane = 0;
+            (void)std::initializer_list<mask_type> { (bits |= mask_type(values ? 1 : 0) << (lane++))... }; // braced list: evaluated left to right
+            return bits;
+        }
+
+        // store: expand the mask into `size` bools written to mem, lane i being read from bit i
+        template <class T, class A>
+        XSIMD_INLINE void store(batch_bool<T, A> const& self, bool* mem, requires_arch<avx512vl_256>) noexcept
+        {
+            constexpr auto lanes = batch_bool<T, A>::size;
+            for (std::size_t lane = 0; lane != lanes; ++lane)
+                mem[lane] = ((self.data >> lane) & 0x1) != 0;
+        }
+
+        // abs: lane-wise absolute value of signed 64-bit integers
+        template <class A>
+        XSIMD_INLINE batch<int64_t, A> abs(batch<int64_t, A> const& self, requires_arch<avx512vl_256>) noexcept
+        {
+            return _mm256_abs_epi64(self); // only the int64_t case is specialized at this arch level
+        }
+
+        // load masked: load only the 32-bit lanes selected by the compile-time mask; the other lanes come from a zero vector
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<int32_t, A> load_masked(int32_t const* mem, batch_bool_constant<int32_t, A, Values...> mask, convert<int32_t>, Mode, requires_arch<avx512vl_256>) noexcept
+        {
+            constexpr auto imm_mask = mask.mask(); // integer form of the compile-time mask
+            return _mm256_mask_loadu_epi32(_mm256_setzero_si256(), imm_mask, mem); // first operand supplies the unselected lanes
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<uint32_t, A> load_masked(uint32_t const* mem, batch_bool_constant<uint32_t, A, Values...> mask, convert<uint32_t>, Mode, requires_arch<avx512vl_256>) noexcept
+        {
+            constexpr auto imm_mask = mask.mask();
+            return _mm256_mask_loadu_epi32(_mm256_setzero_si256(), imm_mask, mem); // Mode is unused: the unaligned load is issued in every case
+        }
+
+        // store masked: write only the lanes selected by the compile-time mask; the mask is typed on the batch element type, as in load_masked
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(uint32_t* mem, batch<uint32_t, A> const& src, batch_bool_constant<uint32_t, A, Values...> mask, Mode, requires_arch<avx512vl_256>) noexcept
+        {
+            _mm256_mask_storeu_epi32(mem, mask.mask(), src);
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(int32_t* mem, batch<int32_t, A> const& src, batch_bool_constant<int32_t, A, Values...> mask, Mode, requires_arch<avx512vl_256>) noexcept
+        {
+            _mm256_mask_storeu_epi32(mem, mask.mask(), src);
+        }
+
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(uint64_t* mem, batch<uint64_t, A> const& src, batch_bool_constant<uint64_t, A, Values...> mask, Mode, requires_arch<avx512vl_256>) noexcept
+        {
+            _mm256_mask_storeu_epi64(mem, mask.mask(), src);
+        }
+
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(int64_t* mem, batch<int64_t, A> const& src, batch_bool_constant<int64_t, A, Values...> mask, Mode, requires_arch<avx512vl_256>) noexcept
+        {
+            _mm256_mask_storeu_epi64(mem, mask.mask(), src);
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(float* mem, batch<float, A> const& src, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<avx512vl_256>) noexcept
+        {
+            _mm256_mask_storeu_ps(mem, mask.mask(), src);
+        }
+
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(double* mem, batch<double, A> const& src, batch_bool_constant<double, A, Values...> mask, Mode, requires_arch<avx512vl_256>) noexcept
+        {
+            _mm256_mask_storeu_pd(mem, mask.mask(), src);
+        }
+
+        // max: lane-wise maximum of 64-bit integers; only the int64_t and uint64_t cases are specialized here
+        template <class A>
+        XSIMD_INLINE batch<int64_t, A> max(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return _mm256_max_epi64(self, other); // signed comparison
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint64_t, A> max(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return _mm256_max_epu64(self, other); // unsigned comparison
+        }
+
+        // min: lane-wise minimum of 64-bit integers; only the int64_t and uint64_t cases are specialized here
+        template <class A>
+        XSIMD_INLINE batch<int64_t, A> min(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return _mm256_min_epi64(self, other); // signed comparison
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint64_t, A> min(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return _mm256_min_epu64(self, other); // unsigned comparison
+        }
+
+        // swizzle (dynamic version): reorder the lanes of self using indices held in a run-time batch
+        template <class A>
+        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512vl_256>) noexcept
+        {
+            return _mm256_permutexvar_ps(mask, self); // the intrinsic takes the index vector first
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512vl_256>) noexcept
+        {
+            return _mm256_permutexvar_pd(mask, self);
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512vl_256>) noexcept
+        {
+            return _mm256_permutexvar_epi64(mask, self);
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch<uint64_t, A> mask, requires_arch<avx512vl_256>) noexcept
+        {
+            return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx512vl_256 {})); // reuse the unsigned overload on the same bits
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512vl_256>) noexcept
+        {
+            return _mm256_permutexvar_epi32(mask, self);
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch<uint32_t, A> mask, requires_arch<avx512vl_256>) noexcept
+        {
+            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx512vl_256 {})); // reuse the unsigned overload on the same bits
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> mask, requires_arch<avx512vl_256>) noexcept
+        {
+            return swizzle(batch<uint8_t, avx2> { self.data }, batch<uint8_t, avx2> { mask.data }, avx2 {}).data; // 8-bit lanes: run the avx2 kernel on the same registers
+        }
+        template <class A, typename T, detail::enable_sized_t<T, 1> = 0> // 1-byte element types: reinterpret as uint8_t and forward
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint8_t, A> const& mask, requires_arch<avx512vl_256> req) noexcept
+        {
+            return bitwise_cast<T>(swizzle(bitwise_cast<uint8_t>(self), mask, req));
+        }
+        template <class A, typename T, detail::enable_sized_t<T, 2> = 0> // NOTE(review): no uint16_t overload for this arch is visible in this part of the file; if none exists, T = uint16_t would call itself — confirm
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<uint16_t, A> const& mask, requires_arch<avx512vl_256> req) noexcept
+        {
+            return bitwise_cast<T>(swizzle(bitwise_cast<uint16_t>(self), mask, req));
+        }
+
+        // swizzle: compile-time indices; 8-, 16- and 32-bit lanes forward to the fma3<avx2> kernels, 64-bit lanes use an immediate permute
+        template <class A, uint8_t... Vals, typename T, detail::enable_sized_t<T, 1> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint8_t, A, Vals...> const& mask, requires_arch<avx512vl_256>) noexcept
+        {
+            return swizzle(self, mask, fma3<avx2> {});
+        }
+        template <class A, uint16_t... Vals, typename T, detail::enable_sized_t<T, 2> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint16_t, A, Vals...> const& mask, requires_arch<avx512vl_256>) noexcept
+        {
+            return swizzle(self, mask, fma3<avx2> {});
+        }
+        template <class A, uint32_t... Vals, typename T, detail::enable_sized_t<T, 4> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint32_t, A, Vals...> const& mask, requires_arch<avx512vl_256>) noexcept
+        {
+            return swizzle(self, mask, fma3<avx2> {});
+        }
+
+        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx512vl_256>) noexcept
+        {
+            constexpr auto control = detail::mod_shuffle(V0, V1, V2, V3); // permute control built from the four indices
+            return _mm256_permutex_pd(self, control);
+        }
+        template <class A, typename T, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3, detail::enable_sized_t<T, 8> = 0>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx512vl_256>) noexcept
+        {
+            constexpr auto control = detail::mod_shuffle(V0, V1, V2, V3); // same control value, applied to 64-bit integer lanes
+            return _mm256_permutex_epi64(self, control);
+        }
+
+        // insert: replace lane I of self with val by broadcasting val under a single-bit write mask
+        template <class A, size_t I>
+        XSIMD_INLINE batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<avx512vl_256>) noexcept
+        {
+            // the bit pattern of val is broadcast through the integer domain, then the result is viewed as floats again
+            const int32_t val_bits = bit_cast<int32_t>(val);
+            return _mm256_castsi256_ps(_mm256_mask_set1_epi32(_mm256_castps_si256(self), __mmask8(1 << (I & 7)), val_bits));
+        }
+
+        template <class A, size_t I>
+        XSIMD_INLINE batch<double, A> insert(batch<double, A> const& self, double val, index<I>, requires_arch<avx512vl_256>) noexcept
+        {
+            const int64_t val_bits = bit_cast<int64_t>(val);
+            return _mm256_castsi256_pd(_mm256_mask_set1_epi64(_mm256_castpd_si256(self), __mmask8(1 << (I & 3)), val_bits));
+        }
+
+        template <class A, class T, size_t I, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx512vl_256>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_mask_set1_epi32(self, __mmask8(1 << (I & 7)), val); // 8 lanes: I is wrapped to 0..7
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_mask_set1_epi64(self, __mmask8(1 << (I & 3)), val); // 4 lanes: I is wrapped to 0..3
+            }
+            else
+            {
+                return insert(self, val, pos, common {}); // 8- and 16-bit lanes: defer to the common implementation
+            }
+        }
+
+        // isnan: a lane is NaN exactly when it compares unordered with itself
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx512vl_256>) noexcept
+        {
+            return static_cast<typename batch_bool<float, A>::register_type>(_mm256_cmp_ps_mask(self, self, _CMP_UNORD_Q));
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx512vl_256>) noexcept
+        {
+            return static_cast<typename batch_bool<double, A>::register_type>(_mm256_cmp_pd_mask(self, self, _CMP_UNORD_Q));
+        }
+
+        // rotl: rotate every lane left; 32- and 64-bit lanes use the native rotate intrinsics, narrower lanes defer to avx2
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_rolv_epi64(self, other);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_rolv_epi32(self, other);
+            }
+            else
+            {
+                return rotl(self, other, avx2 {});
+            }
+        }
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, int32_t other, requires_arch<avx512vl_256>) noexcept
+        {
+            return rotl(self, batch<T, A>(other), A {}); // broadcast the scalar count, then dispatch again on A
+        }
+        template <size_t count, class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> rotl(batch<T, A> const& self, requires_arch<avx512vl_256>) noexcept
+        {
+            constexpr auto lane_bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+            static_assert(count < lane_bits, "Count must be less than the number of bits in T");
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                return _mm256_rol_epi64(self, count);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return _mm256_rol_epi32(self, count);
+            }
+            else
+            {
+                return rotl<count>(self, avx2 {});
+            }
+        }
+
+        // rotr: rotate every lane right; unsigned 32- and 64-bit lanes use the native intrinsics, everything else defers to avx2
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(std::is_unsigned<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm256_rorv_epi64(self, other);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_rorv_epi32(self, other);
+                }
+            }
+            return rotr(self, other, avx2 {}); // signed types and 8/16-bit lanes end up here
+        }
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, int32_t other, requires_arch<avx512vl_256>) noexcept
+        {
+            return rotr(self, batch<T, A>(other), A {}); // broadcast the scalar count, then dispatch again on A
+        }
+
+        template <size_t count, class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> rotr(batch<T, A> const& self, requires_arch<avx512vl_256>) noexcept
+        {
+            constexpr auto lane_bits = std::numeric_limits<T>::digits + std::numeric_limits<T>::is_signed;
+            static_assert(count < lane_bits, "Count must be less than the number of bits in T");
+            XSIMD_IF_CONSTEXPR(std::is_unsigned<T>::value)
+            {
+                XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+                {
+                    return _mm256_ror_epi64(self, count);
+                }
+                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                {
+                    return _mm256_ror_epi32(self, count);
+                }
+            }
+            return rotr<count>(self, avx2 {}); // signed types and 8/16-bit lanes end up here
+        }
+
+        // all: true when every lane is set; only the low `size` bits of the register are looked at
+        template <class A, class T>
+        XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<avx512vl_256>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            constexpr mask_type lane_bits = (mask_type(1) << batch_bool<T, A>::size) - 1;
+            return (self.data & lane_bits) == lane_bits;
+        }
+
+        // any: true when at least one lane is set; only the low `size` bits of the register are looked at
+        template <class A, class T>
+        XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<avx512vl_256>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            constexpr mask_type lane_bits = (mask_type(1) << batch_bool<T, A>::size) - 1;
+            return (self.data & lane_bits) != 0;
+        }
+
+        // eq: lane-wise equality returned as a bit mask (floating-point lanes use the ordered, quiet predicate)
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return static_cast<typename batch_bool<float, A>::register_type>(_mm256_cmp_ps_mask(self, other, _CMP_EQ_OQ));
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return static_cast<typename batch_bool<double, A>::register_type>(_mm256_cmp_pd_mask(self, other, _CMP_EQ_OQ));
+        }
+        // integral lanes go through the shared integer comparison helper with the EQ predicate
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return detail::compare_int_avx512vl_256<A, T, _MM_CMPINT_EQ>(self, other);
+        }
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            return static_cast<mask_type>(~(self.data ^ other.data)); // XNOR: set where both masks agree
+        }
+
+        // neq: lane-wise inequality; _CMP_NEQ_UQ is the unordered predicate, so a lane holding NaN compares as different
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return (typename batch_bool<float, A>::register_type)_mm256_cmp_ps_mask(self, other, _CMP_NEQ_UQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return (typename batch_bool<double, A>::register_type)_mm256_cmp_pd_mask(self, other, _CMP_NEQ_UQ);
+        }
+
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return (~(self == other));
+        }
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            using register_type = typename batch_bool<T, A>::register_type;
+            return register_type(self.data ^ other.data);
+        }
+
+        // gt: lane-wise self > other, returned as a mask register (floats use _CMP_GT_OQ, integers _MM_CMPINT_GT)
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return static_cast<typename batch_bool<float, A>::register_type>(_mm256_cmp_ps_mask(self, other, _CMP_GT_OQ));
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return static_cast<typename batch_bool<double, A>::register_type>(_mm256_cmp_pd_mask(self, other, _CMP_GT_OQ));
+        }
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return detail::compare_int_avx512vl_256<A, T, _MM_CMPINT_GT>(self, other);
+        }
+
+        // ge: lane-wise self >= other, returned as a mask register (floats use _CMP_GE_OQ, integers _MM_CMPINT_GE)
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return static_cast<typename batch_bool<float, A>::register_type>(_mm256_cmp_ps_mask(self, other, _CMP_GE_OQ));
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return static_cast<typename batch_bool<double, A>::register_type>(_mm256_cmp_pd_mask(self, other, _CMP_GE_OQ));
+        }
+
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return detail::compare_int_avx512vl_256<A, T, _MM_CMPINT_GE>(self, other);
+        }
+
+        // lt: lane-wise self < other, returned as a mask register (floats use _CMP_LT_OQ, integers _MM_CMPINT_LT)
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return static_cast<typename batch_bool<float, A>::register_type>(_mm256_cmp_ps_mask(self, other, _CMP_LT_OQ));
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return static_cast<typename batch_bool<double, A>::register_type>(_mm256_cmp_pd_mask(self, other, _CMP_LT_OQ));
+        }
+
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return detail::compare_int_avx512vl_256<A, T, _MM_CMPINT_LT>(self, other);
+        }
+
+        // le: lane-wise self <= other, returned as a mask register (floats use _CMP_LE_OQ, integers _MM_CMPINT_LE)
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return static_cast<typename batch_bool<float, A>::register_type>(_mm256_cmp_ps_mask(self, other, _CMP_LE_OQ));
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return static_cast<typename batch_bool<double, A>::register_type>(_mm256_cmp_pd_mask(self, other, _CMP_LE_OQ));
+        }
+
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            return detail::compare_int_avx512vl_256<A, T, _MM_CMPINT_LE>(self, other);
+        }
+
+        // select: per lane, take true_br where cond is set and false_br elsewhere
+        template <class A>
+        XSIMD_INLINE batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx512vl_256>) noexcept
+        {
+            return _mm256_mask_blend_ps(cond, false_br, true_br);
+        }
+        template <class A>
+        XSIMD_INLINE batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx512vl_256>) noexcept
+        {
+            return _mm256_mask_blend_pd(cond, false_br, true_br);
+        }
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512vl_256>) noexcept
+        {
+            // Dispatch on the lane width of T.
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
+            {
+                // 8- and 16-bit lanes: rebuild the condition as an avx2 batch_bool
+                // from the bit mask, then blend byte-wise with the avx2 intrinsic.
+                batch_bool<T, avx2> wide_cond = batch_bool<T, avx2>::from_mask(cond.mask());
+                return _mm256_blendv_epi8(false_br, true_br, wide_cond);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                // 32-bit lanes: the mask register drives the blend directly.
+                return _mm256_mask_blend_epi32(cond, false_br, true_br);
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+                // 64-bit lanes: the mask register drives the blend directly.
+                return _mm256_mask_blend_epi64(cond, false_br, true_br);
+            }
+        }
+        template <class A, class T, bool... Values>
+        XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512vl_256>) noexcept
+        {
+            return select(batch_bool<T, A> { Values... }, true_br, false_br, avx512vl_256 {});
+        }
+
+        // reciprocal: forwards to the approximate-reciprocal intrinsics _mm256_rcp14_ps / _mm256_rcp14_pd
+        template <class A>
+        XSIMD_INLINE batch<float, A>
+        reciprocal(batch<float, A> const& self,
+                   kernel::requires_arch<avx512vl_256>) noexcept
+        {
+            return _mm256_rcp14_ps(self);
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<double, A>
+        reciprocal(batch<double, A> const& self,
+                   kernel::requires_arch<avx512vl_256>) noexcept
+        {
+            return _mm256_rcp14_pd(self);
+        }
+
+        // bitwise_and: AND of the two mask registers
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            return mask_type(self.data & other.data);
+        }
+
+        // bitwise_andnot: bits set in self and cleared in other (self & ~other)
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            return mask_type(self.data & ~other.data);
+        }
+
+        // bitwise_not: complement of the mask register
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx512vl_256>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            return mask_type(~self.data);
+        }
+
+        // bitwise_or: OR of the two mask registers
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            return mask_type(self.data | other.data);
+        }
+
+        // bitwise_xor: XOR of the two mask registers
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            using mask_type = typename batch_bool<T, A>::register_type;
+            return mask_type(self.data ^ other.data);
+        }
+
+        // sadd: integer addition that clamps one operand against the type limits before adding
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512vl_256>) noexcept
+        {
+            if (std::is_signed<T>::value)
+            {
+                auto other_is_negative = other < 0;
+                auto clamped_from_above = min(std::numeric_limits<T>::max() - other, self);
+                auto clamped_from_below = max(std::numeric_limits<T>::min() - other, self);
+                return other + select(other_is_negative, clamped_from_below, clamped_from_above);
+            }
+            else
+            {
+                const auto headroom = std::numeric_limits<T>::max() - self;
+                const auto addend = min(headroom, other);
+                return self + addend;
+            }
+        }
+
+    }
+}
+
+#endif
diff --git a/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp b/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp
index b285623d0..c95069df1 100644
--- a/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp
+++ b/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp
@@ -12,9 +12,6 @@
 #ifndef XSIMD_AVX512VNNI_AVX512_BW_HPP
 #define XSIMD_AVX512VNNI_AVX512_BW_HPP
 
-#include <array>
-#include <type_traits>
-
 #include "../types/xsimd_avx512vnni_avx512bw_register.hpp"
 
 #endif
diff --git a/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp b/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp
index 0b4ffd2e4..552869d25 100644
--- a/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp
+++ b/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp
@@ -12,9 +12,6 @@
 #ifndef XSIMD_AVX512VNNI_AVX512VBMI2_HPP
 #define XSIMD_AVX512VNNI_AVX512VBMI2_HPP
 
-#include <array>
-#include <type_traits>
-
 #include "../types/xsimd_avx512vnni_avx512vbmi2_register.hpp"
 
 #endif
diff --git a/include/xsimd/arch/xsimd_avx_128.hpp b/include/xsimd/arch/xsimd_avx_128.hpp
new file mode 100644
index 000000000..46fc9acb7
--- /dev/null
+++ b/include/xsimd/arch/xsimd_avx_128.hpp
@@ -0,0 +1,164 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ * Copyright (c) Marco Barbone                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX_128_HPP
+#define XSIMD_AVX_128_HPP
+
+#include "../types/xsimd_avx_register.hpp"
+#include "../types/xsimd_batch_constant.hpp"
+
+#include <type_traits>
+
+namespace xsimd
+{
+    namespace kernel
+    {
+        using namespace types;
+
+        // broadcast: float-only overload that builds the batch from val with _mm_broadcast_ss
+        template <class A, class T, class = std::enable_if_t<std::is_same<T, float>::value>>
+        XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<avx_128>) noexcept
+        {
+            return _mm_broadcast_ss(&val);
+        }
+
+        // eq: lane-wise self == other through _mm_cmp_ps / _mm_cmp_pd with the _CMP_EQ_OQ predicate
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_EQ_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_EQ_OQ);
+        }
+
+        // gt: lane-wise self > other through _mm_cmp_ps / _mm_cmp_pd with the _CMP_GT_OQ predicate
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_GT_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_GT_OQ);
+        }
+
+        // ge: lane-wise self >= other through _mm_cmp_ps / _mm_cmp_pd with the _CMP_GE_OQ predicate
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_GE_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_GE_OQ);
+        }
+
+        // lt: lane-wise self < other through _mm_cmp_ps / _mm_cmp_pd with the _CMP_LT_OQ predicate
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_LT_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_LT_OQ);
+        }
+
+        // le: lane-wise self <= other through _mm_cmp_ps / _mm_cmp_pd with the _CMP_LE_OQ predicate
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_LE_OQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_LE_OQ);
+        }
+
+        // neq: lane-wise self != other through _mm_cmp_ps / _mm_cmp_pd with the unordered _CMP_NEQ_UQ predicate
+        template <class A>
+        XSIMD_INLINE batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_ps(self, other, _CMP_NEQ_UQ);
+        }
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx_128>) noexcept
+        {
+            return _mm_cmp_pd(self, other, _CMP_NEQ_UQ);
+        }
+
+        // load_masked: compile-time-masked load from mem via _mm_maskload_ps / _mm_maskload_pd; Mode is accepted but unused
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<float, A> load_masked(float const* mem, batch_bool_constant<float, A, Values...> mask, convert<float>, Mode, requires_arch<avx_128>) noexcept
+        {
+            return _mm_maskload_ps(mem, mask.as_batch());
+        }
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE batch<double, A> load_masked(double const* mem, batch_bool_constant<double, A, Values...> mask, convert<double>, Mode, requires_arch<avx_128>) noexcept
+        {
+            return _mm_maskload_pd(mem, mask.as_batch());
+        }
+
+        // store_masked: compile-time-masked store of src to mem via _mm_maskstore_ps / _mm_maskstore_pd; Mode is accepted but unused
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(float* mem, batch<float, A> const& src, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
+        {
+            _mm_maskstore_ps(mem, mask.as_batch(), src);
+        }
+
+        template <class A, bool... Values, class Mode>
+        XSIMD_INLINE void store_masked(double* mem, batch<double, A> const& src, batch_bool_constant<double, A, Values...> mask, Mode, requires_arch<avx_128>) noexcept
+        {
+            _mm_maskstore_pd(mem, mask.as_batch(), src);
+        }
+
+        // swizzle (dynamic mask): permute floating-point lanes using run-time indices of the same width as T
+        template <class A, class T, class ITy, class = std::enable_if_t<std::is_floating_point<T>::value && sizeof(T) == sizeof(ITy)>>
+        XSIMD_INLINE batch<T, A> swizzle(batch<T, A> const& self, batch<ITy, A> mask, requires_arch<avx_128>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(std::is_same<T, float>::value)
+            {
+                return _mm_permutevar_ps(self, mask);
+            }
+            else
+            {
+                // VPERMILPD's variable control reads bit 1 of each 64-bit selector
+                // (bit 0 is ignored). Negation keeps index 0 at 0 and turns index 1 into
+                // -1 (all bits set, hence bit 1 set): a cheap alternative to a left shift by 1.
+                return _mm_permutevar_pd(self, -mask);
+            }
+        }
+
+        // swizzle (constant mask): indices known at compile time are folded into the permute immediate by detail::mod_shuffle
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<avx_128>) noexcept
+        {
+            return _mm_permute_ps(self, detail::mod_shuffle(V0, V1, V2, V3));
+        }
+
+        template <class A, uint64_t V0, uint64_t V1>
+        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<avx_128>) noexcept
+        {
+            return _mm_permute_pd(self, detail::mod_shuffle(V0, V1));
+        }
+
+    }
+}
+
+#endif
diff --git a/include/xsimd/arch/xsimd_avxvnni.hpp b/include/xsimd/arch/xsimd_avxvnni.hpp
index a97ba9296..7c1fec96c 100644
--- a/include/xsimd/arch/xsimd_avxvnni.hpp
+++ b/include/xsimd/arch/xsimd_avxvnni.hpp
@@ -12,9 +12,6 @@
 #ifndef XSIMD_AVXVNNI_HPP
 #define XSIMD_AVXVNNI_HPP
 
-#include <array>
-#include <type_traits>
-
 #include "../types/xsimd_avxvnni_register.hpp"
 
 #endif
diff --git a/include/xsimd/arch/xsimd_common.hpp b/include/xsimd/arch/xsimd_common.hpp
index 11f21bd82..1d800e349 100644
--- a/include/xsimd/arch/xsimd_common.hpp
+++ b/include/xsimd/arch/xsimd_common.hpp
@@ -13,6 +13,7 @@
 #define XSIMD_COMMON_HPP
 
 #include "./common/xsimd_common_arithmetic.hpp"
+#include "./common/xsimd_common_bit.hpp"
 #include "./common/xsimd_common_cast.hpp"
 #include "./common/xsimd_common_complex.hpp"
 #include "./common/xsimd_common_logical.hpp"
diff --git a/include/xsimd/arch/xsimd_common_fwd.hpp b/include/xsimd/arch/xsimd_common_fwd.hpp
index 74bcd2351..f5a7f4ffe 100644
--- a/include/xsimd/arch/xsimd_common_fwd.hpp
+++ b/include/xsimd/arch/xsimd_common_fwd.hpp
@@ -15,6 +15,7 @@
 
 #include <cstdint>
 #include <type_traits>
+#include <utility>
 
 namespace xsimd
 {
@@ -58,6 +59,11 @@ namespace xsimd
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> mul_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE std::pair<batch<T, A>, batch<T, A>>
+        mul_hilo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<common>) noexcept;
@@ -101,6 +107,11 @@ namespace xsimd
         // Forward declarations for pack-level helpers
         namespace detail
         {
+            template <class T>
+            XSIMD_INLINE void reassociation_barrier(T& x, const char*) noexcept;
+            template <class T, class A>
+            XSIMD_INLINE void reassociation_barrier(batch<T, A>& b, const char* reason) noexcept;
+
             template <typename T, T... Vs>
             XSIMD_INLINE constexpr bool is_identity() noexcept;
             template <typename T, class A, T... Vs>
@@ -116,6 +127,14 @@ namespace xsimd
             template <typename T, class A, T... Vs>
             XSIMD_INLINE constexpr bool is_only_from_hi(batch_constant<T, A, Vs...>) noexcept;
 
+            template <class A, class WMul>
+            XSIMD_INLINE batch<uint64_t, A> mulhi_u64_core(batch<uint64_t, A> const& x,
+                                                           batch<uint64_t, A> const& y,
+                                                           WMul mul_epu32) noexcept;
+            template <class A, class WMul>
+            XSIMD_INLINE batch<int64_t, A> mulhi_i64_core(batch<int64_t, A> const& x,
+                                                          batch<int64_t, A> const& y,
+                                                          WMul mul_epu32) noexcept;
         }
     }
 }
diff --git a/include/xsimd/arch/xsimd_constants.hpp b/include/xsimd/arch/xsimd_constants.hpp
index 916cdf70d..00b719fc9 100644
--- a/include/xsimd/arch/xsimd_constants.hpp
+++ b/include/xsimd/arch/xsimd_constants.hpp
@@ -12,10 +12,10 @@
 #ifndef XSIMD_NUMERICAL_CONSTANT_HPP
 #define XSIMD_NUMERICAL_CONSTANT_HPP
 
-#include <limits>
-
 #include "../types/xsimd_utils.hpp"
 
+#include <limits>
+
 namespace xsimd
 {
 
diff --git a/include/xsimd/arch/xsimd_emulated.hpp b/include/xsimd/arch/xsimd_emulated.hpp
index 4437174a9..4426c543f 100644
--- a/include/xsimd/arch/xsimd_emulated.hpp
+++ b/include/xsimd/arch/xsimd_emulated.hpp
@@ -12,16 +12,14 @@
 #ifndef XSIMD_EMULATED_HPP
 #define XSIMD_EMULATED_HPP
 
-#include <complex>
-#include <limits>
-#include <numeric>
-#include <type_traits>
-
 #include "../arch/xsimd_scalar.hpp"
-
 #include "../types/xsimd_emulated_register.hpp"
 #include "../types/xsimd_utils.hpp"
 
+#include <complex>
+#include <numeric>
+#include <type_traits>
+
 namespace xsimd
 {
     template <typename T, class A, bool... Values>
@@ -508,7 +506,7 @@ namespace xsimd
             constexpr size_t size = batch<T, A>::size;
             uint64_t res = 0;
             for (size_t i = 0; i < size; ++i)
-                res |= (self.data[i] ? 1u : 0u) << i;
+                res |= (uint64_t)(self.data[i] ? 1u : 0u) << i;
             return res;
         }
 
diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp
index 1772159a0..cf88f64d7 100644
--- a/include/xsimd/arch/xsimd_isa.hpp
+++ b/include/xsimd/arch/xsimd_isa.hpp
@@ -13,7 +13,6 @@
 #define XSIMD_ISA_HPP
 
 #include "../config/xsimd_arch.hpp"
-
 #include "./xsimd_common_fwd.hpp"
 
 #if XSIMD_WITH_EMULATED
@@ -50,6 +49,7 @@
 
 #if XSIMD_WITH_AVX
 #include "./xsimd_avx.hpp"
+#include "./xsimd_avx_128.hpp"
 #endif
 
 #if XSIMD_WITH_FMA3_AVX
@@ -62,6 +62,7 @@
 
 #if XSIMD_WITH_AVX2
 #include "./xsimd_avx2.hpp"
+#include "./xsimd_avx2_128.hpp"
 #endif
 
 #if XSIMD_WITH_FMA3_AVX2
@@ -72,6 +73,12 @@
 #include "./xsimd_avx512f.hpp"
 #endif
 
+#if XSIMD_WITH_AVX512VL
+#include "./xsimd_avx512vl.hpp"
+#include "./xsimd_avx512vl_128.hpp"
+#include "./xsimd_avx512vl_256.hpp"
+#endif
+
 #if XSIMD_WITH_AVX512DQ
 #include "./xsimd_avx512dq.hpp"
 #endif
@@ -88,6 +95,10 @@
 #include "./xsimd_avx512pf.hpp"
 #endif
 
+#if XSIMD_WITH_AVX512VL
+#include "./xsimd_avx512pf.hpp"
+#endif
+
 #if XSIMD_WITH_AVX512IFMA
 #include "./xsimd_avx512ifma.hpp"
 #endif
@@ -136,6 +147,10 @@
 #include "./xsimd_vsx.hpp"
 #endif
 
+#if XSIMD_WITH_VXE
+#include "./xsimd_vxe.hpp"
+#endif
+
 // Must come last to have access to all conversion specializations.
 #include "./xsimd_common.hpp"
 
diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp
index 55b3b8d81..8ea2756f7 100644
--- a/include/xsimd/arch/xsimd_neon.hpp
+++ b/include/xsimd/arch/xsimd_neon.hpp
@@ -12,284 +12,28 @@
 #ifndef XSIMD_NEON_HPP
 #define XSIMD_NEON_HPP
 
-#include <algorithm>
-#include <array>
-#include <complex>
-#include <tuple>
-#include <type_traits>
-
+#include "../types/xsimd_batch_fwd.hpp"
 #include "../types/xsimd_neon_register.hpp"
 #include "../types/xsimd_utils.hpp"
+#include "../utils/xsimd_type_traits.hpp"
+#include "./common/xsimd_common_bit.hpp"
 #include "./common/xsimd_common_cast.hpp"
+#include "./xsimd_common_fwd.hpp"
 
-// Wrap intrinsics so we can pass them as function pointers
-// - OP: intrinsics name prefix, e.g., vorrq
-// - RT: type traits to deduce intrinsics return types
-#define WRAP_BINARY_UINT_EXCLUDING_64(OP, RT)                                     \
-    namespace wrap                                                                \
-    {                                                                             \
-        XSIMD_INLINE RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept  \
-        {                                                                         \
-            return ::OP##_u8(a, b);                                               \
-        }                                                                         \
-        XSIMD_INLINE RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \
-        {                                                                         \
-            return ::OP##_u16(a, b);                                              \
-        }                                                                         \
-        XSIMD_INLINE RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \
-        {                                                                         \
-            return ::OP##_u32(a, b);                                              \
-        }                                                                         \
-    }
-
-#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT)                                   \
-    WRAP_BINARY_UINT_EXCLUDING_64(OP, RT)                                      \
-    namespace wrap                                                             \
-    {                                                                          \
-        XSIMD_INLINE RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept  \
-        {                                                                      \
-            return ::OP##_s8(a, b);                                            \
-        }                                                                      \
-        XSIMD_INLINE RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \
-        {                                                                      \
-            return ::OP##_s16(a, b);                                           \
-        }                                                                      \
-        XSIMD_INLINE RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \
-        {                                                                      \
-            return ::OP##_s32(a, b);                                           \
-        }                                                                      \
-    }
-
-#define WRAP_BINARY_INT(OP, RT)                                                   \
-    WRAP_BINARY_INT_EXCLUDING_64(OP, RT)                                          \
-    namespace wrap                                                                \
-    {                                                                             \
-        XSIMD_INLINE RT<uint64x2_t> OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \
-        {                                                                         \
-            return ::OP##_u64(a, b);                                              \
-        }                                                                         \
-        XSIMD_INLINE RT<int64x2_t> OP##_s64(int64x2_t a, int64x2_t b) noexcept    \
-        {                                                                         \
-            return ::OP##_s64(a, b);                                              \
-        }                                                                         \
-    }
-
-#define WRAP_BINARY_FLOAT(OP, RT)                                                    \
-    namespace wrap                                                                   \
-    {                                                                                \
-        XSIMD_INLINE RT<float32x4_t> OP##_f32(float32x4_t a, float32x4_t b) noexcept \
-        {                                                                            \
-            return ::OP##_f32(a, b);                                                 \
-        }                                                                            \
-    }
-
-#define WRAP_UNARY_INT_EXCLUDING_64(OP)                         \
-    namespace wrap                                              \
-    {                                                           \
-        XSIMD_INLINE uint8x16_t OP##_u8(uint8x16_t a) noexcept  \
-        {                                                       \
-            return ::OP##_u8(a);                                \
-        }                                                       \
-        XSIMD_INLINE int8x16_t OP##_s8(int8x16_t a) noexcept    \
-        {                                                       \
-            return ::OP##_s8(a);                                \
-        }                                                       \
-        XSIMD_INLINE uint16x8_t OP##_u16(uint16x8_t a) noexcept \
-        {                                                       \
-            return ::OP##_u16(a);                               \
-        }                                                       \
-        XSIMD_INLINE int16x8_t OP##_s16(int16x8_t a) noexcept   \
-        {                                                       \
-            return ::OP##_s16(a);                               \
-        }                                                       \
-        XSIMD_INLINE uint32x4_t OP##_u32(uint32x4_t a) noexcept \
-        {                                                       \
-            return ::OP##_u32(a);                               \
-        }                                                       \
-        XSIMD_INLINE int32x4_t OP##_s32(int32x4_t a) noexcept   \
-        {                                                       \
-            return ::OP##_s32(a);                               \
-        }                                                       \
-    }
-
-#define WRAP_UNARY_INT(OP)                                      \
-    WRAP_UNARY_INT_EXCLUDING_64(OP)                             \
-    namespace wrap                                              \
-    {                                                           \
-        XSIMD_INLINE uint64x2_t OP##_u64(uint64x2_t a) noexcept \
-        {                                                       \
-            return ::OP##_u64(a);                               \
-        }                                                       \
-        XSIMD_INLINE int64x2_t OP##_s64(int64x2_t a) noexcept   \
-        {                                                       \
-            return ::OP##_s64(a);                               \
-        }                                                       \
-    }
-
-#define WRAP_UNARY_FLOAT(OP)                                      \
-    namespace wrap                                                \
-    {                                                             \
-        XSIMD_INLINE float32x4_t OP##_f32(float32x4_t a) noexcept \
-        {                                                         \
-            return ::OP##_f32(a);                                 \
-        }                                                         \
-    }
-
-// Dummy identity caster to ease coding
-XSIMD_INLINE uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; }
-XSIMD_INLINE int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; }
-XSIMD_INLINE uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; }
-XSIMD_INLINE int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; }
-XSIMD_INLINE uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; }
-XSIMD_INLINE int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; }
-XSIMD_INLINE uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; }
-XSIMD_INLINE int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; }
-XSIMD_INLINE float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; }
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <complex>
+#include <type_traits>
 
 namespace xsimd
 {
-    template <typename T, class A, bool... Values>
-    struct batch_bool_constant;
-
     namespace kernel
     {
         using namespace types;
 
         namespace detail
         {
-            template <template <class> class return_type, class... T>
-            struct neon_dispatcher_base
-            {
-                struct unary
-                {
-                    using container_type = std::tuple<return_type<T> (*)(T)...>;
-                    const container_type m_func;
-
-                    template <class U>
-                    return_type<U> apply(U rhs) const noexcept
-                    {
-                        using func_type = return_type<U> (*)(U);
-                        auto func = std::get<func_type>(m_func);
-                        return func(rhs);
-                    }
-                };
-
-                struct binary
-                {
-                    using container_type = std::tuple<return_type<T> (*)(T, T)...>;
-                    const container_type m_func;
-
-                    template <class U>
-                    return_type<U> apply(U lhs, U rhs) const noexcept
-                    {
-                        using func_type = return_type<U> (*)(U, U);
-                        auto func = std::get<func_type>(m_func);
-                        return func(lhs, rhs);
-                    }
-                };
-            };
-
-            /***************************
-             *  arithmetic dispatchers *
-             ***************************/
-
-            template <class T>
-            using identity_return_type = T;
-
-            template <class... T>
-            struct neon_dispatcher_impl : neon_dispatcher_base<identity_return_type, T...>
-            {
-            };
-
-            using neon_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
-                                                         uint16x8_t, int16x8_t,
-                                                         uint32x4_t, int32x4_t,
-                                                         uint64x2_t, int64x2_t,
-                                                         float32x4_t>;
-
-            using excluding_int64_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
-                                                                    uint16x8_t, int16x8_t,
-                                                                    uint32x4_t, int32x4_t,
-                                                                    float32x4_t>;
-
-            using excluding_int64f32_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
-                                                                       uint16x8_t, int16x8_t,
-                                                                       uint32x4_t, int32x4_t>;
-
-            /**************************
-             * comparison dispatchers *
-             **************************/
-
-            template <class T>
-            struct comp_return_type_impl;
-
-            template <>
-            struct comp_return_type_impl<uint8x16_t>
-            {
-                using type = uint8x16_t;
-            };
-
-            template <>
-            struct comp_return_type_impl<int8x16_t>
-            {
-                using type = uint8x16_t;
-            };
-
-            template <>
-            struct comp_return_type_impl<uint16x8_t>
-            {
-                using type = uint16x8_t;
-            };
-
-            template <>
-            struct comp_return_type_impl<int16x8_t>
-            {
-                using type = uint16x8_t;
-            };
-
-            template <>
-            struct comp_return_type_impl<uint32x4_t>
-            {
-                using type = uint32x4_t;
-            };
-
-            template <>
-            struct comp_return_type_impl<int32x4_t>
-            {
-                using type = uint32x4_t;
-            };
-
-            template <>
-            struct comp_return_type_impl<uint64x2_t>
-            {
-                using type = uint64x2_t;
-            };
-
-            template <>
-            struct comp_return_type_impl<int64x2_t>
-            {
-                using type = uint64x2_t;
-            };
-
-            template <>
-            struct comp_return_type_impl<float32x4_t>
-            {
-                using type = uint32x4_t;
-            };
-
-            template <class T>
-            using comp_return_type = typename comp_return_type_impl<T>::type;
-
-            template <class... T>
-            struct neon_comp_dispatcher_impl : neon_dispatcher_base<comp_return_type, T...>
-            {
-            };
-
-            using excluding_int64_comp_dispatcher = neon_comp_dispatcher_impl<uint8x16_t, int8x16_t,
-                                                                              uint16x8_t, int16x8_t,
-                                                                              uint32x4_t, int32x4_t,
-                                                                              float32x4_t>;
-
             /**************************************
              * enabling / disabling metafunctions *
              **************************************/
@@ -303,6 +47,218 @@ namespace xsimd
                 = std::enable_if_t<(std::is_integral<T>::value && sizeof(T) != 8) || std::is_same<T, float>::value, int>;
         }
 
+        /****************
+         * bitwise_cast *
+         ****************/
+
+        namespace wrap
+        {
+            // x_vreinterpretq<R, T> with R = uint8_t: reinterprets the bits of a vector of `T` lanes as `uint8x16_t`.
+            // The source scalar type `T` is an explicit template parameter because on some compilers (e.g. MSVC)
+            // all vector types alias one type, so the argument type cannot pick the overload. TODO(c++17): `if constexpr`.
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint8_t>::value && std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vreinterpretq(uint8x16_t v) noexcept { return v; }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint8_t>::value && std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vreinterpretq(int8x16_t v) noexcept { return vreinterpretq_u8_s8(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint8_t>::value && std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vreinterpretq(uint16x8_t v) noexcept { return vreinterpretq_u8_u16(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint8_t>::value && std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vreinterpretq(int16x8_t v) noexcept { return vreinterpretq_u8_s16(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint8_t>::value && std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vreinterpretq(uint32x4_t v) noexcept { return vreinterpretq_u8_u32(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint8_t>::value && std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vreinterpretq(int32x4_t v) noexcept { return vreinterpretq_u8_s32(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint8_t>::value && std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vreinterpretq(uint64x2_t v) noexcept { return vreinterpretq_u8_u64(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint8_t>::value && std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vreinterpretq(int64x2_t v) noexcept { return vreinterpretq_u8_s64(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint8_t>::value && std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vreinterpretq(float32x4_t v) noexcept { return vreinterpretq_u8_f32(v); }
+
+            // x_vreinterpretq<R, T> with R = int8_t: reinterprets the bits of a vector of `T` lanes as `int8x16_t`.
+            // The source scalar type `T` is an explicit template parameter because on some compilers (e.g. MSVC)
+            // all vector types alias one type, so the argument type cannot pick the overload. TODO(c++17): `if constexpr`.
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int8_t>::value && std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vreinterpretq(uint8x16_t v) noexcept { return vreinterpretq_s8_u8(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int8_t>::value && std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vreinterpretq(int8x16_t v) noexcept { return v; }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int8_t>::value && std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vreinterpretq(uint16x8_t v) noexcept { return vreinterpretq_s8_u16(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int8_t>::value && std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vreinterpretq(int16x8_t v) noexcept { return vreinterpretq_s8_s16(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int8_t>::value && std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vreinterpretq(uint32x4_t v) noexcept { return vreinterpretq_s8_u32(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int8_t>::value && std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vreinterpretq(int32x4_t v) noexcept { return vreinterpretq_s8_s32(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int8_t>::value && std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vreinterpretq(uint64x2_t v) noexcept { return vreinterpretq_s8_u64(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int8_t>::value && std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vreinterpretq(int64x2_t v) noexcept { return vreinterpretq_s8_s64(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int8_t>::value && std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vreinterpretq(float32x4_t v) noexcept { return vreinterpretq_s8_f32(v); }
+
+            // x_vreinterpretq<R, T> with R = uint16_t: reinterprets the bits of a vector of `T` lanes as `uint16x8_t`.
+            // The source scalar type `T` is an explicit template parameter because on some compilers (e.g. MSVC)
+            // all vector types alias one type, so the argument type cannot pick the overload. TODO(c++17): `if constexpr`.
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint16_t>::value && std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vreinterpretq(uint8x16_t v) noexcept { return vreinterpretq_u16_u8(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint16_t>::value && std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vreinterpretq(int8x16_t v) noexcept { return vreinterpretq_u16_s8(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint16_t>::value && std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vreinterpretq(uint16x8_t v) noexcept { return v; }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint16_t>::value && std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vreinterpretq(int16x8_t v) noexcept { return vreinterpretq_u16_s16(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint16_t>::value && std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vreinterpretq(uint32x4_t v) noexcept { return vreinterpretq_u16_u32(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint16_t>::value && std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vreinterpretq(int32x4_t v) noexcept { return vreinterpretq_u16_s32(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint16_t>::value && std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vreinterpretq(uint64x2_t v) noexcept { return vreinterpretq_u16_u64(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint16_t>::value && std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vreinterpretq(int64x2_t v) noexcept { return vreinterpretq_u16_s64(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint16_t>::value && std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vreinterpretq(float32x4_t v) noexcept { return vreinterpretq_u16_f32(v); }
+
+            // x_vreinterpretq<R, T> with R = int16_t: reinterprets the bits of a vector of `T` lanes as `int16x8_t`.
+            // The source scalar type `T` is an explicit template parameter because on some compilers (e.g. MSVC)
+            // all vector types alias one type, so the argument type cannot pick the overload. TODO(c++17): `if constexpr`.
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int16_t>::value && std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vreinterpretq(uint8x16_t v) noexcept { return vreinterpretq_s16_u8(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int16_t>::value && std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vreinterpretq(int8x16_t v) noexcept { return vreinterpretq_s16_s8(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int16_t>::value && std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vreinterpretq(uint16x8_t v) noexcept { return vreinterpretq_s16_u16(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int16_t>::value && std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vreinterpretq(int16x8_t v) noexcept { return v; }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int16_t>::value && std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vreinterpretq(uint32x4_t v) noexcept { return vreinterpretq_s16_u32(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int16_t>::value && std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vreinterpretq(int32x4_t v) noexcept { return vreinterpretq_s16_s32(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int16_t>::value && std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vreinterpretq(uint64x2_t v) noexcept { return vreinterpretq_s16_u64(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int16_t>::value && std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vreinterpretq(int64x2_t v) noexcept { return vreinterpretq_s16_s64(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int16_t>::value && std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vreinterpretq(float32x4_t v) noexcept { return vreinterpretq_s16_f32(v); }
+
+            // x_vreinterpretq<R, T> with R = uint32_t: reinterprets the bits of a vector of `T` lanes as `uint32x4_t`.
+            // The source scalar type `T` is an explicit template parameter because on some compilers (e.g. MSVC)
+            // all vector types alias one type, so the argument type cannot pick the overload. TODO(c++17): `if constexpr`.
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint32_t>::value && std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vreinterpretq(uint8x16_t v) noexcept { return vreinterpretq_u32_u8(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint32_t>::value && std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vreinterpretq(int8x16_t v) noexcept { return vreinterpretq_u32_s8(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint32_t>::value && std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vreinterpretq(uint16x8_t v) noexcept { return vreinterpretq_u32_u16(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint32_t>::value && std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vreinterpretq(int16x8_t v) noexcept { return vreinterpretq_u32_s16(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint32_t>::value && std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vreinterpretq(uint32x4_t v) noexcept { return v; }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint32_t>::value && std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vreinterpretq(int32x4_t v) noexcept { return vreinterpretq_u32_s32(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint32_t>::value && std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vreinterpretq(uint64x2_t v) noexcept { return vreinterpretq_u32_u64(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint32_t>::value && std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vreinterpretq(int64x2_t v) noexcept { return vreinterpretq_u32_s64(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint32_t>::value && std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vreinterpretq(float32x4_t v) noexcept { return vreinterpretq_u32_f32(v); }
+
+            // x_vreinterpretq<R, T> with R = int32_t: reinterprets the bits of a vector of `T` lanes as `int32x4_t`.
+            // The source scalar type `T` is an explicit template parameter because on some compilers (e.g. MSVC)
+            // all vector types alias one type, so the argument type cannot pick the overload. TODO(c++17): `if constexpr`.
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int32_t>::value && std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vreinterpretq(uint8x16_t v) noexcept { return vreinterpretq_s32_u8(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int32_t>::value && std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vreinterpretq(int8x16_t v) noexcept { return vreinterpretq_s32_s8(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int32_t>::value && std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vreinterpretq(uint16x8_t v) noexcept { return vreinterpretq_s32_u16(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int32_t>::value && std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vreinterpretq(int16x8_t v) noexcept { return vreinterpretq_s32_s16(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int32_t>::value && std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vreinterpretq(uint32x4_t v) noexcept { return vreinterpretq_s32_u32(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int32_t>::value && std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vreinterpretq(int32x4_t v) noexcept { return v; }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int32_t>::value && std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vreinterpretq(uint64x2_t v) noexcept { return vreinterpretq_s32_u64(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int32_t>::value && std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vreinterpretq(int64x2_t v) noexcept { return vreinterpretq_s32_s64(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int32_t>::value && std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vreinterpretq(float32x4_t v) noexcept { return vreinterpretq_s32_f32(v); }
+
+            // x_vreinterpretq<R, T> with R = uint64_t: reinterprets the bits of a vector of `T` lanes as `uint64x2_t`.
+            // The source scalar type `T` is an explicit template parameter because on some compilers (e.g. MSVC)
+            // all vector types alias one type, so the argument type cannot pick the overload. TODO(c++17): `if constexpr`.
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint64_t>::value && std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vreinterpretq(uint8x16_t v) noexcept { return vreinterpretq_u64_u8(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint64_t>::value && std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vreinterpretq(int8x16_t v) noexcept { return vreinterpretq_u64_s8(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint64_t>::value && std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vreinterpretq(uint16x8_t v) noexcept { return vreinterpretq_u64_u16(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint64_t>::value && std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vreinterpretq(int16x8_t v) noexcept { return vreinterpretq_u64_s16(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint64_t>::value && std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vreinterpretq(uint32x4_t v) noexcept { return vreinterpretq_u64_u32(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint64_t>::value && std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vreinterpretq(int32x4_t v) noexcept { return vreinterpretq_u64_s32(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint64_t>::value && std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vreinterpretq(uint64x2_t v) noexcept { return v; }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint64_t>::value && std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vreinterpretq(int64x2_t v) noexcept { return vreinterpretq_u64_s64(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, uint64_t>::value && std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vreinterpretq(float32x4_t v) noexcept { return vreinterpretq_u64_f32(v); }
+
+            // x_vreinterpretq<R, T> with R = int64_t: reinterprets the bits of a vector of `T` lanes as `int64x2_t`.
+            // The source scalar type `T` is an explicit template parameter because on some compilers (e.g. MSVC)
+            // all vector types alias one type, so the argument type cannot pick the overload. TODO(c++17): `if constexpr`.
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int64_t>::value && std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vreinterpretq(uint8x16_t v) noexcept { return vreinterpretq_s64_u8(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int64_t>::value && std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vreinterpretq(int8x16_t v) noexcept { return vreinterpretq_s64_s8(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int64_t>::value && std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vreinterpretq(uint16x8_t v) noexcept { return vreinterpretq_s64_u16(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int64_t>::value && std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vreinterpretq(int16x8_t v) noexcept { return vreinterpretq_s64_s16(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int64_t>::value && std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vreinterpretq(uint32x4_t v) noexcept { return vreinterpretq_s64_u32(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int64_t>::value && std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vreinterpretq(int32x4_t v) noexcept { return vreinterpretq_s64_s32(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int64_t>::value && std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vreinterpretq(uint64x2_t v) noexcept { return vreinterpretq_s64_u64(v); }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int64_t>::value && std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vreinterpretq(int64x2_t v) noexcept { return v; }
+            template <typename R, typename T, std::enable_if_t<std::is_same<R, int64_t>::value && std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vreinterpretq(float32x4_t v) noexcept { return vreinterpretq_s64_f32(v); }
+
+            // TODO(c++17): Make a single function with if constexpr switch
+            // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC)
+            // the vector types are all aliases of the same type.
+            template <class R, class T, std::enable_if_t<std::is_same<R, float>::value && std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vreinterpretq(uint8x16_t a) noexcept { return vreinterpretq_f32_u8(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, float>::value && std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vreinterpretq(int8x16_t a) noexcept { return vreinterpretq_f32_s8(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, float>::value && std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vreinterpretq(uint16x8_t a) noexcept { return vreinterpretq_f32_u16(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, float>::value && std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vreinterpretq(int16x8_t a) noexcept { return vreinterpretq_f32_s16(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, float>::value && std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vreinterpretq(uint32x4_t a) noexcept { return vreinterpretq_f32_u32(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, float>::value && std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vreinterpretq(int32x4_t a) noexcept { return vreinterpretq_f32_s32(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, float>::value && std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vreinterpretq(uint64x2_t a) noexcept { return vreinterpretq_f32_u64(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, float>::value && std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vreinterpretq(int64x2_t a) noexcept { return vreinterpretq_f32_s64(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, float>::value && std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vreinterpretq(float32x4_t a) noexcept { return a; }
+        }
+
+        template <class A, class T, class R>
+        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<neon>) noexcept
+        { // Reinterprets the register of `arg` as a batch of R; the unnamed second parameter is not read.
+            using src_register_type = typename batch<T, A>::register_type;
+            return wrap::x_vreinterpretq<map_to_sized_type_t<R>, map_to_sized_type_t<T>>(src_register_type(arg)); // overload is chosen by the explicit (R, T) scalar pair, not by the register type
+        }
+
         /*************
          * broadcast *
          *************/
@@ -361,47 +317,6 @@ namespace xsimd
             return vdupq_n_f32(val);
         }
 
-        /*******
-         * set *
-         *******/
-
-        template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
-        XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<neon>, Args... args) noexcept
-        {
-            return xsimd::types::detail::neon_vector_type<T> { args... };
-        }
-
-        template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
-        XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<neon>, Args... args) noexcept
-        {
-            using register_type = typename batch_bool<T, A>::register_type;
-            using unsigned_type = as_unsigned_integer_t<T>;
-            return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... };
-        }
-
-        template <class A>
-        XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<neon>, float f0, float f1, float f2, float f3) noexcept
-        {
-            return float32x4_t { f0, f1, f2, f3 };
-        }
-
-        template <class A>
-        XSIMD_INLINE batch<std::complex<float>, A> set(batch<std::complex<float>, A> const&, requires_arch<neon>,
-                                                       std::complex<float> c0, std::complex<float> c1,
-                                                       std::complex<float> c2, std::complex<float> c3) noexcept
-        {
-            return batch<std::complex<float>, A>(float32x4_t { c0.real(), c1.real(), c2.real(), c3.real() },
-                                                 float32x4_t { c0.imag(), c1.imag(), c2.imag(), c3.imag() });
-        }
-
-        template <class A, class... Args>
-        XSIMD_INLINE batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<neon>, Args... args) noexcept
-        {
-            using register_type = typename batch_bool<float, A>::register_type;
-            using unsigned_type = as_unsigned_integer_t<float>;
-            return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... };
-        }
-
         /*************
          * from_bool *
          *************/
@@ -415,7 +330,7 @@ namespace xsimd
         template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
         XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
         {
-            return vandq_s8(reinterpret_cast<int8x16_t>(arg.data), vdupq_n_s8(1));
+            return vreinterpretq_s8_u8(vandq_u8(arg.data, vdupq_n_u8(1))); // AND each mask lane with 1 as unsigned, then convert with the reinterpret intrinsic instead of reinterpret_cast
         }
 
         template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
@@ -427,7 +342,7 @@ namespace xsimd
         template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
         XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
         {
-            return vandq_s16(reinterpret_cast<int16x8_t>(arg.data), vdupq_n_s16(1));
+            return vreinterpretq_s16_u16(vandq_u16(arg.data, vdupq_n_u16(1))); // AND each mask lane with 1 as unsigned, then convert with the reinterpret intrinsic instead of reinterpret_cast
         }
 
         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
@@ -439,7 +354,7 @@ namespace xsimd
         template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
         XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
         {
-            return vandq_s32(reinterpret_cast<int32x4_t>(arg.data), vdupq_n_s32(1));
+            return vreinterpretq_s32_u32(vandq_u32(arg.data, vdupq_n_u32(1))); // AND each mask lane with 1 as unsigned, then convert with the reinterpret intrinsic instead of reinterpret_cast
         }
 
         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
@@ -451,7 +366,7 @@ namespace xsimd
         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
         XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
         {
-            return vandq_s64(reinterpret_cast<int64x2_t>(arg.data), vdupq_n_s64(1));
+            return vreinterpretq_s64_u64(vandq_u64(arg.data, vdupq_n_u64(1))); // AND each mask lane with 1 as unsigned, then convert with the reinterpret intrinsic instead of reinterpret_cast
         }
 
         template <class A>
@@ -468,8 +383,6 @@ namespace xsimd
         // immediate instead.
 #if defined(__clang__) || defined(__GNUC__)
 #define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
-#elif defined(_MSC_VER)
-#define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128)
 #else
 #define xsimd_aligned_load(inst, type, expr) inst((type)expr)
 #endif
@@ -579,7 +492,8 @@ namespace xsimd
         XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<neon>) noexcept
         {
             auto vmem = load_unaligned<A>((unsigned char const*)mem, convert<unsigned char> {}, A {});
-            return { 0 - vmem.data };
+            auto const zero = batch<unsigned char, A> { 0 };
+            return { (zero - vmem).data };
         }
         template <class A, class T, detail::enable_sized_t<T, 1> = 0>
         XSIMD_INLINE batch_bool<T, A> load_aligned(bool const* mem, batch_bool<T, A> t, requires_arch<neon> r) noexcept
@@ -590,8 +504,9 @@ namespace xsimd
         template <class A, class T, detail::enable_sized_t<T, 2> = 0>
         XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<neon>) noexcept
         {
-            uint16x8_t vmem = vmovl_u8(vld1_u8((unsigned char const*)mem));
-            return { 0 - vmem };
+            auto const vmem = batch<uint16_t, A>(vmovl_u8(vld1_u8((unsigned char const*)mem))); // read the bools as bytes and widen each one to a 16-bit lane
+            auto const zero = batch<uint16_t, A> { 0 }; // subtraction goes through batch operators rather than operator- on the raw vector type
+            return { (zero - vmem).data }; // 0 - x: a lane holding 1 wraps to all-ones, 0 stays 0 (presumes bools are stored as 0/1)
         }
 
         template <class A, class T, detail::enable_sized_t<T, 2> = 0>
@@ -604,7 +519,9 @@ namespace xsimd
         XSIMD_INLINE batch_bool<T, A> load_unaligned(bool const* mem, batch_bool<T, A>, requires_arch<neon>) noexcept
         {
             uint8x8_t tmp = vreinterpret_u8_u32(vset_lane_u32(*(unsigned int*)mem, vdup_n_u32(0), 0));
-            return { 0 - vmovl_u16(vget_low_u16(vmovl_u8(tmp))) };
+            auto const vmem = batch<uint32_t, A>(vmovl_u16(vget_low_u16(vmovl_u8(tmp))));
+            auto const zero = batch<uint32_t, A> { 0 };
+            return { (zero - vmem).data };
         }
 
         template <class A, class T, detail::enable_sized_t<T, 4> = 0>
@@ -623,7 +540,7 @@ namespace xsimd
             struct load_masked<>
             {
                 template <size_t I, class A, class T, bool Use>
-                static XSIMD_INLINE batch<T, A> apply(T const* mem, batch<T, A> acc, std::integral_constant<bool, Use>) noexcept
+                static XSIMD_INLINE batch<T, A> apply(T const* /* mem */, batch<T, A> acc, std::integral_constant<bool, Use>) noexcept // empty specialization: memory is not read, `acc` is returned unchanged
                 {
                     return acc;
                 }
@@ -645,7 +562,7 @@ namespace xsimd
         }
 
         template <class A, class T, bool Value, bool... Values, class Mode>
-        XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<T, A, Value, Values...> mask, Mode, requires_arch<neon>) noexcept
+        XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<T, A, Value, Values...> /* mask */, Mode, requires_arch<neon>) noexcept
         {
             // Call insert whenever Values... are true
             return detail::load_masked<Values...>::template apply<0>(mem, broadcast(T(0), A {}), std::integral_constant<bool, Value> {});
@@ -763,28 +680,36 @@ namespace xsimd
         XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept
         {
             uint8x16_t val = vshrq_n_u8(b.data, 7);
-            vst1q_u8((uint8_t*)mem, val);
+            alignas(A::alignment()) uint8_t buffer[batch_bool<T, A>::size];
+            vst1q_u8(buffer, val);
+            memcpy(mem, buffer, sizeof(buffer));
         }
 
         template <class T, class A, detail::enable_sized_t<T, 2> = 0>
         XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept
         {
             uint8x8_t val = vshr_n_u8(vqmovn_u16(b.data), 7);
-            vst1_u8((uint8_t*)mem, val);
+            alignas(A::alignment()) uint8_t buffer[batch_bool<T, A>::size]; // local byte staging area, one byte per mask lane
+            vst1_u8(buffer, val); // store into the uint8_t buffer rather than through a casted bool pointer
+            memcpy(mem, buffer, sizeof(buffer)); // copy the staged bytes into the caller's bool array
         }
 
         template <class T, class A, detail::enable_sized_t<T, 4> = 0>
         XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept
         {
             uint8x8_t val = vshr_n_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(b.data), vdup_n_u16(0))), 7);
-            vst1_lane_u32((uint32_t*)mem, vreinterpret_u32_u8(val), 0);
+            alignas(A::alignment()) uint8_t buffer[8]; // room for the whole 8-byte `val`, of which the upper half is zero padding
+            vst1_u8(buffer, val); // store into the uint8_t buffer rather than through a casted bool pointer
+            memcpy(mem, buffer, batch_bool<T, A>::size); // only the first `size` bytes are copied out to `mem`
         }
 
         template <class T, class A, detail::enable_sized_t<T, 8> = 0>
         XSIMD_INLINE void store(batch_bool<T, A> b, bool* mem, requires_arch<neon>) noexcept
         {
             uint8x8_t val = vshr_n_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(vcombine_u32(vqmovn_u64(b.data), vdup_n_u32(0))), vdup_n_u16(0))), 7);
-            vst1_lane_u16((uint16_t*)mem, vreinterpret_u16_u8(val), 0);
+            alignas(A::alignment()) uint8_t buffer[8]; // room for the whole 8-byte `val`, most of which is zero padding
+            vst1_u8(buffer, val); // store into the uint8_t buffer rather than through a casted bool pointer
+            memcpy(mem, buffer, batch_bool<T, A>::size); // only the first `size` bytes are copied out to `mem`
         }
 
         template <class A>
@@ -793,6 +718,49 @@ namespace xsimd
             store(batch_bool<uint32_t, A>(b.data), mem, A {});
         }
 
+        /*******
+         * set *
+         *******/
+
+        template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
+        XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<neon> req, Args... args) noexcept
+        { // Builds an integral batch from one scalar per lane by staging the values in memory and loading them.
+            alignas(A::alignment()) T data[batch<T, A>::size] = { static_cast<T>(args)... }; // always a full register: missing lanes are zero-filled, surplus arguments fail to compile
+            return load_aligned<A, T>(data, {}, req); // `data` is aligned to A::alignment(), so the aligned-load kernel can be used
+        }
+
+        template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
+        XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<neon>, Args... args) noexcept
+        { // Builds an integral mask from one boolean-like argument per lane.
+            using unsigned_type = as_unsigned_integer_t<T>;
+            auto const out = batch<unsigned_type, A> { static_cast<unsigned_type>(args ? -1LL : 0LL)... }; // a true argument becomes -1 cast to the unsigned lane type, a false one becomes 0
+            return { out.data }; // the mask is constructed from the raw register of that unsigned batch
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<float, A> set(batch<float, A> const&, requires_arch<neon> req, float f0, float f1, float f2, float f3) noexcept
+        { // Builds a float batch from four scalars, f0 first in memory order.
+            alignas(A::alignment()) float data[] = { f0, f1, f2, f3 }; // stage the four lanes in a local array aligned to A::alignment()
+            return load_aligned<A>(data, {}, req); // then load them through the aligned-load kernel
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<std::complex<float>, A> set(batch<std::complex<float>, A> const&, requires_arch<neon>,
+                                                       std::complex<float> c0, std::complex<float> c1,
+                                                       std::complex<float> c2, std::complex<float> c3) noexcept
+        { // Splits four complex values into a batch of real parts and a batch of imaginary parts; built as batch<float, A> so no NEON vector type is brace-initialized.
+            return batch<std::complex<float>, A>(batch<float, A> { c0.real(), c1.real(), c2.real(), c3.real() },
+                                                 batch<float, A> { c0.imag(), c1.imag(), c2.imag(), c3.imag() });
+        }
+
+        template <class A, class... Args>
+        XSIMD_INLINE batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<neon>, Args... args) noexcept
+        { // Builds a float mask from one boolean-like argument per lane.
+            using unsigned_type = as_unsigned_integer_t<float>;
+            auto const out = batch<unsigned_type, A> { static_cast<unsigned_type>(args ? -1LL : 0LL)... }; // a true argument becomes -1 cast to the unsigned lane type, a false one becomes 0
+            return { out.data }; // the mask is constructed from the raw register of that unsigned batch
+        }
+
         /*******
          * neg *
          *******/
@@ -849,124 +817,331 @@ namespace xsimd
          * add *
          *******/
 
-        WRAP_BINARY_INT(vaddq, detail::identity_return_type)
-        WRAP_BINARY_FLOAT(vaddq, detail::identity_return_type)
+        namespace wrap
+        {
+            // x_vaddq<T>(a, b): forwards to the vaddq_* intrinsic matching the scalar type `T` (8- to 64-bit integers and float).
+            // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC)
+            // the vector types are all aliases of the same type. TODO(c++17): Make a single function with if constexpr switch
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vaddq(uint8x16_t a, uint8x16_t b) noexcept { return vaddq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vaddq(int8x16_t a, int8x16_t b) noexcept { return vaddq_s8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vaddq(uint16x8_t a, uint16x8_t b) noexcept { return vaddq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vaddq(int16x8_t a, int16x8_t b) noexcept { return vaddq_s16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vaddq(uint32x4_t a, uint32x4_t b) noexcept { return vaddq_u32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vaddq(int32x4_t a, int32x4_t b) noexcept { return vaddq_s32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vaddq(uint64x2_t a, uint64x2_t b) noexcept { return vaddq_u64(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vaddq(int64x2_t a, int64x2_t b) noexcept { return vaddq_s64(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vaddq(float32x4_t a, float32x4_t b) noexcept { return vaddq_f32(a, b); }
+        }
 
         template <class A, class T, detail::enable_neon_type_t<T> = 0>
         XSIMD_INLINE batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::neon_dispatcher::binary dispatcher = {
-                std::make_tuple(wrap::vaddq_u8, wrap::vaddq_s8, wrap::vaddq_u16, wrap::vaddq_s16,
-                                wrap::vaddq_u32, wrap::vaddq_s32, wrap::vaddq_u64, wrap::vaddq_s64,
-                                wrap::vaddq_f32)
-            };
-            return dispatcher.apply(register_type(lhs), register_type(rhs));
+            return wrap::x_vaddq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs)); // the explicit scalar type selects the x_vaddq overload at compile time, replacing the tuple dispatcher
         }
 
         /*******
          * avg *
          *******/
 
-        WRAP_BINARY_UINT_EXCLUDING_64(vhaddq, detail::identity_return_type)
+        namespace wrap
+        {
+            // x_vhaddq<T>(a, b): forwards to the vhaddq_* intrinsic matching `T`; only uint8_t, uint16_t and uint32_t overloads exist.
+            // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC)
+            // the vector types are all aliases of the same type. TODO(c++17): Make a single function with if constexpr switch
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vhaddq(uint8x16_t a, uint8x16_t b) noexcept { return vhaddq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vhaddq(uint16x8_t a, uint16x8_t b) noexcept { return vhaddq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vhaddq(uint32x4_t a, uint32x4_t b) noexcept { return vhaddq_u32(a, b); }
+        }
 
         template <class A, class T, class = std::enable_if_t<(std::is_unsigned<T>::value && sizeof(T) != 8)>>
         XSIMD_INLINE batch<T, A> avg(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = {
-                std::make_tuple(wrap::vhaddq_u8, wrap::vhaddq_u16, wrap::vhaddq_u32)
-            };
-            return dispatcher.apply(register_type(lhs), register_type(rhs));
+            return wrap::x_vhaddq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs)); // the explicit scalar type selects the x_vhaddq overload at compile time, replacing the tuple dispatcher
         }
 
         /********
          * avgr *
          ********/
 
-        WRAP_BINARY_UINT_EXCLUDING_64(vrhaddq, detail::identity_return_type)
+        namespace wrap
+        {
+            // x_vrhaddq<T>(a, b): forwards to the vrhaddq_* intrinsic matching `T`; only uint8_t, uint16_t and uint32_t overloads exist.
+            // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC)
+            // the vector types are all aliases of the same type. TODO(c++17): Make a single function with if constexpr switch
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vrhaddq(uint8x16_t a, uint8x16_t b) noexcept { return vrhaddq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vrhaddq(uint16x8_t a, uint16x8_t b) noexcept { return vrhaddq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vrhaddq(uint32x4_t a, uint32x4_t b) noexcept { return vrhaddq_u32(a, b); }
+        }
 
         template <class A, class T, class = std::enable_if_t<(std::is_unsigned<T>::value && sizeof(T) != 8)>>
         XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = {
-                std::make_tuple(wrap::vrhaddq_u8, wrap::vrhaddq_u16, wrap::vrhaddq_u32)
-            };
-            return dispatcher.apply(register_type(lhs), register_type(rhs));
+            return wrap::x_vrhaddq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs)); // the explicit scalar type selects the x_vrhaddq overload at compile time, replacing the tuple dispatcher
         }
 
         /********
          * sadd *
          ********/
 
-        WRAP_BINARY_INT(vqaddq, detail::identity_return_type)
+        namespace wrap
+        {
+            // x_vqaddq<T>(a, b): forwards to the vqaddq_* intrinsic matching the integer type `T`; the float overload forwards to vaddq_f32.
+            // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC)
+            // the vector types are all aliases of the same type. TODO(c++17): Make a single function with if constexpr switch
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vqaddq(uint8x16_t a, uint8x16_t b) noexcept { return vqaddq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vqaddq(int8x16_t a, int8x16_t b) noexcept { return vqaddq_s8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vqaddq(uint16x8_t a, uint16x8_t b) noexcept { return vqaddq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vqaddq(int16x8_t a, int16x8_t b) noexcept { return vqaddq_s16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vqaddq(uint32x4_t a, uint32x4_t b) noexcept { return vqaddq_u32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vqaddq(int32x4_t a, int32x4_t b) noexcept { return vqaddq_s32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vqaddq(uint64x2_t a, uint64x2_t b) noexcept { return vqaddq_u64(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vqaddq(int64x2_t a, int64x2_t b) noexcept { return vqaddq_s64(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vqaddq(float32x4_t a, float32x4_t b) noexcept { return vaddq_f32(a, b); } // float uses the plain add, as the removed dispatcher did with wrap::vaddq_f32
+        }
 
         template <class A, class T, detail::enable_neon_type_t<T> = 0>
         XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::neon_dispatcher::binary dispatcher = {
-                std::make_tuple(wrap::vqaddq_u8, wrap::vqaddq_s8, wrap::vqaddq_u16, wrap::vqaddq_s16,
-                                wrap::vqaddq_u32, wrap::vqaddq_s32, wrap::vqaddq_u64, wrap::vqaddq_s64,
-                                wrap::vaddq_f32)
-            };
-            return dispatcher.apply(register_type(lhs), register_type(rhs));
+            return wrap::x_vqaddq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs)); // the explicit scalar type selects the x_vqaddq overload at compile time, replacing the tuple dispatcher
         }
 
         /*******
          * sub *
          *******/
 
-        WRAP_BINARY_INT(vsubq, detail::identity_return_type)
-        WRAP_BINARY_FLOAT(vsubq, detail::identity_return_type)
+        namespace wrap
+        {
+            // x_vsubq<T>(a, b): forwards to the vsubq_* intrinsic matching the scalar type `T` (8- to 64-bit integers and float).
+            // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC)
+            // the vector types are all aliases of the same type. TODO(c++17): Make a single function with if constexpr switch
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vsubq(uint8x16_t a, uint8x16_t b) noexcept { return vsubq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vsubq(int8x16_t a, int8x16_t b) noexcept { return vsubq_s8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vsubq(uint16x8_t a, uint16x8_t b) noexcept { return vsubq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vsubq(int16x8_t a, int16x8_t b) noexcept { return vsubq_s16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vsubq(uint32x4_t a, uint32x4_t b) noexcept { return vsubq_u32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vsubq(int32x4_t a, int32x4_t b) noexcept { return vsubq_s32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vsubq(uint64x2_t a, uint64x2_t b) noexcept { return vsubq_u64(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vsubq(int64x2_t a, int64x2_t b) noexcept { return vsubq_s64(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vsubq(float32x4_t a, float32x4_t b) noexcept { return vsubq_f32(a, b); }
+        }
 
         template <class A, class T, detail::enable_neon_type_t<T> = 0>
         XSIMD_INLINE batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::neon_dispatcher::binary dispatcher = {
-                std::make_tuple(wrap::vsubq_u8, wrap::vsubq_s8, wrap::vsubq_u16, wrap::vsubq_s16,
-                                wrap::vsubq_u32, wrap::vsubq_s32, wrap::vsubq_u64, wrap::vsubq_s64,
-                                wrap::vsubq_f32)
-            };
-            return dispatcher.apply(register_type(lhs), register_type(rhs));
+            return wrap::x_vsubq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs)); // the explicit scalar type selects the x_vsubq overload at compile time, replacing the tuple dispatcher
         }
 
         /********
          * ssub *
          ********/
 
-        WRAP_BINARY_INT(vqsubq, detail::identity_return_type)
+        namespace wrap
+        {
+            // x_vqsubq<T>(a, b): forwards to the vqsubq_* intrinsic matching the integer type `T`; the float overload forwards to vsubq_f32.
+            // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC)
+            // the vector types are all aliases of the same type. TODO(c++17): Make a single function with if constexpr switch
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vqsubq(uint8x16_t a, uint8x16_t b) noexcept { return vqsubq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vqsubq(int8x16_t a, int8x16_t b) noexcept { return vqsubq_s8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vqsubq(uint16x8_t a, uint16x8_t b) noexcept { return vqsubq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vqsubq(int16x8_t a, int16x8_t b) noexcept { return vqsubq_s16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vqsubq(uint32x4_t a, uint32x4_t b) noexcept { return vqsubq_u32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vqsubq(int32x4_t a, int32x4_t b) noexcept { return vqsubq_s32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vqsubq(uint64x2_t a, uint64x2_t b) noexcept { return vqsubq_u64(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vqsubq(int64x2_t a, int64x2_t b) noexcept { return vqsubq_s64(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vqsubq(float32x4_t a, float32x4_t b) noexcept { return vsubq_f32(a, b); } // float uses the plain subtract, as the removed dispatcher did with wrap::vsubq_f32
+        }
 
         template <class A, class T, detail::enable_neon_type_t<T> = 0>
         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::neon_dispatcher::binary dispatcher = {
-                std::make_tuple(wrap::vqsubq_u8, wrap::vqsubq_s8, wrap::vqsubq_u16, wrap::vqsubq_s16,
-                                wrap::vqsubq_u32, wrap::vqsubq_s32, wrap::vqsubq_u64, wrap::vqsubq_s64,
-                                wrap::vsubq_f32)
-            };
-            return dispatcher.apply(register_type(lhs), register_type(rhs));
+            return wrap::x_vqsubq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs)); // the explicit scalar type selects the x_vqsubq overload at compile time, replacing the tuple dispatcher
         }
 
         /*******
          * mul *
          *******/
 
-        WRAP_BINARY_INT_EXCLUDING_64(vmulq, detail::identity_return_type)
-        WRAP_BINARY_FLOAT(vmulq, detail::identity_return_type)
+        namespace wrap
+        {
+            // TODO(c++17): Make a single function with if constexpr switch
+            // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC)
+            // the vector types are all aliases of the same type.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vmulq(uint8x16_t a, uint8x16_t b) noexcept { return vmulq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vmulq(int8x16_t a, int8x16_t b) noexcept { return vmulq_s8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vmulq(uint16x8_t a, uint16x8_t b) noexcept { return vmulq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vmulq(int16x8_t a, int16x8_t b) noexcept { return vmulq_s16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vmulq(uint32x4_t a, uint32x4_t b) noexcept { return vmulq_u32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vmulq(int32x4_t a, int32x4_t b) noexcept { return vmulq_s32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vmulq(float32x4_t a, float32x4_t b) noexcept { return vmulq_f32(a, b); }
+        }
 
         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
         XSIMD_INLINE batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::excluding_int64_dispatcher::binary dispatcher = {
-                std::make_tuple(wrap::vmulq_u8, wrap::vmulq_s8, wrap::vmulq_u16, wrap::vmulq_s16,
-                                wrap::vmulq_u32, wrap::vmulq_s32, wrap::vmulq_f32)
-            };
-            return dispatcher.apply(register_type(lhs), register_type(rhs));
+            return wrap::x_vmulq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs));
+        }
+
+        /**********
+         * mul_hi *
+         **********/
+
+        template <class A>
+        XSIMD_INLINE batch<int8_t, A> mul_hi(batch<int8_t, A> const& lhs, batch<int8_t, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            int16x8_t lo = vmull_s8(vget_low_s8(lhs), vget_low_s8(rhs));
+            int16x8_t hi = vmull_s8(vget_high_s8(lhs), vget_high_s8(rhs));
+            return vuzpq_s8(vreinterpretq_s8_s16(lo), vreinterpretq_s8_s16(hi)).val[1];
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint8_t, A> mul_hi(batch<uint8_t, A> const& lhs, batch<uint8_t, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            uint16x8_t lo = vmull_u8(vget_low_u8(lhs), vget_low_u8(rhs));
+            uint16x8_t hi = vmull_u8(vget_high_u8(lhs), vget_high_u8(rhs));
+            return vuzpq_u8(vreinterpretq_u8_u16(lo), vreinterpretq_u8_u16(hi)).val[1];
+        }
+        template <class A>
+        XSIMD_INLINE batch<int16_t, A> mul_hi(batch<int16_t, A> const& lhs, batch<int16_t, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            int32x4_t lo = vmull_s16(vget_low_s16(lhs), vget_low_s16(rhs));
+            int32x4_t hi = vmull_s16(vget_high_s16(lhs), vget_high_s16(rhs));
+            return vuzpq_s16(vreinterpretq_s16_s32(lo), vreinterpretq_s16_s32(hi)).val[1];
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint16_t, A> mul_hi(batch<uint16_t, A> const& lhs, batch<uint16_t, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            uint32x4_t lo = vmull_u16(vget_low_u16(lhs), vget_low_u16(rhs));
+            uint32x4_t hi = vmull_u16(vget_high_u16(lhs), vget_high_u16(rhs));
+            return vuzpq_u16(vreinterpretq_u16_u32(lo), vreinterpretq_u16_u32(hi)).val[1];
+        }
+        template <class A>
+        XSIMD_INLINE batch<int32_t, A> mul_hi(batch<int32_t, A> const& lhs, batch<int32_t, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            int64x2_t lo = vmull_s32(vget_low_s32(lhs), vget_low_s32(rhs));
+            int64x2_t hi = vmull_s32(vget_high_s32(lhs), vget_high_s32(rhs));
+            return vuzpq_s32(vreinterpretq_s32_s64(lo), vreinterpretq_s32_s64(hi)).val[1];
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint32_t, A> mul_hi(batch<uint32_t, A> const& lhs, batch<uint32_t, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            uint64x2_t lo = vmull_u32(vget_low_u32(lhs), vget_low_u32(rhs));
+            uint64x2_t hi = vmull_u32(vget_high_u32(lhs), vget_high_u32(rhs));
+            return vuzpq_u32(vreinterpretq_u32_u64(lo), vreinterpretq_u32_u64(hi)).val[1];
+        }
+        // 64-bit intentionally falls through to the common scalar fallback
+
+        /************
+         * mul_hilo *
+         ************/
+
+        template <class A>
+        XSIMD_INLINE std::pair<batch<int8_t, A>, batch<int8_t, A>>
+        mul_hilo(batch<int8_t, A> const& lhs, batch<int8_t, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            int16x8_t lo = vmull_s8(vget_low_s8(lhs), vget_low_s8(rhs));
+            int16x8_t hi = vmull_s8(vget_high_s8(lhs), vget_high_s8(rhs));
+            int8x16x2_t uzp = vuzpq_s8(vreinterpretq_s8_s16(lo), vreinterpretq_s8_s16(hi));
+            return { batch<int8_t, A>(uzp.val[1]), batch<int8_t, A>(uzp.val[0]) };
+        }
+        template <class A>
+        XSIMD_INLINE std::pair<batch<uint8_t, A>, batch<uint8_t, A>>
+        mul_hilo(batch<uint8_t, A> const& lhs, batch<uint8_t, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            uint16x8_t lo = vmull_u8(vget_low_u8(lhs), vget_low_u8(rhs));
+            uint16x8_t hi = vmull_u8(vget_high_u8(lhs), vget_high_u8(rhs));
+            uint8x16x2_t uzp = vuzpq_u8(vreinterpretq_u8_u16(lo), vreinterpretq_u8_u16(hi));
+            return { batch<uint8_t, A>(uzp.val[1]), batch<uint8_t, A>(uzp.val[0]) };
+        }
+        template <class A>
+        XSIMD_INLINE std::pair<batch<int16_t, A>, batch<int16_t, A>>
+        mul_hilo(batch<int16_t, A> const& lhs, batch<int16_t, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            int32x4_t lo = vmull_s16(vget_low_s16(lhs), vget_low_s16(rhs));
+            int32x4_t hi = vmull_s16(vget_high_s16(lhs), vget_high_s16(rhs));
+            int16x8x2_t uzp = vuzpq_s16(vreinterpretq_s16_s32(lo), vreinterpretq_s16_s32(hi));
+            return { batch<int16_t, A>(uzp.val[1]), batch<int16_t, A>(uzp.val[0]) };
+        }
+        template <class A>
+        XSIMD_INLINE std::pair<batch<uint16_t, A>, batch<uint16_t, A>>
+        mul_hilo(batch<uint16_t, A> const& lhs, batch<uint16_t, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            uint32x4_t lo = vmull_u16(vget_low_u16(lhs), vget_low_u16(rhs));
+            uint32x4_t hi = vmull_u16(vget_high_u16(lhs), vget_high_u16(rhs));
+            uint16x8x2_t uzp = vuzpq_u16(vreinterpretq_u16_u32(lo), vreinterpretq_u16_u32(hi));
+            return { batch<uint16_t, A>(uzp.val[1]), batch<uint16_t, A>(uzp.val[0]) };
+        }
+        template <class A>
+        XSIMD_INLINE std::pair<batch<int32_t, A>, batch<int32_t, A>>
+        mul_hilo(batch<int32_t, A> const& lhs, batch<int32_t, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            int64x2_t lo = vmull_s32(vget_low_s32(lhs), vget_low_s32(rhs));
+            int64x2_t hi = vmull_s32(vget_high_s32(lhs), vget_high_s32(rhs));
+            int32x4x2_t uzp = vuzpq_s32(vreinterpretq_s32_s64(lo), vreinterpretq_s32_s64(hi));
+            return { batch<int32_t, A>(uzp.val[1]), batch<int32_t, A>(uzp.val[0]) };
+        }
+        template <class A>
+        XSIMD_INLINE std::pair<batch<uint32_t, A>, batch<uint32_t, A>>
+        mul_hilo(batch<uint32_t, A> const& lhs, batch<uint32_t, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            uint64x2_t lo = vmull_u32(vget_low_u32(lhs), vget_low_u32(rhs));
+            uint64x2_t hi = vmull_u32(vget_high_u32(lhs), vget_high_u32(rhs));
+            uint32x4x2_t uzp = vuzpq_u32(vreinterpretq_u32_u64(lo), vreinterpretq_u32_u64(hi));
+            return { batch<uint32_t, A>(uzp.val[1]), batch<uint32_t, A>(uzp.val[0]) };
         }
 
         /*******
@@ -1008,29 +1183,39 @@ namespace xsimd
          * eq *
          ******/
 
-        WRAP_BINARY_INT_EXCLUDING_64(vceqq, detail::comp_return_type)
-        WRAP_BINARY_FLOAT(vceqq, detail::comp_return_type)
+        namespace wrap
+        {
+            // TODO(c++17): Make a single function with if constexpr switch
+            // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC)
+            // the vector types are all aliases of the same type.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vceqq(uint8x16_t a, uint8x16_t b) noexcept { return vceqq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vceqq(int8x16_t a, int8x16_t b) noexcept { return vceqq_s8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vceqq(uint16x8_t a, uint16x8_t b) noexcept { return vceqq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vceqq(int16x8_t a, int16x8_t b) noexcept { return vceqq_s16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vceqq(uint32x4_t a, uint32x4_t b) noexcept { return vceqq_u32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vceqq(int32x4_t a, int32x4_t b) noexcept { return vceqq_s32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vceqq(float32x4_t a, float32x4_t b) noexcept { return vceqq_f32(a, b); }
+        }
 
         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
-                std::make_tuple(wrap::vceqq_u8, wrap::vceqq_s8, wrap::vceqq_u16, wrap::vceqq_s16,
-                                wrap::vceqq_u32, wrap::vceqq_s32, wrap::vceqq_f32)
-            };
-            return dispatcher.apply(register_type(lhs), register_type(rhs));
+            return wrap::x_vceqq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs));
         }
 
         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch_bool<T, A>::register_type;
-            using dispatcher_type = detail::neon_comp_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary;
-            const dispatcher_type dispatcher = {
-                std::make_tuple(wrap::vceqq_u8, wrap::vceqq_u16, wrap::vceqq_u32)
-            };
-            return dispatcher.apply(register_type(lhs), register_type(rhs));
+            return wrap::x_vceqq<sized_uint_t<sizeof(T)>>(register_type(lhs), register_type(rhs));
         }
 
         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
@@ -1093,18 +1278,32 @@ namespace xsimd
          * lt *
          ******/
 
-        WRAP_BINARY_INT_EXCLUDING_64(vcltq, detail::comp_return_type)
-        WRAP_BINARY_FLOAT(vcltq, detail::comp_return_type)
+        namespace wrap
+        {
+            // TODO(c++17): Make a single function with if constexpr switch
+            // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC)
+            // the vector types are all aliases of the same type.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vcltq(uint8x16_t a, uint8x16_t b) noexcept { return vcltq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vcltq(int8x16_t a, int8x16_t b) noexcept { return vcltq_s8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vcltq(uint16x8_t a, uint16x8_t b) noexcept { return vcltq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vcltq(int16x8_t a, int16x8_t b) noexcept { return vcltq_s16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vcltq(uint32x4_t a, uint32x4_t b) noexcept { return vcltq_u32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vcltq(int32x4_t a, int32x4_t b) noexcept { return vcltq_s32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vcltq(float32x4_t a, float32x4_t b) noexcept { return vcltq_f32(a, b); }
+        }
 
         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
-                std::make_tuple(wrap::vcltq_u8, wrap::vcltq_s8, wrap::vcltq_u16, wrap::vcltq_s16,
-                                wrap::vcltq_u32, wrap::vcltq_s32, wrap::vcltq_f32)
-            };
-            return dispatcher.apply(register_type(lhs), register_type(rhs));
+            return wrap::x_vcltq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs));
         }
 
         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
@@ -1126,18 +1325,32 @@ namespace xsimd
          * le *
          ******/
 
-        WRAP_BINARY_INT_EXCLUDING_64(vcleq, detail::comp_return_type)
-        WRAP_BINARY_FLOAT(vcleq, detail::comp_return_type)
+        namespace wrap
+        {
+            // TODO(c++17): Make a single function with if constexpr switch
+            // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC)
+            // the vector types are all aliases of the same type.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vcleq(uint8x16_t a, uint8x16_t b) noexcept { return vcleq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vcleq(int8x16_t a, int8x16_t b) noexcept { return vcleq_s8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vcleq(uint16x8_t a, uint16x8_t b) noexcept { return vcleq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vcleq(int16x8_t a, int16x8_t b) noexcept { return vcleq_s16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vcleq(uint32x4_t a, uint32x4_t b) noexcept { return vcleq_u32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vcleq(int32x4_t a, int32x4_t b) noexcept { return vcleq_s32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vcleq(float32x4_t a, float32x4_t b) noexcept { return vcleq_f32(a, b); }
+        }
 
         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
-                std::make_tuple(wrap::vcleq_u8, wrap::vcleq_s8, wrap::vcleq_u16, wrap::vcleq_s16,
-                                wrap::vcleq_u32, wrap::vcleq_s32, wrap::vcleq_f32)
-            };
-            return dispatcher.apply(register_type(lhs), register_type(rhs));
+            return wrap::x_vcleq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs));
         }
 
         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
@@ -1149,31 +1362,33 @@ namespace xsimd
         /******
          * gt *
          ******/
-        namespace detail
-        {
-            XSIMD_INLINE int64x2_t bitwise_not_s64(int64x2_t arg) noexcept
-            {
-                return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg)));
-            }
 
-            XSIMD_INLINE uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept
-            {
-                return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
-            }
+        namespace wrap
+        {
+            // TODO(c++17): Make a single function with if constexpr switch
+            // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC)
+            // the vector types are all aliases of the same type.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vcgtq(uint8x16_t a, uint8x16_t b) noexcept { return vcgtq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vcgtq(int8x16_t a, int8x16_t b) noexcept { return vcgtq_s8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vcgtq(uint16x8_t a, uint16x8_t b) noexcept { return vcgtq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vcgtq(int16x8_t a, int16x8_t b) noexcept { return vcgtq_s16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vcgtq(uint32x4_t a, uint32x4_t b) noexcept { return vcgtq_u32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vcgtq(int32x4_t a, int32x4_t b) noexcept { return vcgtq_s32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vcgtq(float32x4_t a, float32x4_t b) noexcept { return vcgtq_f32(a, b); }
         }
 
-        WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type)
-        WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type)
-
         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
-                std::make_tuple(wrap::vcgtq_u8, wrap::vcgtq_s8, wrap::vcgtq_u16, wrap::vcgtq_s16,
-                                wrap::vcgtq_u32, wrap::vcgtq_s32, wrap::vcgtq_f32)
-            };
-            return dispatcher.apply(register_type(lhs), register_type(rhs));
+            return wrap::x_vcgtq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs));
         }
 
         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
@@ -1195,18 +1410,32 @@ namespace xsimd
          * ge *
          ******/
 
-        WRAP_BINARY_INT_EXCLUDING_64(vcgeq, detail::comp_return_type)
-        WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type)
+        namespace wrap
+        {
+            // TODO(c++17): Make a single function with if constexpr switch
+            // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC)
+            // the vector types are all aliases of the same type.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vcgeq(uint8x16_t a, uint8x16_t b) noexcept { return vcgeq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vcgeq(int8x16_t a, int8x16_t b) noexcept { return vcgeq_s8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vcgeq(uint16x8_t a, uint16x8_t b) noexcept { return vcgeq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vcgeq(int16x8_t a, int16x8_t b) noexcept { return vcgeq_s16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vcgeq(uint32x4_t a, uint32x4_t b) noexcept { return vcgeq_u32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vcgeq(int32x4_t a, int32x4_t b) noexcept { return vcgeq_s32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vcgeq(float32x4_t a, float32x4_t b) noexcept { return vcgeq_f32(a, b); }
+        }
 
         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
-                std::make_tuple(wrap::vcgeq_u8, wrap::vcgeq_s8, wrap::vcgeq_u16, wrap::vcgeq_s16,
-                                wrap::vcgeq_u32, wrap::vcgeq_s32, wrap::vcgeq_f32)
-            };
-            return dispatcher.apply(register_type(lhs), register_type(rhs));
+            return wrap::x_vcgeq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs));
         }
 
         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
@@ -1230,25 +1459,32 @@ namespace xsimd
          * bitwise_and *
          ***************/
 
-        WRAP_BINARY_INT(vandq, detail::identity_return_type)
-
-        namespace detail
+        namespace wrap
         {
-            XSIMD_INLINE float32x4_t bitwise_and_f32(float32x4_t lhs, float32x4_t rhs) noexcept
-            {
-                return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(lhs),
-                                                       vreinterpretq_u32_f32(rhs)));
-            }
-
-            template <class V>
-            V bitwise_and_neon(V const& lhs, V const& rhs)
-            {
-                const neon_dispatcher::binary dispatcher = {
-                    std::make_tuple(wrap::vandq_u8, wrap::vandq_s8, wrap::vandq_u16, wrap::vandq_s16,
-                                    wrap::vandq_u32, wrap::vandq_s32, wrap::vandq_u64, wrap::vandq_s64,
-                                    bitwise_and_f32)
-                };
-                return dispatcher.apply(lhs, rhs);
+            // TODO(c++17): Make a single function with if constexpr switch
+            // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC)
+            // the vector types are all aliases of the same type.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vandq(uint8x16_t a, uint8x16_t b) noexcept { return vandq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vandq(int8x16_t a, int8x16_t b) noexcept { return vandq_s8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vandq(uint16x8_t a, uint16x8_t b) noexcept { return vandq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vandq(int16x8_t a, int16x8_t b) noexcept { return vandq_s16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vandq(uint32x4_t a, uint32x4_t b) noexcept { return vandq_u32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vandq(int32x4_t a, int32x4_t b) noexcept { return vandq_s32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vandq(uint64x2_t a, uint64x2_t b) noexcept { return vandq_u64(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vandq(int64x2_t a, int64x2_t b) noexcept { return vandq_s64(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vandq(float32x4_t a, float32x4_t b) noexcept
+            {
+                return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),
+                                                       vreinterpretq_u32_f32(b)));
             }
         }
 
@@ -1256,39 +1492,46 @@ namespace xsimd
         XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
+            return wrap::x_vandq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs));
         }
 
         template <class A, class T, detail::enable_neon_type_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch_bool<T, A>::register_type;
-            return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
+            return wrap::x_vandq<sized_uint_t<sizeof(T)>>(register_type(lhs), register_type(rhs));
         }
 
         /**************
          * bitwise_or *
          **************/
 
-        WRAP_BINARY_INT(vorrq, detail::identity_return_type)
-
-        namespace detail
+        namespace wrap
         {
-            XSIMD_INLINE float32x4_t bitwise_or_f32(float32x4_t lhs, float32x4_t rhs) noexcept
-            {
-                return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(lhs),
-                                                       vreinterpretq_u32_f32(rhs)));
-            }
-
-            template <class V>
-            XSIMD_INLINE V bitwise_or_neon(V const& lhs, V const& rhs) noexcept
-            {
-                const neon_dispatcher::binary dispatcher = {
-                    std::make_tuple(wrap::vorrq_u8, wrap::vorrq_s8, wrap::vorrq_u16, wrap::vorrq_s16,
-                                    wrap::vorrq_u32, wrap::vorrq_s32, wrap::vorrq_u64, wrap::vorrq_s64,
-                                    bitwise_or_f32)
-                };
-                return dispatcher.apply(lhs, rhs);
+            // TODO(c++17): Make a single function with if constexpr switch
+            // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC)
+            // the vector types are all aliases of the same type.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vorrq(uint8x16_t a, uint8x16_t b) noexcept { return vorrq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vorrq(int8x16_t a, int8x16_t b) noexcept { return vorrq_s8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vorrq(uint16x8_t a, uint16x8_t b) noexcept { return vorrq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vorrq(int16x8_t a, int16x8_t b) noexcept { return vorrq_s16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vorrq(uint32x4_t a, uint32x4_t b) noexcept { return vorrq_u32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vorrq(int32x4_t a, int32x4_t b) noexcept { return vorrq_s32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vorrq(uint64x2_t a, uint64x2_t b) noexcept { return vorrq_u64(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vorrq(int64x2_t a, int64x2_t b) noexcept { return vorrq_s64(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vorrq(float32x4_t a, float32x4_t b) noexcept
+            {
+                return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),
+                                                       vreinterpretq_u32_f32(b)));
             }
         }
 
@@ -1296,39 +1539,46 @@ namespace xsimd
         XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
+            return wrap::x_vorrq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs));
         }
 
         template <class A, class T, detail::enable_neon_type_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch_bool<T, A>::register_type;
-            return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
+            return wrap::x_vorrq<sized_uint_t<sizeof(T)>>(register_type(lhs), register_type(rhs)); // explicit scalar tag: the overload is not deduced from the register type
         }
 
         /***************
          * bitwise_xor *
          ***************/
 
-        WRAP_BINARY_INT(veorq, detail::identity_return_type)
-
-        namespace detail
+        namespace wrap
         {
-            XSIMD_INLINE float32x4_t bitwise_xor_f32(float32x4_t lhs, float32x4_t rhs) noexcept
-            {
-                return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(lhs),
-                                                       vreinterpretq_u32_f32(rhs)));
-            }
-
-            template <class V>
-            XSIMD_INLINE V bitwise_xor_neon(V const& lhs, V const& rhs) noexcept
-            {
-                const neon_dispatcher::binary dispatcher = {
-                    std::make_tuple(wrap::veorq_u8, wrap::veorq_s8, wrap::veorq_u16, wrap::veorq_s16,
-                                    wrap::veorq_u32, wrap::veorq_s32, wrap::veorq_u64, wrap::veorq_s64,
-                                    bitwise_xor_f32)
-                };
-                return dispatcher.apply(lhs, rhs);
+            // TODO(c++17): Make a single function with if constexpr switch
+            // x_veorq overloads are selected by the scalar type `T` given explicitly by the caller, because in some
+            // compilers (e.g. MSVC) the vector types are all aliases of the same type. float is XORed through u32.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_veorq(uint8x16_t a, uint8x16_t b) noexcept { return veorq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_veorq(int8x16_t a, int8x16_t b) noexcept { return veorq_s8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_veorq(uint16x8_t a, uint16x8_t b) noexcept { return veorq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_veorq(int16x8_t a, int16x8_t b) noexcept { return veorq_s16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_veorq(uint32x4_t a, uint32x4_t b) noexcept { return veorq_u32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_veorq(int32x4_t a, int32x4_t b) noexcept { return veorq_s32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_veorq(uint64x2_t a, uint64x2_t b) noexcept { return veorq_u64(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_veorq(int64x2_t a, int64x2_t b) noexcept { return veorq_s64(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_veorq(float32x4_t a, float32x4_t b) noexcept
+            {
+                return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),
+                                                       vreinterpretq_u32_f32(b)));
             }
         }
 
@@ -1336,14 +1586,14 @@ namespace xsimd
         XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
+            return wrap::x_veorq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs));
         }
 
         template <class A, class T, detail::enable_neon_type_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch_bool<T, A>::register_type;
-            return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
+            return wrap::x_veorq<sized_uint_t<sizeof(T)>>(register_type(lhs), register_type(rhs)); // explicit scalar tag: the overload is not deduced from the register type
         }
 
         /*******
@@ -1360,25 +1610,37 @@ namespace xsimd
          * bitwise_not *
          ***************/
 
-        WRAP_UNARY_INT_EXCLUDING_64(vmvnq)
-
-        namespace detail
+        namespace wrap
         {
-            XSIMD_INLINE float32x4_t bitwise_not_f32(float32x4_t arg) noexcept
+            // TODO(c++17): Make a single function with if constexpr switch
+            // x_vmvnq overloads are selected by the scalar type `T` given explicitly by the caller, because in some
+            // compilers (e.g. MSVC) the vector types are all aliases of the same type. 64-bit and float go through 32-bit ops.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vmvnq(uint8x16_t a) noexcept { return vmvnq_u8(a); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vmvnq(int8x16_t a) noexcept { return vmvnq_s8(a); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vmvnq(uint16x8_t a) noexcept { return vmvnq_u16(a); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vmvnq(int16x8_t a) noexcept { return vmvnq_s16(a); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vmvnq(uint32x4_t a) noexcept { return vmvnq_u32(a); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vmvnq(int32x4_t a) noexcept { return vmvnq_s32(a); }
+            template <class T, std::enable_if_t<std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vmvnq(uint64x2_t a) noexcept
             {
-                return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg)));
+                return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(a)));
             }
-
-            template <class V>
-            XSIMD_INLINE V bitwise_not_neon(V const& arg) noexcept
+            template <class T, std::enable_if_t<std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vmvnq(int64x2_t a) noexcept
             {
-                const neon_dispatcher::unary dispatcher = {
-                    std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16,
-                                    wrap::vmvnq_u32, wrap::vmvnq_s32,
-                                    bitwise_not_u64, bitwise_not_s64,
-                                    bitwise_not_f32)
-                };
-                return dispatcher.apply(arg);
+                return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a)));
+            }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vmvnq(float32x4_t a) noexcept
+            {
+                return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(a)));
             }
         }
 
@@ -1386,38 +1648,45 @@ namespace xsimd
         XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            return detail::bitwise_not_neon(register_type(arg));
+            return wrap::x_vmvnq<map_to_sized_type_t<T>>(register_type(arg));
         }
 
         template <class A, class T, detail::enable_neon_type_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
         {
             using register_type = typename batch_bool<T, A>::register_type;
-            return detail::bitwise_not_neon(register_type(arg));
+            return wrap::x_vmvnq<sized_uint_t<sizeof(T)>>(register_type(arg)); // explicit scalar tag: the overload is not deduced from the register type
         }
 
         /******************
          * bitwise_andnot *
          ******************/
 
-        WRAP_BINARY_INT(vbicq, detail::identity_return_type)
-
-        namespace detail
+        namespace wrap
         {
-            XSIMD_INLINE float32x4_t bitwise_andnot_f32(float32x4_t lhs, float32x4_t rhs) noexcept
-            {
-                return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs)));
-            }
-
-            template <class V>
-            XSIMD_INLINE V bitwise_andnot_neon(V const& lhs, V const& rhs) noexcept
-            {
-                const detail::neon_dispatcher::binary dispatcher = {
-                    std::make_tuple(wrap::vbicq_u8, wrap::vbicq_s8, wrap::vbicq_u16, wrap::vbicq_s16,
-                                    wrap::vbicq_u32, wrap::vbicq_s32, wrap::vbicq_u64, wrap::vbicq_s64,
-                                    bitwise_andnot_f32)
-                };
-                return dispatcher.apply(lhs, rhs);
+            // TODO(c++17): Make a single function with if constexpr switch
+            // x_vbicq overloads are selected by the scalar type `T` given explicitly by the caller, because in some
+            // compilers (e.g. MSVC) the vector types are all aliases of the same type. float goes through vbicq_u32.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vbicq(uint8x16_t a, uint8x16_t b) noexcept { return vbicq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vbicq(int8x16_t a, int8x16_t b) noexcept { return vbicq_s8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vbicq(uint16x8_t a, uint16x8_t b) noexcept { return vbicq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vbicq(int16x8_t a, int16x8_t b) noexcept { return vbicq_s16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vbicq(uint32x4_t a, uint32x4_t b) noexcept { return vbicq_u32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vbicq(int32x4_t a, int32x4_t b) noexcept { return vbicq_s32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vbicq(uint64x2_t a, uint64x2_t b) noexcept { return vbicq_u64(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vbicq(int64x2_t a, int64x2_t b) noexcept { return vbicq_s64(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vbicq(float32x4_t a, float32x4_t b) noexcept
+            {
+                return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
             }
         }
 
@@ -1425,32 +1694,46 @@ namespace xsimd
         XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
+            return wrap::x_vbicq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs));
         }
 
         template <class A, class T, detail::enable_neon_type_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch_bool<T, A>::register_type;
-            return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
+            return wrap::x_vbicq<sized_uint_t<sizeof(T)>>(register_type(lhs), register_type(rhs)); // explicit scalar tag: the overload is not deduced from the register type
         }
 
         /*******
          * min *
          *******/
 
-        WRAP_BINARY_INT_EXCLUDING_64(vminq, detail::identity_return_type)
-        WRAP_BINARY_FLOAT(vminq, detail::identity_return_type)
+        namespace wrap
+        {
+            // TODO(c++17): Make a single function with if constexpr switch
+            // x_vminq overloads are selected by the scalar type `T` given explicitly by the caller, because in some
+            // compilers (e.g. MSVC) the vector types are all aliases of the same type. No 64-bit integer overloads here.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vminq(uint8x16_t a, uint8x16_t b) noexcept { return vminq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vminq(int8x16_t a, int8x16_t b) noexcept { return vminq_s8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vminq(uint16x8_t a, uint16x8_t b) noexcept { return vminq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vminq(int16x8_t a, int16x8_t b) noexcept { return vminq_s16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vminq(uint32x4_t a, uint32x4_t b) noexcept { return vminq_u32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vminq(int32x4_t a, int32x4_t b) noexcept { return vminq_s32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vminq(float32x4_t a, float32x4_t b) noexcept { return vminq_f32(a, b); }
+        }
 
         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
         XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::excluding_int64_dispatcher::binary dispatcher = {
-                std::make_tuple(wrap::vminq_u8, wrap::vminq_s8, wrap::vminq_u16, wrap::vminq_s16,
-                                wrap::vminq_u32, wrap::vminq_s32, wrap::vminq_f32)
-            };
-            return dispatcher.apply(register_type(lhs), register_type(rhs));
+            return wrap::x_vminq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs)); // explicit scalar tag: the overload is not deduced from the register type
         }
 
         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
@@ -1463,18 +1746,32 @@ namespace xsimd
          * max *
          *******/
 
-        WRAP_BINARY_INT_EXCLUDING_64(vmaxq, detail::identity_return_type)
-        WRAP_BINARY_FLOAT(vmaxq, detail::identity_return_type)
+        namespace wrap
+        {
+            // TODO(c++17): Make a single function with if constexpr switch
+            // x_vmaxq overloads are selected by the scalar type `T` given explicitly by the caller, because in some
+            // compilers (e.g. MSVC) the vector types are all aliases of the same type. No 64-bit integer overloads here.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vmaxq(uint8x16_t a, uint8x16_t b) noexcept { return vmaxq_u8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vmaxq(int8x16_t a, int8x16_t b) noexcept { return vmaxq_s8(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vmaxq(uint16x8_t a, uint16x8_t b) noexcept { return vmaxq_u16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vmaxq(int16x8_t a, int16x8_t b) noexcept { return vmaxq_s16(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vmaxq(uint32x4_t a, uint32x4_t b) noexcept { return vmaxq_u32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vmaxq(int32x4_t a, int32x4_t b) noexcept { return vmaxq_s32(a, b); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vmaxq(float32x4_t a, float32x4_t b) noexcept { return vmaxq_f32(a, b); }
+        }
 
         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
         XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::excluding_int64_dispatcher::binary dispatcher = {
-                std::make_tuple(wrap::vmaxq_u8, wrap::vmaxq_s8, wrap::vmaxq_u16, wrap::vmaxq_s16,
-                                wrap::vmaxq_u32, wrap::vmaxq_s32, wrap::vmaxq_f32)
-            };
-            return dispatcher.apply(register_type(lhs), register_type(rhs));
+            return wrap::x_vmaxq<map_to_sized_type_t<T>>(register_type(lhs), register_type(rhs)); // explicit scalar tag: the overload is not deduced from the register type
         }
 
         template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
@@ -1489,39 +1786,30 @@ namespace xsimd
 
         namespace wrap
         {
-            XSIMD_INLINE int8x16_t vabsq_s8(int8x16_t a) noexcept { return ::vabsq_s8(a); }
-            XSIMD_INLINE int16x8_t vabsq_s16(int16x8_t a) noexcept { return ::vabsq_s16(a); }
-            XSIMD_INLINE int32x4_t vabsq_s32(int32x4_t a) noexcept { return ::vabsq_s32(a); }
-        }
-        WRAP_UNARY_FLOAT(vabsq)
-
-        namespace detail
-        {
-            XSIMD_INLINE uint8x16_t abs_u8(uint8x16_t arg) noexcept
-            {
-                return arg;
-            }
-
-            XSIMD_INLINE uint16x8_t abs_u16(uint16x8_t arg) noexcept
-            {
-                return arg;
-            }
-
-            XSIMD_INLINE uint32x4_t abs_u32(uint32x4_t arg) noexcept
-            {
-                return arg;
-            }
+            // TODO(c++17): Make a single function with if constexpr switch
+            // x_vabsq overloads are selected by the scalar type `T` given explicitly by the caller, because in some
+            // compilers (e.g. MSVC) the vector types are all aliases of the same type. Unsigned overloads return `a` as is.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vabsq(uint8x16_t a) noexcept { return a; }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vabsq(int8x16_t a) noexcept { return vabsq_s8(a); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vabsq(uint16x8_t a) noexcept { return a; }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vabsq(int16x8_t a) noexcept { return vabsq_s16(a); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vabsq(uint32x4_t a) noexcept { return a; }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vabsq(int32x4_t a) noexcept { return vabsq_s32(a); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vabsq(float32x4_t a) noexcept { return vabsq_f32(a); }
         }
 
         template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
         XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::excluding_int64_dispatcher::unary dispatcher = {
-                std::make_tuple(detail::abs_u8, wrap::vabsq_s8, detail::abs_u16, wrap::vabsq_s16,
-                                detail::abs_u32, wrap::vabsq_s32, wrap::vabsq_f32)
-            };
-            return dispatcher.apply(register_type(arg));
+            return wrap::x_vabsq<map_to_sized_type_t<T>>(register_type(arg)); // explicit scalar tag: the overload is not deduced from the register type
         }
 
         /********
@@ -1843,39 +2131,27 @@ namespace xsimd
 
         namespace wrap
         {
-            XSIMD_INLINE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) noexcept { return ::vbslq_u8(a, b, c); }
-            XSIMD_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) noexcept { return ::vbslq_s8(a, b, c); }
-            XSIMD_INLINE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) noexcept { return ::vbslq_u16(a, b, c); }
-            XSIMD_INLINE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) noexcept { return ::vbslq_s16(a, b, c); }
-            XSIMD_INLINE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) noexcept { return ::vbslq_u32(a, b, c); }
-            XSIMD_INLINE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) noexcept { return ::vbslq_s32(a, b, c); }
-            XSIMD_INLINE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) noexcept { return ::vbslq_u64(a, b, c); }
-            XSIMD_INLINE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) noexcept { return ::vbslq_s64(a, b, c); }
-            XSIMD_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) noexcept { return ::vbslq_f32(a, b, c); }
-        }
-
-        namespace detail
-        {
-            template <class... T>
-            struct neon_select_dispatcher_impl
-            {
-                using container_type = std::tuple<T (*)(comp_return_type<T>, T, T)...>;
-                const container_type m_func;
-
-                template <class U>
-                U apply(comp_return_type<U> cond, U lhs, U rhs) const noexcept
-                {
-                    using func_type = U (*)(comp_return_type<U>, U, U);
-                    auto func = std::get<func_type>(m_func);
-                    return func(cond, lhs, rhs);
-                }
-            };
-
-            using neon_select_dispatcher = neon_select_dispatcher_impl<uint8x16_t, int8x16_t,
-                                                                       uint16x8_t, int16x8_t,
-                                                                       uint32x4_t, int32x4_t,
-                                                                       uint64x2_t, int64x2_t,
-                                                                       float32x4_t>;
+            // TODO(c++17): Make a single function with if constexpr switch
+            // x_vbslq overloads are selected by the scalar type `T` given explicitly by the caller, because in some
+            // compilers (e.g. MSVC) the vector types are all aliases of the same type. `a` is always an unsigned vector.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vbslq(uint8x16_t a, uint8x16_t b, uint8x16_t c) noexcept { return vbslq_u8(a, b, c); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vbslq(uint8x16_t a, int8x16_t b, int8x16_t c) noexcept { return vbslq_s8(a, b, c); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vbslq(uint16x8_t a, uint16x8_t b, uint16x8_t c) noexcept { return vbslq_u16(a, b, c); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vbslq(uint16x8_t a, int16x8_t b, int16x8_t c) noexcept { return vbslq_s16(a, b, c); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vbslq(uint32x4_t a, uint32x4_t b, uint32x4_t c) noexcept { return vbslq_u32(a, b, c); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vbslq(uint32x4_t a, int32x4_t b, int32x4_t c) noexcept { return vbslq_s32(a, b, c); }
+            template <class T, std::enable_if_t<std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vbslq(uint64x2_t a, uint64x2_t b, uint64x2_t c) noexcept { return vbslq_u64(a, b, c); }
+            template <class T, std::enable_if_t<std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vbslq(uint64x2_t a, int64x2_t b, int64x2_t c) noexcept { return vbslq_s64(a, b, c); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vbslq(uint32x4_t a, float32x4_t b, float32x4_t c) noexcept { return vbslq_f32(a, b, c); }
         }
 
         template <class A, class T, detail::enable_neon_type_t<T> = 0>
@@ -1883,12 +2159,7 @@ namespace xsimd
         {
             using bool_register_type = typename batch_bool<T, A>::register_type;
             using register_type = typename batch<T, A>::register_type;
-            const detail::neon_select_dispatcher dispatcher = {
-                std::make_tuple(wrap::vbslq_u8, wrap::vbslq_s8, wrap::vbslq_u16, wrap::vbslq_s16,
-                                wrap::vbslq_u32, wrap::vbslq_s32, wrap::vbslq_u64, wrap::vbslq_s64,
-                                wrap::vbslq_f32)
-            };
-            return dispatcher.apply(bool_register_type(cond), register_type(a), register_type(b));
+            return wrap::x_vbslq<map_to_sized_type_t<T>>(bool_register_type(cond), register_type(a), register_type(b));
         }
 
         template <class A, class T, bool... b, detail::enable_neon_type_t<T> = 0>
@@ -2371,6 +2642,15 @@ namespace xsimd
                     return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
                 }
             }
+
+            // True when every lane of `b`, read as a signed count, is >= 0 (a plain `x >= 0` is vacuous for unsigned T).
+            template <class A, class T>
+            XSIMD_INLINE bool shifts_all_positive(batch<T, A> const& b) noexcept
+            {
+                std::array<T, batch<T, A>::size> tmp = {};
+                b.store_unaligned(tmp.data()); // data(), not begin(): array iterators are not raw pointers on every standard library
+                return std::all_of(tmp.begin(), tmp.end(), [](T x) { return static_cast<std::make_signed_t<T>>(x) >= 0; });
+            }
         }
 
         template <class A, class T>
@@ -2382,9 +2662,11 @@ namespace xsimd
         }
 
         template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
-        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return vshlq_u8(lhs, rhs);
+            // vshlq_u8 takes signed counts: rhs is reinterpreted blindly since out of bounds shifts are UB anyways
+            assert(detail::shifts_all_positive(rhs));
+            return vshlq_u8(lhs, vreinterpretq_s8_u8(rhs));
         }
 
         template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
@@ -2394,9 +2676,11 @@ namespace xsimd
         }
 
         template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
-        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return vshlq_u16(lhs, rhs);
+            // vshlq_u16 takes signed counts: rhs is reinterpreted blindly since out of bounds shifts are UB anyways
+            assert(detail::shifts_all_positive(rhs));
+            return vshlq_u16(lhs, vreinterpretq_s16_u16(rhs));
         }
 
         template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
@@ -2406,9 +2690,11 @@ namespace xsimd
         }
 
         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
-        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return vshlq_u32(lhs, rhs);
+            // vshlq_u32 takes signed counts: rhs is reinterpreted blindly since out of bounds shifts are UB anyways
+            assert(detail::shifts_all_positive(rhs));
+            return vshlq_u32(lhs, vreinterpretq_s32_u32(rhs));
         }
 
         template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
@@ -2418,9 +2704,11 @@ namespace xsimd
         }
 
         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
-        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return vshlq_u64(lhs, rhs);
+            // vshlq_u64 takes signed counts: rhs is reinterpreted blindly since out of bounds shifts are UB anyways
+            assert(detail::shifts_all_positive(rhs));
+            return vshlq_u64(lhs, vreinterpretq_s64_u64(rhs));
         }
 
         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
@@ -2618,9 +2906,11 @@ namespace xsimd
         }
 
         template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
-        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return vshlq_u8(lhs, vnegq_s8(rhs));
+            // Right shift is vshlq_u8 by the negated count; rhs is reinterpreted as signed blindly since out of bounds shifts are UB anyways
+            assert(detail::shifts_all_positive(rhs));
+            return vshlq_u8(lhs, vnegq_s8(vreinterpretq_s8_u8(rhs)));
         }
 
         template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
@@ -2630,9 +2920,11 @@ namespace xsimd
         }
 
         template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
-        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return vshlq_u16(lhs, vnegq_s16(rhs));
+            // Right shift is vshlq_u16 by the negated count; rhs is reinterpreted as signed blindly since out of bounds shifts are UB anyways
+            assert(detail::shifts_all_positive(rhs));
+            return vshlq_u16(lhs, vnegq_s16(vreinterpretq_s16_u16(rhs)));
         }
 
         template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
@@ -2642,9 +2934,11 @@ namespace xsimd
         }
 
         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
-        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
         {
-            return vshlq_u32(lhs, vnegq_s32(rhs));
+            // Per-lane right shift of unsigned 32-bit lanes, expressed as vshlq_u32 by the negated counts. The counts are blindly reinterpreted as signed since out of bounds shifts are UB anyways; the assert only checks detail::shifts_all_positive(rhs).
+            assert(detail::shifts_all_positive(rhs));
+            return vshlq_u32(lhs, vnegq_s32(vreinterpretq_s32_u32(rhs)));
         }
 
         template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
@@ -2653,6 +2947,21 @@ namespace xsimd
             return vshlq_s32(lhs, vnegq_s32(rhs));
         }
 
+        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon> req) noexcept
+        {
+            // Per-lane right shift of unsigned 64-bit lanes, expressed as vshlq_u64 by the negated counts. Blindly converting to signed since out of bounds shifts are UB anyways
+            assert(detail::shifts_all_positive(rhs));
+            using S = std::make_signed_t<T>; // signed counterpart of T, used to wrap the reinterpreted counts in a batch
+            return vshlq_u64(lhs, neg(batch<S, A>(vreinterpretq_s64_u64(rhs)), req).data); // counts are negated by the neg() kernel rather than a vnegq_* intrinsic
+        }
+
+        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+        {
+            return vshlq_s64(lhs, neg(rhs, neon {}).data); // per-lane right shift of signed 64-bit lanes: vshlq_s64 by the counts negated through the neon neg() kernel
+        }
+
         // immediate variant
         template <size_t shift, class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& x, requires_arch<neon>) noexcept
@@ -2702,6 +3011,61 @@ namespace xsimd
             return vshrq_n_s64(x, shift);
         }
 
+        // get: return lane I (a compile-time index) of a batch as a scalar; one overload per element type (float, then 8/16/32/64-bit unsigned and signed), each forwarding to the matching vgetq_lane_* intrinsic
+        template <class A, size_t I>
+        XSIMD_INLINE float get(batch<float, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_f32(self, I);
+        }
+
+        template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_u8(self, I);
+        }
+
+        template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 1> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_s8(self, I);
+        }
+
+        template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_u16(self, I);
+        }
+
+        template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 2> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_s16(self, I);
+        }
+
+        template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_u32(self, I);
+        }
+
+        template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 4> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_s32(self, I);
+        }
+
+        template <class A, size_t I, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_u64(self, I);
+        }
+
+        template <class A, size_t I, class T, detail::enable_sized_signed_t<T, 8> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<neon>) noexcept
+        {
+            return vgetq_lane_s64(self, I);
+        }
+
         // first
         template <class A>
         XSIMD_INLINE float first(batch<float, A> const& self, requires_arch<neon>) noexcept
@@ -2817,157 +3181,6 @@ namespace xsimd
             return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {});
         }
 
-        /****************
-         * bitwise_cast *
-         ****************/
-
-#define WRAP_CAST(SUFFIX, TYPE)                                                \
-    namespace wrap                                                             \
-    {                                                                          \
-        XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u8(uint8x16_t a) noexcept   \
-        {                                                                      \
-            return ::vreinterpretq_##SUFFIX##_u8(a);                           \
-        }                                                                      \
-        XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s8(int8x16_t a) noexcept    \
-        {                                                                      \
-            return ::vreinterpretq_##SUFFIX##_s8(a);                           \
-        }                                                                      \
-        XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u16(uint16x8_t a) noexcept  \
-        {                                                                      \
-            return ::vreinterpretq_##SUFFIX##_u16(a);                          \
-        }                                                                      \
-        XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s16(int16x8_t a) noexcept   \
-        {                                                                      \
-            return ::vreinterpretq_##SUFFIX##_s16(a);                          \
-        }                                                                      \
-        XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u32(uint32x4_t a) noexcept  \
-        {                                                                      \
-            return ::vreinterpretq_##SUFFIX##_u32(a);                          \
-        }                                                                      \
-        XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s32(int32x4_t a) noexcept   \
-        {                                                                      \
-            return ::vreinterpretq_##SUFFIX##_s32(a);                          \
-        }                                                                      \
-        XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_u64(uint64x2_t a) noexcept  \
-        {                                                                      \
-            return ::vreinterpretq_##SUFFIX##_u64(a);                          \
-        }                                                                      \
-        XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_s64(int64x2_t a) noexcept   \
-        {                                                                      \
-            return ::vreinterpretq_##SUFFIX##_s64(a);                          \
-        }                                                                      \
-        XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_f32(float32x4_t a) noexcept \
-        {                                                                      \
-            return ::vreinterpretq_##SUFFIX##_f32(a);                          \
-        }                                                                      \
-    }
-
-        WRAP_CAST(u8, uint8x16_t)
-        WRAP_CAST(s8, int8x16_t)
-        WRAP_CAST(u16, uint16x8_t)
-        WRAP_CAST(s16, int16x8_t)
-        WRAP_CAST(u32, uint32x4_t)
-        WRAP_CAST(s32, int32x4_t)
-        WRAP_CAST(u64, uint64x2_t)
-        WRAP_CAST(s64, int64x2_t)
-        WRAP_CAST(f32, float32x4_t)
-
-#undef WRAP_CAST
-
-        namespace detail
-        {
-            template <class R, class... T>
-            struct bitwise_caster_impl
-            {
-                using container_type = std::tuple<R (*)(T)...>;
-                container_type m_func;
-
-                template <class U>
-                R apply(U rhs) const noexcept
-                {
-                    using func_type = R (*)(U);
-                    auto func = std::get<func_type>(m_func);
-                    return func(rhs);
-                }
-            };
-
-            template <class R, class... T>
-            XSIMD_INLINE const bitwise_caster_impl<R, T...> make_bitwise_caster_impl(R (*... arg)(T)) noexcept
-            {
-                return { std::make_tuple(arg...) };
-            }
-
-            template <class... T>
-            struct type_list
-            {
-            };
-
-            template <class RTL, class TTL>
-            struct bitwise_caster;
-
-            template <class... R, class... T>
-            struct bitwise_caster<type_list<R...>, type_list<T...>>
-            {
-                using container_type = std::tuple<bitwise_caster_impl<R, T...>...>;
-                container_type m_caster;
-
-                template <class V, class U>
-                V apply(U rhs) const noexcept
-                {
-                    using caster_type = bitwise_caster_impl<V, T...>;
-                    auto caster = std::get<caster_type>(m_caster);
-                    return caster.apply(rhs);
-                }
-            };
-
-            template <class... T>
-            using bitwise_caster_t = bitwise_caster<type_list<T...>, type_list<T...>>;
-
-            using neon_bitwise_caster = bitwise_caster_t<uint8x16_t, int8x16_t,
-                                                         uint16x8_t, int16x8_t,
-                                                         uint32x4_t, int32x4_t,
-                                                         uint64x2_t, int64x2_t,
-                                                         float32x4_t>;
-        }
-
-        template <class A, class T, class R>
-        XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<neon>) noexcept
-        {
-            const detail::neon_bitwise_caster caster = {
-                std::make_tuple(
-                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_u8_u8, wrap::vreinterpretq_u8_s8, wrap::vreinterpretq_u8_u16, wrap::vreinterpretq_u8_s16,
-                                                     wrap::vreinterpretq_u8_u32, wrap::vreinterpretq_u8_s32, wrap::vreinterpretq_u8_u64, wrap::vreinterpretq_u8_s64,
-                                                     wrap::vreinterpretq_u8_f32),
-                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_s8_u8, wrap::vreinterpretq_s8_s8, wrap::vreinterpretq_s8_u16, wrap::vreinterpretq_s8_s16,
-                                                     wrap::vreinterpretq_s8_u32, wrap::vreinterpretq_s8_s32, wrap::vreinterpretq_s8_u64, wrap::vreinterpretq_s8_s64,
-                                                     wrap::vreinterpretq_s8_f32),
-                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_u16_u8, wrap::vreinterpretq_u16_s8, wrap::vreinterpretq_u16_u16, wrap::vreinterpretq_u16_s16,
-                                                     wrap::vreinterpretq_u16_u32, wrap::vreinterpretq_u16_s32, wrap::vreinterpretq_u16_u64, wrap::vreinterpretq_u16_s64,
-                                                     wrap::vreinterpretq_u16_f32),
-                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_s16_u8, wrap::vreinterpretq_s16_s8, wrap::vreinterpretq_s16_u16, wrap::vreinterpretq_s16_s16,
-                                                     wrap::vreinterpretq_s16_u32, wrap::vreinterpretq_s16_s32, wrap::vreinterpretq_s16_u64, wrap::vreinterpretq_s16_s64,
-                                                     wrap::vreinterpretq_s16_f32),
-                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_u32_u8, wrap::vreinterpretq_u32_s8, wrap::vreinterpretq_u32_u16, wrap::vreinterpretq_u32_s16,
-                                                     wrap::vreinterpretq_u32_u32, wrap::vreinterpretq_u32_s32, wrap::vreinterpretq_u32_u64, wrap::vreinterpretq_u32_s64,
-                                                     wrap::vreinterpretq_u32_f32),
-                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_s32_u8, wrap::vreinterpretq_s32_s8, wrap::vreinterpretq_s32_u16, wrap::vreinterpretq_s32_s16,
-                                                     wrap::vreinterpretq_s32_u32, wrap::vreinterpretq_s32_s32, wrap::vreinterpretq_s32_u64, wrap::vreinterpretq_s32_s64,
-                                                     wrap::vreinterpretq_s32_f32),
-                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_u64_u8, wrap::vreinterpretq_u64_s8, wrap::vreinterpretq_u64_u16, wrap::vreinterpretq_u64_s16,
-                                                     wrap::vreinterpretq_u64_u32, wrap::vreinterpretq_u64_s32, wrap::vreinterpretq_u64_u64, wrap::vreinterpretq_u64_s64,
-                                                     wrap::vreinterpretq_u64_f32),
-                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_s64_u8, wrap::vreinterpretq_s64_s8, wrap::vreinterpretq_s64_u16, wrap::vreinterpretq_s64_s16,
-                                                     wrap::vreinterpretq_s64_u32, wrap::vreinterpretq_s64_s32, wrap::vreinterpretq_s64_u64, wrap::vreinterpretq_s64_s64,
-                                                     wrap::vreinterpretq_s64_f32),
-                    detail::make_bitwise_caster_impl(wrap::vreinterpretq_f32_u8, wrap::vreinterpretq_f32_s8, wrap::vreinterpretq_f32_u16, wrap::vreinterpretq_f32_s16,
-                                                     wrap::vreinterpretq_f32_u32, wrap::vreinterpretq_f32_s32, wrap::vreinterpretq_f32_u64, wrap::vreinterpretq_f32_s64,
-                                                     wrap::vreinterpretq_f32_f32))
-            };
-            using src_register_type = typename batch<T, A>::register_type;
-            using dst_register_type = typename batch<R, A>::register_type;
-            return caster.apply<dst_register_type>(src_register_type(arg));
-        }
-
         /*********
          * isnan *
          *********/
@@ -3049,37 +3262,34 @@ namespace xsimd
          ****************/
         namespace wrap
         {
-            template <size_t N>
-            XSIMD_INLINE uint8x16_t rotate_left_u8(uint8x16_t a, uint8x16_t b) noexcept { return vextq_u8(a, b, N); }
-            template <size_t N>
-            XSIMD_INLINE int8x16_t rotate_left_s8(int8x16_t a, int8x16_t b) noexcept { return vextq_s8(a, b, N); }
-            template <size_t N>
-            XSIMD_INLINE uint16x8_t rotate_left_u16(uint16x8_t a, uint16x8_t b) noexcept { return vextq_u16(a, b, N); }
-            template <size_t N>
-            XSIMD_INLINE int16x8_t rotate_left_s16(int16x8_t a, int16x8_t b) noexcept { return vextq_s16(a, b, N); }
-            template <size_t N>
-            XSIMD_INLINE uint32x4_t rotate_left_u32(uint32x4_t a, uint32x4_t b) noexcept { return vextq_u32(a, b, N); }
-            template <size_t N>
-            XSIMD_INLINE int32x4_t rotate_left_s32(int32x4_t a, int32x4_t b) noexcept { return vextq_s32(a, b, N); }
-            template <size_t N>
-            XSIMD_INLINE uint64x2_t rotate_left_u64(uint64x2_t a, uint64x2_t b) noexcept { return vextq_u64(a, b, N); }
-            template <size_t N>
-            XSIMD_INLINE int64x2_t rotate_left_s64(int64x2_t a, int64x2_t b) noexcept { return vextq_s64(a, b, N); }
-            template <size_t N>
-            XSIMD_INLINE float32x4_t rotate_left_f32(float32x4_t a, float32x4_t b) noexcept { return vextq_f32(a, b, N); }
+            // x_rotate_left<N, T>(a, b) forwards to the vextq_* intrinsic matching scalar type `T`; `N` is reduced modulo the lane count in every overload so the immediate is always in range.
+            // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) the vector types are all aliases of the same type.
+            // TODO(c++17): Make a single function with if constexpr switch
+            template <size_t N, class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_rotate_left(uint8x16_t a, uint8x16_t b) noexcept { return vextq_u8(a, b, N % 16); }
+            template <size_t N, class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_rotate_left(int8x16_t a, int8x16_t b) noexcept { return vextq_s8(a, b, N % 16); }
+            template <size_t N, class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_rotate_left(uint16x8_t a, uint16x8_t b) noexcept { return vextq_u16(a, b, N % 8); }
+            template <size_t N, class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_rotate_left(int16x8_t a, int16x8_t b) noexcept { return vextq_s16(a, b, N % 8); }
+            template <size_t N, class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_rotate_left(uint32x4_t a, uint32x4_t b) noexcept { return vextq_u32(a, b, N % 4); }
+            template <size_t N, class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_rotate_left(int32x4_t a, int32x4_t b) noexcept { return vextq_s32(a, b, N % 4); }
+            template <size_t N, class T, std::enable_if_t<std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_rotate_left(uint64x2_t a, uint64x2_t b) noexcept { return vextq_u64(a, b, N % 2); }
+            template <size_t N, class T, std::enable_if_t<std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_rotate_left(int64x2_t a, int64x2_t b) noexcept { return vextq_s64(a, b, N % 2); }
+            template <size_t N, class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_rotate_left(float32x4_t a, float32x4_t b) noexcept { return vextq_f32(a, b, N % 4); }
         }
 
         template <size_t N, class A, class T, detail::enable_neon_type_t<T> = 0>
         XSIMD_INLINE batch<T, A> rotate_left(batch<T, A> const& a, requires_arch<neon>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            // Adding modulo to avoid warning.
-            const detail::neon_dispatcher::binary dispatcher = {
-                std::make_tuple(wrap::rotate_left_u8<N>, wrap::rotate_left_s8<N>, wrap::rotate_left_u16<N % 8>, wrap::rotate_left_s16<N % 8>,
-                                wrap::rotate_left_u32<N % 4>, wrap::rotate_left_s32<N % 4>, wrap::rotate_left_u64<N % 2>, wrap::rotate_left_s64<N % 2>,
-                                wrap::rotate_left_f32<N % 4>)
-            };
-            return dispatcher.apply(register_type(a), register_type(a));
+            return wrap::x_rotate_left<N, map_to_sized_type_t<T>>(register_type(a), register_type(a)); // the same register is passed as both operands; the overload is picked by the sized scalar type of T
         }
     }
 
@@ -3357,15 +3567,121 @@ namespace xsimd
             uint64_t mask_hi = vgetq_lane_u64(self, 1);
             return ((mask_lo >> 63) | (mask_hi << 1)) & 0x3;
         }
+
+        /*********
+         * count *
+         *********/
+
+        // NOTE: Extracting a u32 for the return value saves two instructions on 32-bit ARM:
+        // <https://godbolt.org/z/PYn4na8sY>.
+
+        template <class A, class T, detail::enable_sized_t<T, 1> = 0>
+        XSIMD_INLINE size_t count(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
+        {
+            uint8x16_t msbs = vshrq_n_u8(self, 7); // shift each 8-bit lane right by 7 so only its top bit remains (0 or 1)
+            uint64x2_t psum = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(msbs))); // pairwise widening adds: u8 -> u16 -> u32 -> u64 partial sums
+            uint64x1_t total = vadd_u64(vget_low_u64(psum), vget_high_u64(psum)); // add the two 64-bit halves together
+
+            assert(vget_lane_u64(total, 0) <= std::numeric_limits<uint32_t>::max()); // the value is read back through a 32-bit lane below
+            return vget_lane_u32(vreinterpret_u32_u64(total), 0);
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 2> = 0>
+        XSIMD_INLINE size_t count(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
+        {
+            uint16x8_t msbs = vshrq_n_u16(self, 15); // shift each 16-bit lane right by 15 so only its top bit remains (0 or 1)
+            uint64x2_t psum = vpaddlq_u32(vpaddlq_u16(msbs)); // pairwise widening adds: u16 -> u32 -> u64 partial sums
+            uint64x1_t total = vadd_u64(vget_low_u64(psum), vget_high_u64(psum)); // add the two 64-bit halves together
+
+            assert(vget_lane_u64(total, 0) <= std::numeric_limits<uint32_t>::max()); // the value is read back through a 32-bit lane below
+            return vget_lane_u32(vreinterpret_u32_u64(total), 0);
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 4> = 0>
+        XSIMD_INLINE size_t count(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
+        {
+            uint32x4_t msbs = vshrq_n_u32(self, 31); // shift each 32-bit lane right by 31 so only its top bit remains (0 or 1)
+            uint64x2_t psum = vpaddlq_u32(msbs); // pairwise widening add: u32 -> u64 partial sums
+            uint64x1_t total = vadd_u64(vget_low_u64(psum), vget_high_u64(psum)); // add the two 64-bit halves together
+
+            assert(vget_lane_u64(total, 0) <= std::numeric_limits<uint32_t>::max()); // the value is read back through a 32-bit lane below
+            return vget_lane_u32(vreinterpret_u32_u64(total), 0);
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 8> = 0>
+        XSIMD_INLINE size_t count(batch_bool<T, A> const& self, requires_arch<neon>) noexcept
+        {
+            uint64x2_t msbs = vshrq_n_u64(self, 63); // shift each 64-bit lane right by 63 so only its top bit remains (0 or 1)
+            uint64x1_t total = vadd_u64(vget_low_u64(msbs), vget_high_u64(msbs)); // add the two lanes together
+
+            assert(vget_lane_u64(total, 0) <= std::numeric_limits<uint32_t>::max()); // the value is read back through a 32-bit lane below
+            return vget_lane_u32(vreinterpret_u32_u64(total), 0);
+        }
+
+#define WRAP_MASK_OP(OP)                                                               \
+    template <class A, class T, detail::enable_sized_t<T, 1> = 0>                      \
+    XSIMD_INLINE size_t OP(batch_bool<T, A> const& self, requires_arch<neon>) noexcept \
+    {                                                                                  \
+        uint8x16_t inner = self;                                                       \
+        XSIMD_IF_CONSTEXPR(detail::do_swap)                                            \
+        {                                                                              \
+            inner = vrev16q_u8(inner);                                                 \
+        }                                                                              \
+                                                                                       \
+        uint8x8_t narrowed = vshrn_n_u16(vreinterpretq_u16_u8(inner), 4);              \
+        XSIMD_IF_CONSTEXPR(detail::do_swap)                                            \
+        {                                                                              \
+            narrowed = vrev64_u8(narrowed);                                            \
+        }                                                                              \
+                                                                                       \
+        uint64_t result = vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);             \
+        return xsimd::detail::OP(result) / 4;                                          \
+    }                                                                                  \
+    template <class A, class T, detail::enable_sized_t<T, 2> = 0>                      \
+    XSIMD_INLINE size_t OP(batch_bool<T, A> const& self, requires_arch<neon>) noexcept \
+    {                                                                                  \
+        uint8x8_t narrowed = vmovn_u16(self);                                          \
+        XSIMD_IF_CONSTEXPR(detail::do_swap)                                            \
+        {                                                                              \
+            narrowed = vrev64_u8(narrowed);                                            \
+        }                                                                              \
+                                                                                       \
+        uint64_t result = vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);             \
+        return xsimd::detail::OP(result) / 8;                                          \
+    }                                                                                  \
+    template <class A, class T, detail::enable_sized_t<T, 4> = 0>                      \
+    XSIMD_INLINE size_t OP(batch_bool<T, A> const& self, requires_arch<neon>) noexcept \
+    {                                                                                  \
+        uint16x4_t narrowed = vmovn_u32(self);                                         \
+        XSIMD_IF_CONSTEXPR(detail::do_swap)                                            \
+        {                                                                              \
+            narrowed = vrev64_u16(narrowed);                                           \
+        }                                                                              \
+                                                                                       \
+        uint64_t result = vget_lane_u64(vreinterpret_u64_u16(narrowed), 0);            \
+        return xsimd::detail::OP(result) / 16;                                         \
+    }                                                                                  \
+    template <class A, class T, detail::enable_sized_t<T, 8> = 0>                      \
+    XSIMD_INLINE size_t OP(batch_bool<T, A> const& self, requires_arch<neon>) noexcept \
+    {                                                                                  \
+        uint32x2_t narrowed = vmovn_u64(self);                                         \
+        XSIMD_IF_CONSTEXPR(detail::do_swap)                                            \
+        {                                                                              \
+            narrowed = vrev64_u32(narrowed);                                           \
+        }                                                                              \
+                                                                                       \
+        uint64_t result = vget_lane_u64(vreinterpret_u64_u32(narrowed), 0);            \
+        return xsimd::detail::OP(result) / 32;                                         \
     }
 
-}
+        WRAP_MASK_OP(countl_zero)
+        WRAP_MASK_OP(countl_one)
+        WRAP_MASK_OP(countr_zero)
+        WRAP_MASK_OP(countr_one)
 
-#undef WRAP_BINARY_INT_EXCLUDING_64
-#undef WRAP_BINARY_INT
-#undef WRAP_BINARY_FLOAT
-#undef WRAP_UNARY_INT_EXCLUDING_64
-#undef WRAP_UNARY_INT
-#undef WRAP_UNARY_FLOAT
+#undef WRAP_MASK_OP
+    }
+
+}
 
 #endif
diff --git a/include/xsimd/arch/xsimd_neon64.hpp b/include/xsimd/arch/xsimd_neon64.hpp
index 9f3c4bce8..92400e48b 100644
--- a/include/xsimd/arch/xsimd_neon64.hpp
+++ b/include/xsimd/arch/xsimd_neon64.hpp
@@ -12,14 +12,16 @@
 #ifndef XSIMD_NEON64_HPP
 #define XSIMD_NEON64_HPP
 
+#include "../types/xsimd_neon64_register.hpp"
+#include "../types/xsimd_utils.hpp"
+#include "./xsimd_neon.hpp"
+
+#include <cassert>
 #include <complex>
 #include <cstddef>
-#include <tuple>
+#include <cstring>
 #include <utility>
 
-#include "../types/xsimd_neon64_register.hpp"
-#include "../types/xsimd_utils.hpp"
-
 namespace xsimd
 {
     template <typename T, class A, bool... Values>
@@ -29,6 +31,21 @@ namespace xsimd
     {
         using namespace types;
 
+        namespace detail
+        {
+            // SFINAE helper: names `int` when T is an integral type, float or double; substitution fails for any other T.
+            template <class T>
+            using enable_neon64_type_t = std::enable_if_t<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value,
+                                                          int>;
+        }
+
+        // get: return lane I (a compile-time index) of a double batch as a scalar, via vgetq_lane_f64
+        template <class A, size_t I>
+        XSIMD_INLINE double get(batch<double, A> const& self, ::xsimd::index<I>, requires_arch<neon64>) noexcept
+        {
+            return vgetq_lane_f64(self, I);
+        }
+
         // first
         template <class A>
         XSIMD_INLINE double first(batch<double, A> const& self, requires_arch<neon64>) noexcept
@@ -109,25 +126,6 @@ namespace xsimd
             return vdupq_n_f64(val);
         }
 
-        /*******
-         * set *
-         *******/
-
-        template <class A>
-        XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<neon64>, double d0, double d1) noexcept
-        {
-            return float64x2_t { d0, d1 };
-        }
-
-        template <class A>
-        XSIMD_INLINE batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<neon64>, bool b0, bool b1) noexcept
-        {
-            using register_type = typename batch_bool<double, A>::register_type;
-            using unsigned_type = as_unsigned_integer_t<double>;
-            return register_type { static_cast<unsigned_type>(b0 ? -1LL : 0LL),
-                                   static_cast<unsigned_type>(b1 ? -1LL : 0LL) };
-        }
-
         /*************
          * from_bool *
          *************/
@@ -143,8 +141,6 @@ namespace xsimd
          ********/
 #if defined(__clang__) || defined(__GNUC__)
 #define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
-#elif defined(_MSC_VER)
-#define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128)
 #else
 #define xsimd_aligned_load(inst, type, expr) inst((type)expr)
 #endif
@@ -178,6 +174,89 @@ namespace xsimd
             return store_aligned<A>(dst, src, A {});
         }
 
+        /****************
+         * store_stream *
+         ****************/
+
+#if defined(__GNUC__)
+        template <class A>
+        XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& val, requires_arch<neon64>) noexcept
+        {
+            float32x2_t lo = vget_low_f32(val); // lower 64-bit half of the batch
+            float32x2_t hi = vget_high_f32(val); // upper 64-bit half of the batch
+            __asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]" // STNP: store the pair of D registers at [mem] with a non-temporal hint
+                                 :
+                                 : [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem)
+                                 : "memory"); // the asm writes memory the compiler cannot see through the operands
+        }
+
+        template <class A>
+        XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& val, requires_arch<neon64>) noexcept
+        {
+            float64x1_t lo = vget_low_f64(val); // lower 64-bit half of the batch
+            float64x1_t hi = vget_high_f64(val); // upper 64-bit half of the batch
+            __asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]" // STNP: store the pair of D registers at [mem] with a non-temporal hint
+                                 :
+                                 : [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem)
+                                 : "memory"); // the asm writes memory the compiler cannot see through the operands
+        }
+
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& val, requires_arch<neon64>) noexcept
+        {
+            uint64x2_t bits; // raw 128-bit payload of `val`, viewed as two u64 lanes whatever the integral T is
+            std::memcpy(&bits, &val, sizeof(bits));
+            uint64x1_t lo = vget_low_u64(bits);
+            uint64x1_t hi = vget_high_u64(bits);
+            __asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]" // STNP: store the pair of D registers at [mem] with a non-temporal hint
+                                 :
+                                 : [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem)
+                                 : "memory");
+        }
+#endif
+
+        /***************
+         * load_stream *
+         ***************/
+
+#if defined(__GNUC__)
+        template <class A>
+        XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<neon64>) noexcept
+        {
+            float32x2_t lo, hi; // filled by the asm below (output operands)
+            __asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]" // LDNP: load a pair of D registers from [mem] with a non-temporal hint
+                                 : [lo] "=w"(lo), [hi] "=w"(hi)
+                                 : [mem] "r"(mem)
+                                 : "memory");
+            return vcombine_f32(lo, hi); // join the two 64-bit halves into one 128-bit register
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<neon64>) noexcept
+        {
+            float64x1_t lo, hi; // filled by the asm below (output operands)
+            __asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]" // LDNP: load a pair of D registers from [mem] with a non-temporal hint
+                                 : [lo] "=w"(lo), [hi] "=w"(hi)
+                                 : [mem] "r"(mem)
+                                 : "memory");
+            return vcombine_f64(lo, hi); // join the two 64-bit halves into one 128-bit register
+        }
+
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<neon64>) noexcept
+        {
+            uint64x1_t lo, hi; // filled by the asm below (output operands)
+            __asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]" // LDNP: load a pair of D registers from [mem] with a non-temporal hint
+                                 : [lo] "=w"(lo), [hi] "=w"(hi)
+                                 : [mem] "r"(mem)
+                                 : "memory");
+            uint64x2_t bits = vcombine_u64(lo, hi);
+            batch<T, A> out; // the loaded bytes are copied in unchanged, whatever the integral T is
+            std::memcpy(&out, &bits, sizeof(bits));
+            return out;
+        }
+#endif
+
         /*********************
          * store<batch_bool> *
          *********************/
@@ -229,6 +308,28 @@ namespace xsimd
             store_complex_aligned(dst, src, A {});
         }
 
+        /*******
+         * set *
+         *******/
+
+        template <class A>
+        XSIMD_INLINE batch<double, A> set(batch<double, A> const&, requires_arch<neon64> req, double d0, double d1) noexcept
+        { // Builds a batch holding { d0, d1 }; the unnamed batch parameter is never read.
+            alignas(A::alignment()) double data[] = { d0, d1 }; // stage both lanes in a local buffer aligned for A
+            return load_aligned<A>(data, {}, req); // and read the buffer back through the aligned-load kernel
+        }
+
+        template <class A>
+        XSIMD_INLINE batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<neon64>, bool b0, bool b1) noexcept
+        { // Builds a two-lane mask from b0 and b1; the unnamed batch_bool parameter is never read.
+            using unsigned_type = as_unsigned_integer_t<double>; // presumably the unsigned integer as wide as double (confirm)
+            auto const out = batch<unsigned_type, A> {
+                static_cast<unsigned_type>(b0 ? -1LL : 0LL), // -1 converts to all bits set, 0 to all bits clear
+                static_cast<unsigned_type>(b1 ? -1LL : 0LL)
+            };
+            return { out.data }; // wrap the integer batch's register as the mask
+        }
+
         /*******
          * neg *
          *******/
@@ -568,7 +669,7 @@ namespace xsimd
         template <class A>
         XSIMD_INLINE batch_bool<double, A> bitwise_not(batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
         {
-            return detail::bitwise_not_u64(rhs);
+            return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(rhs))); // NOT is applied on a 32-bit lane view; the reinterprets only change the lane type
         }
 
         /******************
@@ -652,6 +753,33 @@ namespace xsimd
             return vaddvq_u32(positioned);
         }
 
+        /*********
+         * count *
+         *********/
+        template <class A, class T, detail::enable_sized_t<T, 1> = 0>
+        XSIMD_INLINE size_t count(batch_bool<T, A> const& self, requires_arch<neon64>) noexcept
+        { // 1-byte lanes.
+            return vaddvq_u8(vshrq_n_u8(self, 7)); // shift each lane down to its top bit (0 or 1), then sum all lanes
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 2> = 0>
+        XSIMD_INLINE size_t count(batch_bool<T, A> const& self, requires_arch<neon64>) noexcept
+        { // 2-byte lanes.
+            return vaddvq_u16(vshrq_n_u16(self, 15)); // shift each lane down to its top bit (0 or 1), then sum all lanes
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 4> = 0>
+        XSIMD_INLINE size_t count(batch_bool<T, A> const& self, requires_arch<neon64>) noexcept
+        { // 4-byte lanes.
+            return vaddvq_u32(vshrq_n_u32(self, 31)); // shift each lane down to its top bit (0 or 1), then sum all lanes
+        }
+
+        template <class A, class T, detail::enable_sized_t<T, 8> = 0>
+        XSIMD_INLINE size_t count(batch_bool<T, A> const& self, requires_arch<neon64>) noexcept
+        { // 8-byte lanes.
+            return vaddvq_u64(vshrq_n_u64(self, 63)); // shift each lane down to its top bit (0 or 1), then sum all lanes
+        }
+
         /*******
          * abs *
          *******/
@@ -760,186 +888,77 @@ namespace xsimd
             return vsetq_lane_f64(val, self, I);
         }
 
-        /******************
-         * reducer macros *
-         ******************/
-
-        // Wrap reducer intrinsics so we can pass them as function pointers
-        // - OP: intrinsics name prefix, e.g., vorrq
-
-#define WRAP_REDUCER_INT_EXCLUDING_64(OP)                     \
-    namespace wrap                                            \
-    {                                                         \
-        XSIMD_INLINE uint8_t OP##_u8(uint8x16_t a) noexcept   \
-        {                                                     \
-            return ::OP##_u8(a);                              \
-        }                                                     \
-        XSIMD_INLINE int8_t OP##_s8(int8x16_t a) noexcept     \
-        {                                                     \
-            return ::OP##_s8(a);                              \
-        }                                                     \
-        XSIMD_INLINE uint16_t OP##_u16(uint16x8_t a) noexcept \
-        {                                                     \
-            return ::OP##_u16(a);                             \
-        }                                                     \
-        XSIMD_INLINE int16_t OP##_s16(int16x8_t a) noexcept   \
-        {                                                     \
-            return ::OP##_s16(a);                             \
-        }                                                     \
-        XSIMD_INLINE uint32_t OP##_u32(uint32x4_t a) noexcept \
-        {                                                     \
-            return ::OP##_u32(a);                             \
-        }                                                     \
-        XSIMD_INLINE int32_t OP##_s32(int32x4_t a) noexcept   \
-        {                                                     \
-            return ::OP##_s32(a);                             \
-        }                                                     \
-    }
-
-#define WRAP_REDUCER_INT(OP)                                  \
-    WRAP_REDUCER_INT_EXCLUDING_64(OP)                         \
-    namespace wrap                                            \
-    {                                                         \
-        XSIMD_INLINE uint64_t OP##_u64(uint64x2_t a) noexcept \
-        {                                                     \
-            return ::OP##_u64(a);                             \
-        }                                                     \
-        XSIMD_INLINE int64_t OP##_s64(int64x2_t a) noexcept   \
-        {                                                     \
-            return ::OP##_s64(a);                             \
-        }                                                     \
-    }
-
-#define WRAP_REDUCER_FLOAT(OP)                               \
-    namespace wrap                                           \
-    {                                                        \
-        XSIMD_INLINE float OP##_f32(float32x4_t a) noexcept  \
-        {                                                    \
-            return ::OP##_f32(a);                            \
-        }                                                    \
-        XSIMD_INLINE double OP##_f64(float64x2_t a) noexcept \
-        {                                                    \
-            return ::OP##_f64(a);                            \
-        }                                                    \
-    }
-
-        namespace detail
-        {
-            template <class R>
-            struct reducer_return_type_impl;
-
-            template <>
-            struct reducer_return_type_impl<uint8x16_t>
-            {
-                using type = uint8_t;
-            };
-
-            template <>
-            struct reducer_return_type_impl<int8x16_t>
-            {
-                using type = int8_t;
-            };
-
-            template <>
-            struct reducer_return_type_impl<uint16x8_t>
-            {
-                using type = uint16_t;
-            };
-
-            template <>
-            struct reducer_return_type_impl<int16x8_t>
-            {
-                using type = int16_t;
-            };
-
-            template <>
-            struct reducer_return_type_impl<uint32x4_t>
-            {
-                using type = uint32_t;
-            };
-
-            template <>
-            struct reducer_return_type_impl<int32x4_t>
-            {
-                using type = int32_t;
-            };
-
-            template <>
-            struct reducer_return_type_impl<uint64x2_t>
-            {
-                using type = uint64_t;
-            };
-
-            template <>
-            struct reducer_return_type_impl<int64x2_t>
-            {
-                using type = int64_t;
-            };
-
-            template <>
-            struct reducer_return_type_impl<float32x4_t>
-            {
-                using type = float;
-            };
-
-            template <>
-            struct reducer_return_type_impl<float64x2_t>
-            {
-                using type = double;
-            };
-
-            template <class R>
-            using reducer_return_type = typename reducer_return_type_impl<R>::type;
-
-            template <class... T>
-            struct neon_reducer_dispatcher_impl : neon_dispatcher_base<reducer_return_type, T...>
-            {
-            };
-
-            using neon_reducer_dispatcher = neon_reducer_dispatcher_impl<uint8x16_t, int8x16_t,
-                                                                         uint16x8_t, int16x8_t,
-                                                                         uint32x4_t, int32x4_t,
-                                                                         uint64x2_t, int64x2_t,
-                                                                         float32x4_t, float64x2_t>;
-            template <class T>
-            using enable_neon64_type_t = std::enable_if_t<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value,
-                                                          int>;
-        }
-
         /**************
          * reduce_add *
          **************/
 
-        WRAP_REDUCER_INT(vaddvq)
-        WRAP_REDUCER_FLOAT(vaddvq)
+        namespace wrap
+        {
+            // Overloads forwarding to the matching vaddvq_* intrinsic. TODO(c++17): merge into one function using `if constexpr`.
+            // The overload is selected by the scalar type `T` rather than by the argument type,
+            // because in some compilers (e.g. MSVC) the vector types are all aliases of the same type.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8_t x_vaddvq(uint8x16_t a) noexcept { return vaddvq_u8(a); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8_t x_vaddvq(int8x16_t a) noexcept { return vaddvq_s8(a); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16_t x_vaddvq(uint16x8_t a) noexcept { return vaddvq_u16(a); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16_t x_vaddvq(int16x8_t a) noexcept { return vaddvq_s16(a); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32_t x_vaddvq(uint32x4_t a) noexcept { return vaddvq_u32(a); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32_t x_vaddvq(int32x4_t a) noexcept { return vaddvq_s32(a); }
+            template <class T, std::enable_if_t<std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint64_t x_vaddvq(uint64x2_t a) noexcept { return vaddvq_u64(a); }
+            template <class T, std::enable_if_t<std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int64_t x_vaddvq(int64x2_t a) noexcept { return vaddvq_s64(a); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float x_vaddvq(float32x4_t a) noexcept { return vaddvq_f32(a); }
+            template <class T, std::enable_if_t<std::is_same<T, double>::value, int> = 0>
+            XSIMD_INLINE double x_vaddvq(float64x2_t a) noexcept { return vaddvq_f64(a); }
+        }
 
         template <class A, class T, detail::enable_neon64_type_t<T> = 0>
         XSIMD_INLINE typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon64>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::neon_reducer_dispatcher::unary dispatcher = {
-                std::make_tuple(wrap::vaddvq_u8, wrap::vaddvq_s8, wrap::vaddvq_u16, wrap::vaddvq_s16,
-                                wrap::vaddvq_u32, wrap::vaddvq_s32, wrap::vaddvq_u64, wrap::vaddvq_s64,
-                                wrap::vaddvq_f32, wrap::vaddvq_f64)
-            };
-            return dispatcher.apply(register_type(arg));
+            return wrap::x_vaddvq<T>(register_type(arg)); // the explicit <T> picks the overload; the register type alone may be ambiguous
         }
 
         /**************
          * reduce_max *
          **************/
 
-        WRAP_REDUCER_INT_EXCLUDING_64(vmaxvq)
-        WRAP_REDUCER_FLOAT(vmaxvq)
-
         namespace wrap
         {
-            XSIMD_INLINE uint64_t vmaxvq_u64(uint64x2_t a) noexcept
+            // Overloads forwarding to the matching vmaxvq_* intrinsic. TODO(c++17): merge into one function using `if constexpr`.
+            // The overload is selected by the scalar type `T` rather than by the argument type,
+            // because in some compilers (e.g. MSVC) the vector types are all aliases of the same type.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8_t x_vmaxvq(uint8x16_t a) noexcept { return vmaxvq_u8(a); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8_t x_vmaxvq(int8x16_t a) noexcept { return vmaxvq_s8(a); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16_t x_vmaxvq(uint16x8_t a) noexcept { return vmaxvq_u16(a); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16_t x_vmaxvq(int16x8_t a) noexcept { return vmaxvq_s16(a); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32_t x_vmaxvq(uint32x4_t a) noexcept { return vmaxvq_u32(a); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32_t x_vmaxvq(int32x4_t a) noexcept { return vmaxvq_s32(a); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float x_vmaxvq(float32x4_t a) noexcept { return vmaxvq_f32(a); }
+            template <class T, std::enable_if_t<std::is_same<T, double>::value, int> = 0>
+            XSIMD_INLINE double x_vmaxvq(float64x2_t a) noexcept { return vmaxvq_f64(a); }
+
+            template <class T, std::enable_if_t<std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint64_t x_vmaxvq(uint64x2_t a) noexcept // 64-bit lanes: extract both lanes and take the scalar maximum
             {
                 return std::max(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1));
             }
-
-            XSIMD_INLINE int64_t vmaxvq_s64(int64x2_t a) noexcept
+            template <class T, std::enable_if_t<std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int64_t x_vmaxvq(int64x2_t a) noexcept // 64-bit lanes: extract both lanes and take the scalar maximum
             {
                 return std::max(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1));
             }
@@ -949,29 +968,42 @@ namespace xsimd
         XSIMD_INLINE typename batch<T, A>::value_type reduce_max(batch<T, A> const& arg, requires_arch<neon64>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::neon_reducer_dispatcher::unary dispatcher = {
-                std::make_tuple(wrap::vmaxvq_u8, wrap::vmaxvq_s8, wrap::vmaxvq_u16, wrap::vmaxvq_s16,
-                                wrap::vmaxvq_u32, wrap::vmaxvq_s32, wrap::vmaxvq_u64, wrap::vmaxvq_s64,
-                                wrap::vmaxvq_f32, wrap::vmaxvq_f64)
-            };
-            return dispatcher.apply(register_type(arg));
+            return wrap::x_vmaxvq<T>(register_type(arg));
         }
 
         /**************
          * reduce_min *
          **************/
 
-        WRAP_REDUCER_INT_EXCLUDING_64(vminvq)
-        WRAP_REDUCER_FLOAT(vminvq)
-
         namespace wrap
         {
-            XSIMD_INLINE uint64_t vminvq_u64(uint64x2_t a) noexcept
+            // Overloads forwarding to the matching vminvq_* intrinsic. TODO(c++17): merge into one function using `if constexpr`.
+            // The overload is selected by the scalar type `T` rather than by the argument type,
+            // because in some compilers (e.g. MSVC) the vector types are all aliases of the same type.
+            template <class T, std::enable_if_t<std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE uint8_t x_vminvq(uint8x16_t a) noexcept { return vminvq_u8(a); }
+            template <class T, std::enable_if_t<std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE int8_t x_vminvq(int8x16_t a) noexcept { return vminvq_s8(a); }
+            template <class T, std::enable_if_t<std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE uint16_t x_vminvq(uint16x8_t a) noexcept { return vminvq_u16(a); }
+            template <class T, std::enable_if_t<std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE int16_t x_vminvq(int16x8_t a) noexcept { return vminvq_s16(a); }
+            template <class T, std::enable_if_t<std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE uint32_t x_vminvq(uint32x4_t a) noexcept { return vminvq_u32(a); }
+            template <class T, std::enable_if_t<std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE int32_t x_vminvq(int32x4_t a) noexcept { return vminvq_s32(a); }
+            template <class T, std::enable_if_t<std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float x_vminvq(float32x4_t a) noexcept { return vminvq_f32(a); }
+            template <class T, std::enable_if_t<std::is_same<T, double>::value, int> = 0>
+            XSIMD_INLINE double x_vminvq(float64x2_t a) noexcept { return vminvq_f64(a); }
+
+            template <class T, std::enable_if_t<std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE uint64_t x_vminvq(uint64x2_t a) noexcept // 64-bit lanes: extract both lanes and take the scalar minimum
             {
                 return std::min(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1));
             }
-
-            XSIMD_INLINE int64_t vminvq_s64(int64x2_t a) noexcept
+            template <class T, std::enable_if_t<std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE int64_t x_vminvq(int64x2_t a) noexcept // 64-bit lanes: extract both lanes and take the scalar minimum
             {
                 return std::min(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1));
             }
@@ -981,18 +1013,9 @@ namespace xsimd
         XSIMD_INLINE typename batch<T, A>::value_type reduce_min(batch<T, A> const& arg, requires_arch<neon64>) noexcept
         {
             using register_type = typename batch<T, A>::register_type;
-            const detail::neon_reducer_dispatcher::unary dispatcher = {
-                std::make_tuple(wrap::vminvq_u8, wrap::vminvq_s8, wrap::vminvq_u16, wrap::vminvq_s16,
-                                wrap::vminvq_u32, wrap::vminvq_s32, wrap::vminvq_u64, wrap::vminvq_s64,
-                                wrap::vminvq_f32, wrap::vminvq_f64)
-            };
-            return dispatcher.apply(register_type(arg));
+            return wrap::x_vminvq<T>(register_type(arg));
         }
 
-#undef WRAP_REDUCER_INT_EXCLUDING_64
-#undef WRAP_REDUCER_INT
-#undef WRAP_REDUCER_FLOAT
-
         /**********
          * select *
          **********/
@@ -1209,9 +1232,11 @@ namespace xsimd
         }
 
         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
-        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon64>) noexcept
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
         {
-            return vshlq_u64(lhs, vnegq_s64(rhs));
+            // The unsigned shift counts are reinterpreted as signed without checks: out-of-bounds shifts are UB anyway.
+            assert(detail::shifts_all_positive(rhs));
+            return vshlq_u64(lhs, vnegq_s64(vreinterpretq_s64_u64(rhs))); // vshlq with a negated count shifts right
         }
 
         template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
@@ -1230,84 +1255,68 @@ namespace xsimd
          * bitwise_cast *
          ****************/
 
-#define WRAP_CAST(SUFFIX, TYPE)                                                \
-    namespace wrap                                                             \
-    {                                                                          \
-        XSIMD_INLINE float64x2_t vreinterpretq_f64_##SUFFIX(TYPE a) noexcept   \
-        {                                                                      \
-            return ::vreinterpretq_f64_##SUFFIX(a);                            \
-        }                                                                      \
-        XSIMD_INLINE TYPE vreinterpretq_##SUFFIX##_f64(float64x2_t a) noexcept \
-        {                                                                      \
-            return ::vreinterpretq_##SUFFIX##_f64(a);                          \
-        }                                                                      \
-    }
-
-        WRAP_CAST(u8, uint8x16_t)
-        WRAP_CAST(s8, int8x16_t)
-        WRAP_CAST(u16, uint16x8_t)
-        WRAP_CAST(s16, int16x8_t)
-        WRAP_CAST(u32, uint32x4_t)
-        WRAP_CAST(s32, int32x4_t)
-        WRAP_CAST(u64, uint64x2_t)
-        WRAP_CAST(s64, int64x2_t)
-        WRAP_CAST(f32, float32x4_t)
+        namespace wrap
+        {
+            // Overloads reinterpreting any register as float64x2_t. TODO(c++17): merge into one function using `if constexpr`.
+            // The overload is selected by the scalar types `R` (destination) and `T` (source), not by the argument type,
+            // because in some compilers (e.g. MSVC) the vector types are all aliases of the same type.
+            template <class R, class T, std::enable_if_t<std::is_same<R, double>::value && std::is_same<T, uint8_t>::value, int> = 0>
+            XSIMD_INLINE float64x2_t x_vreinterpretq(uint8x16_t a) noexcept { return vreinterpretq_f64_u8(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, double>::value && std::is_same<T, int8_t>::value, int> = 0>
+            XSIMD_INLINE float64x2_t x_vreinterpretq(int8x16_t a) noexcept { return vreinterpretq_f64_s8(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, double>::value && std::is_same<T, uint16_t>::value, int> = 0>
+            XSIMD_INLINE float64x2_t x_vreinterpretq(uint16x8_t a) noexcept { return vreinterpretq_f64_u16(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, double>::value && std::is_same<T, int16_t>::value, int> = 0>
+            XSIMD_INLINE float64x2_t x_vreinterpretq(int16x8_t a) noexcept { return vreinterpretq_f64_s16(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, double>::value && std::is_same<T, uint32_t>::value, int> = 0>
+            XSIMD_INLINE float64x2_t x_vreinterpretq(uint32x4_t a) noexcept { return vreinterpretq_f64_u32(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, double>::value && std::is_same<T, int32_t>::value, int> = 0>
+            XSIMD_INLINE float64x2_t x_vreinterpretq(int32x4_t a) noexcept { return vreinterpretq_f64_s32(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, double>::value && std::is_same<T, uint64_t>::value, int> = 0>
+            XSIMD_INLINE float64x2_t x_vreinterpretq(uint64x2_t a) noexcept { return vreinterpretq_f64_u64(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, double>::value && std::is_same<T, int64_t>::value, int> = 0>
+            XSIMD_INLINE float64x2_t x_vreinterpretq(int64x2_t a) noexcept { return vreinterpretq_f64_s64(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, double>::value && std::is_same<T, float>::value, int> = 0>
+            XSIMD_INLINE float64x2_t x_vreinterpretq(float32x4_t a) noexcept { return vreinterpretq_f64_f32(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, double>::value && std::is_same<T, double>::value, int> = 0>
+            XSIMD_INLINE float64x2_t x_vreinterpretq(float64x2_t a) noexcept { return a; }
+
+            // Overloads reinterpreting float64x2_t as any other register. TODO(c++17): merge into one function using `if constexpr`.
+            // As above, `R` and `T` select the overload because in some compilers (e.g. MSVC)
+            // the vector types are all aliases of the same type.
+            template <class R, class T, std::enable_if_t<std::is_same<R, uint8_t>::value && std::is_same<T, double>::value, int> = 0>
+            XSIMD_INLINE uint8x16_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_u8_f64(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, int8_t>::value && std::is_same<T, double>::value, int> = 0>
+            XSIMD_INLINE int8x16_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_s8_f64(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, uint16_t>::value && std::is_same<T, double>::value, int> = 0>
+            XSIMD_INLINE uint16x8_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_u16_f64(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, int16_t>::value && std::is_same<T, double>::value, int> = 0>
+            XSIMD_INLINE int16x8_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_s16_f64(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, uint32_t>::value && std::is_same<T, double>::value, int> = 0>
+            XSIMD_INLINE uint32x4_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_u32_f64(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, int32_t>::value && std::is_same<T, double>::value, int> = 0>
+            XSIMD_INLINE int32x4_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_s32_f64(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, uint64_t>::value && std::is_same<T, double>::value, int> = 0>
+            XSIMD_INLINE uint64x2_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_u64_f64(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, int64_t>::value && std::is_same<T, double>::value, int> = 0>
+            XSIMD_INLINE int64x2_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_s64_f64(a); }
+            template <class R, class T, std::enable_if_t<std::is_same<R, float>::value && std::is_same<T, double>::value, int> = 0>
+            XSIMD_INLINE float32x4_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_f32_f64(a); }
 
-#undef WRAP_CAST
+        }
 
         template <class A, class T>
         XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept
         {
-            using caster_type = detail::bitwise_caster_impl<float64x2_t,
-                                                            uint8x16_t, int8x16_t,
-                                                            uint16x8_t, int16x8_t,
-                                                            uint32x4_t, int32x4_t,
-                                                            uint64x2_t, int64x2_t,
-                                                            float32x4_t>;
-            const caster_type caster = {
-                std::make_tuple(wrap::vreinterpretq_f64_u8, wrap::vreinterpretq_f64_s8, wrap::vreinterpretq_f64_u16, wrap::vreinterpretq_f64_s16,
-                                wrap::vreinterpretq_f64_u32, wrap::vreinterpretq_f64_s32, wrap::vreinterpretq_f64_u64, wrap::vreinterpretq_f64_s64,
-                                wrap::vreinterpretq_f64_f32)
-            };
             using register_type = typename batch<T, A>::register_type;
-            return caster.apply(register_type(arg));
-        }
-
-        namespace detail
-        {
-            template <class S, class... R>
-            struct bitwise_caster_neon64
-            {
-                using container_type = std::tuple<R (*)(S)...>;
-                container_type m_func;
-
-                template <class V>
-                V apply(float64x2_t rhs) const
-                {
-                    using func_type = V (*)(float64x2_t);
-                    auto func = std::get<func_type>(m_func);
-                    return func(rhs);
-                }
-            };
+            return wrap::x_vreinterpretq<double, map_to_sized_type_t<T>>(register_type(arg)); // map_to_sized_type_t presumably maps T to the fixed-width scalar the overloads test for (confirm)
         }
 
         template <class A, class R>
         XSIMD_INLINE batch<R, A> bitwise_cast(batch<double, A> const& arg, batch<R, A> const&, requires_arch<neon64>) noexcept
         {
-            using caster_type = detail::bitwise_caster_neon64<float64x2_t,
-                                                              uint8x16_t, int8x16_t,
-                                                              uint16x8_t, int16x8_t,
-                                                              uint32x4_t, int32x4_t,
-                                                              uint64x2_t, int64x2_t,
-                                                              float32x4_t>;
-            const caster_type caster = {
-                std::make_tuple(wrap::vreinterpretq_u8_f64, wrap::vreinterpretq_s8_f64, wrap::vreinterpretq_u16_f64, wrap::vreinterpretq_s16_f64,
-                                wrap::vreinterpretq_u32_f64, wrap::vreinterpretq_s32_f64, wrap::vreinterpretq_u64_f64, wrap::vreinterpretq_s64_f64,
-                                wrap::vreinterpretq_f32_f64)
-            };
             using src_register_type = typename batch<double, A>::register_type;
-            using dst_register_type = typename batch<R, A>::register_type;
-            return caster.apply<dst_register_type>(src_register_type(arg));
+            return wrap::x_vreinterpretq<map_to_sized_type_t<R>, double>(src_register_type(arg)); // map_to_sized_type_t presumably maps R to the fixed-width scalar the overloads test for (confirm)
         }
 
         template <class A>
@@ -1491,7 +1500,7 @@ namespace xsimd
                                                batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
                                                requires_arch<neon64>) noexcept
         {
-            return vqtbl1q_u8(self, batch<uint8_t, A>(idx));
+            return vqtbl1q_u8(self, idx.as_batch());
         }
 
         template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
@@ -1500,7 +1509,7 @@ namespace xsimd
                                               batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
                                               requires_arch<neon64>) noexcept
         {
-            return vqtbl1q_s8(self, batch<uint8_t, A>(idx));
+            return vqtbl1q_s8(self, idx.as_batch());
         }
 
         template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
diff --git a/include/xsimd/arch/xsimd_rvv.hpp b/include/xsimd/arch/xsimd_rvv.hpp
index 9e4098d30..483f5e28f 100644
--- a/include/xsimd/arch/xsimd_rvv.hpp
+++ b/include/xsimd/arch/xsimd_rvv.hpp
@@ -10,13 +10,15 @@
 #ifndef XSIMD_RVV_HPP
 #define XSIMD_RVV_HPP
 
-#include <complex>
-#include <type_traits>
-#include <utility>
-
+#include "../config/xsimd_macros.hpp"
 #include "../types/xsimd_batch_constant.hpp"
 #include "../types/xsimd_rvv_register.hpp"
-#include "xsimd_constants.hpp"
+#include "../types/xsimd_utils.hpp"
+#include "../utils/xsimd_type_traits.hpp"
+#include "./xsimd_constants.hpp"
+
+#include <complex>
+#include <type_traits>
 
 // This set of macros allows the synthesis of identifiers using a template and
 // variable macro arguments.  A single template can then be used by multiple
@@ -25,11 +27,9 @@
 //
 // First some logic to paste text together...
 //
-#define XSIMD_RVV_JOIN_(x, y) x##y
-#define XSIMD_RVV_JOIN(x, y) XSIMD_RVV_JOIN_(x, y)
-#define XSIMD_RVV_PREFIX_T(T, S, then) XSIMD_RVV_JOIN(T, then)
-#define XSIMD_RVV_PREFIX_S(T, S, then) XSIMD_RVV_JOIN(S, then)
-#define XSIMD_RVV_PREFIX_M(T, S, then) XSIMD_RVV_JOIN(m1, then)
+#define XSIMD_RVV_PREFIX_T(T, S, then) XSIMD_CONCAT(T, then) // prefix `then` with the type argument T; XSIMD_CONCAT presumably token-pastes (confirm)
+#define XSIMD_RVV_PREFIX_S(T, S, then) XSIMD_CONCAT(S, then) // prefix `then` with the size argument S
+#define XSIMD_RVV_PREFIX_M(T, S, then) XSIMD_CONCAT(m1, then) // prefix `then` with the literal token m1
 #define XSIMD_RVV_PREFIX(T, S, then) then
 //
 // XSIMD_RVV_IDENTIFIER accepts type and size parameters, and a template for
@@ -40,15 +40,15 @@
 // join two or more variables together.
 //
 #define XSIMD_RVV_IDENTIFIER9(T, S, t, ...) t
-#define XSIMD_RVV_IDENTIFIER8(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER9(T, S, __VA_ARGS__)))
-#define XSIMD_RVV_IDENTIFIER7(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER8(T, S, __VA_ARGS__)))
-#define XSIMD_RVV_IDENTIFIER6(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER7(T, S, __VA_ARGS__)))
-#define XSIMD_RVV_IDENTIFIER5(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER6(T, S, __VA_ARGS__)))
-#define XSIMD_RVV_IDENTIFIER4(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER5(T, S, __VA_ARGS__)))
-#define XSIMD_RVV_IDENTIFIER3(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER4(T, S, __VA_ARGS__)))
-#define XSIMD_RVV_IDENTIFIER2(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER3(T, S, __VA_ARGS__)))
-#define XSIMD_RVV_IDENTIFIER1(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER2(T, S, __VA_ARGS__)))
-#define XSIMD_RVV_IDENTIFIER0(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER1(T, S, __VA_ARGS__)))
+#define XSIMD_RVV_IDENTIFIER8(T, S, t, p, ...) XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER9(T, S, __VA_ARGS__)))
+#define XSIMD_RVV_IDENTIFIER7(T, S, t, p, ...) XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER8(T, S, __VA_ARGS__)))
+#define XSIMD_RVV_IDENTIFIER6(T, S, t, p, ...) XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER7(T, S, __VA_ARGS__)))
+#define XSIMD_RVV_IDENTIFIER5(T, S, t, p, ...) XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER6(T, S, __VA_ARGS__)))
+#define XSIMD_RVV_IDENTIFIER4(T, S, t, p, ...) XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER5(T, S, __VA_ARGS__)))
+#define XSIMD_RVV_IDENTIFIER3(T, S, t, p, ...) XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER4(T, S, __VA_ARGS__)))
+#define XSIMD_RVV_IDENTIFIER2(T, S, t, p, ...) XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER3(T, S, __VA_ARGS__)))
+#define XSIMD_RVV_IDENTIFIER1(T, S, t, p, ...) XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER2(T, S, __VA_ARGS__)))
+#define XSIMD_RVV_IDENTIFIER0(T, S, t, p, ...) XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER1(T, S, __VA_ARGS__)))
 //
 // UNBRACKET and REPARSE force the preprocessor to handle expansion in a
 // specific order.  XSIMD_RVV_UNBRACKET strips the parentheses from the template
@@ -88,32 +88,32 @@
 // for the function signature argument(s) to XSIMD_RVV_OVERLOAD.  That signature can
 // also reference the template argument T, because it's a text substitution
 // into the template.
-#define XSIMD_RVV_WRAPPER_HEAD(NAME, SIGNATURE, ...)                      \
-    namespace NAME##_cruft                                                \
-    {                                                                     \
-        template <class T>                                                \
-        struct ctx                                                        \
-        {                                                                 \
-            static constexpr size_t width = XSIMD_RVV_BITS;               \
-            static constexpr size_t vl = width / (sizeof(T) * 8);         \
-            using vec = rvv_reg_t<T, width>;                              \
-            using uvec = rvv_reg_t<as_unsigned_relaxed_t<T>, width>;      \
-            using svec = rvv_reg_t<as_signed_relaxed_t<T>, width>;        \
-            using fvec = rvv_reg_t<as_float_relaxed_t<T>, width>;         \
-            using bvec = rvv_bool_t<T, width>;                            \
-            using scalar_vec = rvv_reg_t<T, types::detail::rvv_width_m1>; \
-            using wide_vec = rvv_reg_t<T, width * 2>;                     \
-            using narrow_vec = rvv_reg_t<T, width / 2>;                   \
-            using type = SIGNATURE;                                       \
-        };                                                                \
-        template <class T>                                                \
-        using sig_t = typename ctx<T>::type;                              \
-        template <class K, class T>                                       \
-        struct impl                                                       \
-        {                                                                 \
-            void operator()() const noexcept {};                          \
-        };                                                                \
-        template <class K>                                                \
+#define XSIMD_RVV_WRAPPER_HEAD(NAME, SIGNATURE, ...)                       \
+    namespace NAME##_cruft                                                 \
+    {                                                                      \
+        template <class T>                                                 \
+        struct ctx                                                         \
+        {                                                                  \
+            static constexpr size_t width = XSIMD_RVV_BITS;                \
+            static constexpr size_t vl = width / (sizeof(T) * 8);          \
+            using vec = rvv_reg_t<T, width>;                               \
+            using uvec = rvv_reg_t<xsimd::sized_uint_t<sizeof(T)>, width>; \
+            using svec = rvv_reg_t<xsimd::sized_int_t<sizeof(T)>, width>;  \
+            using fvec = rvv_reg_t<as_float_relaxed_t<T>, width>;          \
+            using bvec = rvv_bool_t<T, width>;                             \
+            using scalar_vec = rvv_reg_t<T, types::detail::rvv_width_m1>;  \
+            using wide_vec = rvv_reg_t<T, width * 2>;                      \
+            using narrow_vec = rvv_reg_t<T, width / 2>;                    \
+            using type = SIGNATURE;                                        \
+        };                                                                 \
+        template <class T>                                                 \
+        using sig_t = typename ctx<T>::type;                               \
+        template <class K, class T>                                        \
+        struct impl                                                        \
+        {                                                                  \
+            void operator()() const noexcept {};                           \
+        };                                                                 \
+        template <class K>                                                 \
         using impl_t = impl<K, sig_t<K>>;
 
 #define XSIMD_RVV_WRAPPER_HEAD_NOVL(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__)
@@ -287,66 +287,19 @@ namespace xsimd
 
     namespace kernel
     {
-        namespace detail
+        namespace detail_rvv
         {
-            template <class T>
-            using rvv_fix_char_t = types::detail::rvv_fix_char_t<T>;
             template <class T, size_t Width = XSIMD_RVV_BITS>
             using rvv_reg_t = types::detail::rvv_reg_t<T, Width>;
             template <class T, size_t Width = XSIMD_RVV_BITS>
             using rvv_bool_t = types::detail::rvv_bool_t<T, Width>;
 
-            template <size_t>
-            struct as_signed_relaxed;
-            template <>
-            struct as_signed_relaxed<1>
+            template <std::size_t S>
+            struct as_float_relaxed
             {
-                using type = int8_t;
+                using type = xsimd::sized_fp_t<S>;
             };
             template <>
-            struct as_signed_relaxed<2>
-            {
-                using type = int16_t;
-            };
-            template <>
-            struct as_signed_relaxed<4>
-            {
-                using type = int32_t;
-            };
-            template <>
-            struct as_signed_relaxed<8>
-            {
-                using type = int64_t;
-            };
-            template <class T>
-            using as_signed_relaxed_t = typename as_signed_relaxed<sizeof(T)>::type;
-            template <size_t>
-            struct as_unsigned_relaxed;
-            template <>
-            struct as_unsigned_relaxed<1>
-            {
-                using type = uint8_t;
-            };
-            template <>
-            struct as_unsigned_relaxed<2>
-            {
-                using type = uint16_t;
-            };
-            template <>
-            struct as_unsigned_relaxed<4>
-            {
-                using type = uint32_t;
-            };
-            template <>
-            struct as_unsigned_relaxed<8>
-            {
-                using type = uint64_t;
-            };
-            template <class T>
-            using as_unsigned_relaxed_t = typename as_unsigned_relaxed<sizeof(T)>::type;
-            template <size_t>
-            struct as_float_relaxed;
-            template <>
             struct as_float_relaxed<1>
             {
                 using type = int8_t;
@@ -356,26 +309,16 @@ namespace xsimd
             {
                 using type = int16_t;
             };
-            template <>
-            struct as_float_relaxed<4>
-            {
-                using type = float;
-            };
-            template <>
-            struct as_float_relaxed<8>
-            {
-                using type = double;
-            };
             template <class T>
             using as_float_relaxed_t = typename as_float_relaxed<sizeof(T)>::type;
 
             template <class T, class U>
-            rvv_reg_t<T, U::width> rvvreinterpret(U const& arg) noexcept
+            XSIMD_INLINE rvv_reg_t<T, U::width> rvvreinterpret(U const& arg) noexcept
             {
                 return rvv_reg_t<T, U::width>(arg, types::detail::XSIMD_RVV_BITCAST);
             }
             template <class T, class A, class U>
-            rvv_reg_t<T, A::width> rvvreinterpret(batch<U, A> const& arg) noexcept
+            XSIMD_INLINE rvv_reg_t<T, A::width> rvvreinterpret(batch<U, A> const& arg) noexcept
             {
                 typename batch<U, A>::register_type r = arg;
                 return rvvreinterpret<T>(r);
@@ -431,43 +374,23 @@ namespace xsimd
                     index = __riscv_vsll(index, shift, batch<T, A>::size);
                 return __riscv_vadd(index, T(offset), batch<T, A>::size);
             }
-
-            // enable for signed integers
-            template <class T>
-            using rvv_enable_signed_int_t = std::enable_if_t<std::is_integral<T>::value && std::is_signed<T>::value, int>;
-
-            // enable for unsigned integers
-            template <class T>
-            using rvv_enable_unsigned_int_t = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value, int>;
-
-            // enable for floating points
-            template <class T>
-            using rvv_enable_floating_point_t = std::enable_if_t<std::is_floating_point<T>::value, int>;
-
-            // enable for signed integers or floating points
-            template <class T>
-            using rvv_enable_signed_int_or_floating_point_t = std::enable_if_t<std::is_signed<T>::value, int>;
-
-            // enable for all RVE supported types
-            template <class T>
-            using rvv_enable_all_t = std::enable_if_t<std::is_arithmetic<T>::value, int>;
-        } // namespace detail
+        } // namespace detail_rvv
 
         /********************
          * Scalar to vector *
          ********************/
 
-        namespace detail
+        namespace detail_rvv
         {
             template <class T, size_t Width>
-            XSIMD_INLINE detail::rvv_reg_t<T, Width> broadcast(T arg) noexcept
+            XSIMD_INLINE rvv_reg_t<T, Width> broadcast(T arg) noexcept
             {
                 // A bit of a dance, here, because rvvmv_splat has no other
                 // argument from which to deduce type, and T=char is not
                 // supported.
-                detail::rvv_fix_char_t<T> arg_not_char(arg);
-                const auto splat = detail::rvvmv_splat(arg_not_char);
-                return detail::rvv_reg_t<T, Width>(splat.get_bytes(), types::detail::XSIMD_RVV_BITCAST);
+                map_to_sized_type_t<T> arg_not_char(arg);
+                const auto splat = rvvmv_splat(arg_not_char);
+                return rvv_reg_t<T, Width>(splat.get_bytes(), types::detail::XSIMD_RVV_BITCAST);
             }
         }
 
@@ -475,33 +398,33 @@ namespace xsimd
         template <class A, class T>
         XSIMD_INLINE batch<T, A> broadcast(T arg, requires_arch<rvv>) noexcept
         {
-            return detail::broadcast<T, A::width>(arg);
+            return detail_rvv::broadcast<T, A::width>(arg);
         }
 
         /*********
          * Load *
          *********/
 
-        namespace detail
+        namespace detail_rvv
         {
             XSIMD_RVV_OVERLOAD(rvvle, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , vec(T const*))
             XSIMD_RVV_OVERLOAD(rvvse, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , void(T*, vec))
         }
 
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<rvv>) noexcept
         {
-            return detail::rvvle(reinterpret_cast<detail::rvv_fix_char_t<T> const*>(src));
+            return detail_rvv::rvvle(reinterpret_cast<map_to_sized_type_t<T> const*>(src));
         }
 
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<rvv>) noexcept
         {
             return load_aligned<A>(src, convert<T>(), rvv {});
         }
 
         // load_complex
-        namespace detail
+        namespace detail_rvv
         {
             template <class T, size_t W, std::enable_if_t<W >= types::detail::rvv_width_m1, int> = 0>
             XSIMD_INLINE rvv_reg_t<T, W * 2> rvvabut(rvv_reg_t<T, W> const& lo, rvv_reg_t<T, W> const& hi) noexcept
@@ -520,37 +443,44 @@ namespace xsimd
             XSIMD_RVV_OVERLOAD(rvvget_hi_, (__riscv_vget_ XSIMD_RVV_TSM), _DROP_1ST_CUSTOM_ARGS_NOVL, vec(T, wide_vec), args..., 1)
 
             template <class T, size_t W, std::enable_if_t<W >= types::detail::rvv_width_m1, int> = 0>
-            rvv_reg_t<T, W> rvvget_lo(rvv_reg_t<T, W * 2> const& vv) noexcept
+            XSIMD_INLINE rvv_reg_t<T, W> rvvget_lo(rvv_reg_t<T, W * 2> const& vv) noexcept
             {
                 typename rvv_reg_t<T, W>::register_type tmp = rvvget_lo_(T {}, vv);
                 return tmp;
             }
             template <class T, size_t W, std::enable_if_t<W >= types::detail::rvv_width_m1, int> = 0>
-            rvv_reg_t<T, W> rvvget_hi(rvv_reg_t<T, W * 2> const& vv) noexcept
+            XSIMD_INLINE rvv_reg_t<T, W> rvvget_hi(rvv_reg_t<T, W * 2> const& vv) noexcept
             {
                 typename rvv_reg_t<T, W>::register_type tmp = rvvget_hi_(T {}, vv);
                 return tmp;
             }
-            template <class T, size_t W, std::enable_if_t<W<types::detail::rvv_width_m1, int> = 0> rvv_reg_t<T, W> rvvget_lo(rvv_reg_t<T, W * 2> const& vv) noexcept
+            template <class T, size_t W, std::enable_if_t<W<types::detail::rvv_width_m1, int> = 0> XSIMD_INLINE rvv_reg_t<T, W> rvvget_lo(rvv_reg_t<T, W * 2> const& vv) noexcept
             {
                 typename rvv_reg_t<T, W>::register_type tmp = vv;
                 return tmp;
             }
-            template <class T, size_t W, std::enable_if_t<W<types::detail::rvv_width_m1, int> = 0> rvv_reg_t<T, W> rvvget_hi(rvv_reg_t<T, W * 2> const& vv) noexcept
+            template <class T, size_t W, std::enable_if_t<W<types::detail::rvv_width_m1, int> = 0> XSIMD_INLINE rvv_reg_t<T, W> rvvget_hi(rvv_reg_t<T, W * 2> const& vv) noexcept
             {
                 return __riscv_vslidedown(vv, vv.vl / 2, vv.vl);
             }
 
-            template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        }
+
+        // Must be named detail::load_complex so that the common memory code can use it.
+        // ODR violations are prevented because the size of the register is encoded
+        // in the batch type.
+        namespace detail
+        {
+            template <class A, class T, detail::enable_floating_point_t<T> = 0>
             XSIMD_INLINE batch<std::complex<T>, A> load_complex(batch<T, A> const& lo, batch<T, A> const& hi, requires_arch<rvv>) noexcept
             {
-                const auto real_index = vindex<A, as_unsigned_integer_t<T>, 0, 1>();
-                const auto imag_index = vindex<A, as_unsigned_integer_t<T>, 1, 1>();
-                const auto index = rvvabut<as_unsigned_integer_t<T>, A::width>(real_index, imag_index);
-                const auto input = rvvabut<T, A::width>(lo.data, hi.data);
-                const rvv_reg_t<T, A::width * 2> result = __riscv_vrgather(input, index, index.vl);
+                const auto real_index = detail_rvv::vindex<A, as_unsigned_integer_t<T>, 0, 1>();
+                const auto imag_index = detail_rvv::vindex<A, as_unsigned_integer_t<T>, 1, 1>();
+                const auto index = detail_rvv::rvvabut<as_unsigned_integer_t<T>, A::width>(real_index, imag_index);
+                const auto input = detail_rvv::rvvabut<T, A::width>(lo.data, hi.data);
+                const detail_rvv::rvv_reg_t<T, A::width * 2> result = __riscv_vrgather(input, index, index.vl);
 
-                return { rvvget_lo<T, A::width>(result), rvvget_hi<T, A::width>(result) };
+                return { detail_rvv::rvvget_lo<T, A::width>(result), detail_rvv::rvvget_hi<T, A::width>(result) };
             }
         }
 
@@ -558,13 +488,13 @@ namespace xsimd
          * Store *
          *********/
 
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<rvv>) noexcept
         {
-            detail::rvvse(reinterpret_cast<detail::rvv_fix_char_t<T>*>(dst), src);
+            detail_rvv::rvvse(reinterpret_cast<map_to_sized_type_t<T>*>(dst), src);
         }
 
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<rvv>) noexcept
         {
             store_aligned<A>(dst, src, rvv {});
@@ -574,7 +504,7 @@ namespace xsimd
          * scatter/gather *
          ******************/
 
-        namespace detail
+        namespace detail_rvv
         {
             template <class T, class U>
             using rvv_enable_sg_t = std::enable_if_t<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>;
@@ -587,34 +517,34 @@ namespace xsimd
         }
 
         // scatter
-        template <class A, class T, class U, detail::rvv_enable_sg_t<T, U> = 0>
+        template <class A, class T, class U, detail_rvv::rvv_enable_sg_t<T, U> = 0>
         XSIMD_INLINE void scatter(batch<T, A> const& vals, T* dst, batch<U, A> const& index, kernel::requires_arch<rvv>) noexcept
         {
             using UU = as_unsigned_integer_t<U>;
-            const auto uindex = detail::rvv_to_unsigned_batch(index);
-            auto* base = reinterpret_cast<detail::rvv_fix_char_t<T>*>(dst);
+            const auto uindex = detail_rvv::rvv_to_unsigned_batch(index);
+            auto* base = reinterpret_cast<map_to_sized_type_t<T>*>(dst);
             // or rvvsuxei
-            const auto bi = detail::rvvmul_splat(uindex, sizeof(T));
-            detail::rvvsoxei(base, bi, vals);
+            const auto bi = detail_rvv::rvvmul_splat(uindex, sizeof(T));
+            detail_rvv::rvvsoxei(base, bi, vals);
         }
 
         // gather
-        template <class A, class T, class U, detail::rvv_enable_sg_t<T, U> = 0>
+        template <class A, class T, class U, detail_rvv::rvv_enable_sg_t<T, U> = 0>
         XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index, kernel::requires_arch<rvv>) noexcept
         {
             using UU = as_unsigned_integer_t<U>;
-            const auto uindex = detail::rvv_to_unsigned_batch(index);
-            auto const* base = reinterpret_cast<detail::rvv_fix_char_t<T> const*>(src);
+            const auto uindex = detail_rvv::rvv_to_unsigned_batch(index);
+            auto const* base = reinterpret_cast<map_to_sized_type_t<T> const*>(src);
             // or rvvluxei
-            const auto bi = detail::rvvmul_splat(uindex, sizeof(T));
-            return detail::rvvloxei(base, bi);
+            const auto bi = detail_rvv::rvvmul_splat(uindex, sizeof(T));
+            return detail_rvv::rvvloxei(base, bi);
         }
 
         /**************
          * Arithmetic *
          **************/
 
-        namespace detail
+        namespace detail_rvv
         {
             XSIMD_RVV_OVERLOAD3(rvvadd,
                                 (__riscv_vadd),
@@ -637,6 +567,9 @@ namespace xsimd
                                 (__riscv_vmul),
                                 (__riscv_vmul),
                                 (__riscv_vfmul), , vec(vec, vec))
+            XSIMD_RVV_OVERLOAD2(rvvmulh,
+                                (__riscv_vmulh),
+                                (__riscv_vmulhu), , vec(vec, vec))
             XSIMD_RVV_OVERLOAD3(rvvdiv,
                                 (__riscv_vdiv),
                                 (__riscv_vdivu),
@@ -695,108 +628,115 @@ namespace xsimd
-        } // namespace detail
+        } // namespace detail_rvv
 
         // add
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvadd(lhs, rhs);
+            return detail_rvv::rvvadd(lhs, rhs);
         }
 
         // sadd
         template <class A, class T, detail::enable_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvsadd(lhs, rhs);
+            return detail_rvv::rvvsadd(lhs, rhs);
         }
 
         // sub
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvsub(lhs, rhs);
+            return detail_rvv::rvvsub(lhs, rhs);
         }
 
         // ssub
         template <class A, class T, detail::enable_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvssub(lhs, rhs);
+            return detail_rvv::rvvssub(lhs, rhs);
         }
 
         // mul
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvmul(lhs, rhs);
+            return detail_rvv::rvvmul(lhs, rhs);
+        }
+
+        // mul_hi: high half of the widened product, via vmulh (signed) / vmulhu (unsigned)
+        template <class A, class T, detail::enable_integral_t<T> = 0>
+        XSIMD_INLINE batch<T, A> mul_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
+        {
+            return detail_rvv::rvvmulh(lhs, rhs);
+        }
 
         // div
-        template <class A, class T, typename detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, typename detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvdiv(lhs, rhs);
+            return detail_rvv::rvvdiv(lhs, rhs);
         }
 
         // max
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvmax(lhs, rhs);
+            return detail_rvv::rvvmax(lhs, rhs);
         }
 
         // min
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvmin(lhs, rhs);
+            return detail_rvv::rvvmin(lhs, rhs);
         }
 
         // neg
-        template <class A, class T, detail::rvv_enable_unsigned_int_t<T> = 0>
+        template <class A, class T, detail::enable_unsigned_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
             using S = as_signed_integer_t<T>;
-            const auto as_signed = detail::rvvreinterpret<S>(arg);
-            const auto result = detail::rvvneg(as_signed);
-            return detail::rvvreinterpret<T>(result);
+            const auto as_signed = detail_rvv::rvvreinterpret<S>(arg);
+            const auto result = detail_rvv::rvvneg(as_signed);
+            return detail_rvv::rvvreinterpret<T>(result);
         }
 
-        template <class A, class T, detail::rvv_enable_signed_int_or_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_signed_numeral_t<T> = 0>
         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
-            return detail::rvvneg(arg);
+            return detail_rvv::rvvneg(arg);
         }
 
         // abs
-        template <class A, class T, detail::rvv_enable_unsigned_int_t<T> = 0>
+        template <class A, class T, detail::enable_unsigned_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
             return arg;
         }
 
-        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
-            return detail::rvvabs(arg);
+            return detail_rvv::rvvabs(arg);
         }
 
         // fma: x * y + z
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<rvv>) noexcept
         {
-            // also detail::rvvmadd(x, y, z);
-            return detail::rvvmacc(z, x, y);
+            // also detail_rvv::rvvmadd(x, y, z);
+            return detail_rvv::rvvmacc(z, x, y);
         }
 
         // fnma: z - x * y
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<rvv>) noexcept
         {
-            // also detail::rvvnmsub(x, y, z);
-            return detail::rvvnmsac(z, x, y);
+            // also detail_rvv::rvvnmsub(x, y, z);
+            return detail_rvv::rvvnmsac(z, x, y);
         }
 
         // fms: x * y - z
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<rvv>) noexcept
         {
             // also vfmsac(z, x, y), but lacking integer version
@@ -805,7 +745,7 @@ namespace xsimd
         }
 
         // fnms: - x * y - z
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<rvv>) noexcept
         {
             // also vfnmacc(z, x, y), but lacking integer version
@@ -817,7 +757,7 @@ namespace xsimd
          * Logical operations *
          **********************/
 
-        namespace detail
+        namespace detail_rvv
         {
             XSIMD_RVV_OVERLOAD_INTS(rvvand, (__riscv_vand), , vec(vec, vec))
             XSIMD_RVV_OVERLOAD_INTS(rvvor, (__riscv_vor), , vec(vec, vec))
@@ -835,118 +775,118 @@ namespace xsimd
         template <class A, class T, detail::enable_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvand(lhs, rhs);
+            return detail_rvv::rvvand(lhs, rhs);
         }
 
-        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs);
-            const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs);
-            const auto result_bits = detail::rvvand(lhs_bits, rhs_bits);
-            return detail::rvvreinterpret<T>(result_bits);
+            const auto lhs_bits = detail_rvv::rvv_to_unsigned_batch(lhs);
+            const auto rhs_bits = detail_rvv::rvv_to_unsigned_batch(rhs);
+            const auto result_bits = detail_rvv::rvvand(lhs_bits, rhs_bits);
+            return detail_rvv::rvvreinterpret<T>(result_bits);
         }
 
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvmand(lhs, rhs);
+            return detail_rvv::rvvmand(lhs, rhs);
         }
 
         // bitwise_andnot
         template <class A, class T, detail::enable_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            const auto not_rhs = detail::rvvnot(rhs);
-            return detail::rvvand(lhs, not_rhs);
+            const auto not_rhs = detail_rvv::rvvnot(rhs);
+            return detail_rvv::rvvand(lhs, not_rhs);
         }
 
-        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs);
-            const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs);
-            const auto not_rhs = detail::rvvnot(rhs_bits);
-            const auto result_bits = detail::rvvand(lhs_bits, not_rhs);
-            return detail::rvvreinterpret<T>(result_bits);
+            const auto lhs_bits = detail_rvv::rvv_to_unsigned_batch(lhs);
+            const auto rhs_bits = detail_rvv::rvv_to_unsigned_batch(rhs);
+            const auto not_rhs = detail_rvv::rvvnot(rhs_bits);
+            const auto result_bits = detail_rvv::rvvand(lhs_bits, not_rhs);
+            return detail_rvv::rvvreinterpret<T>(result_bits);
         }
 
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvmandn(lhs, rhs);
+            return detail_rvv::rvvmandn(lhs, rhs);
         }
 
         // bitwise_or
         template <class A, class T, detail::enable_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvor(lhs, rhs);
+            return detail_rvv::rvvor(lhs, rhs);
         }
 
-        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs);
-            const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs);
-            const auto result_bits = detail::rvvor(lhs_bits, rhs_bits);
-            return detail::rvvreinterpret<T>(result_bits);
+            const auto lhs_bits = detail_rvv::rvv_to_unsigned_batch(lhs);
+            const auto rhs_bits = detail_rvv::rvv_to_unsigned_batch(rhs);
+            const auto result_bits = detail_rvv::rvvor(lhs_bits, rhs_bits);
+            return detail_rvv::rvvreinterpret<T>(result_bits);
         }
 
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvmor(lhs, rhs);
+            return detail_rvv::rvvmor(lhs, rhs);
         }
 
         // bitwise_xor
         template <class A, class T, detail::enable_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvxor(lhs, rhs);
+            return detail_rvv::rvvxor(lhs, rhs);
         }
 
-        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs);
-            const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs);
-            const auto result_bits = detail::rvvxor(lhs_bits, rhs_bits);
-            return detail::rvvreinterpret<T>(result_bits);
+            const auto lhs_bits = detail_rvv::rvv_to_unsigned_batch(lhs);
+            const auto rhs_bits = detail_rvv::rvv_to_unsigned_batch(rhs);
+            const auto result_bits = detail_rvv::rvvxor(lhs_bits, rhs_bits);
+            return detail_rvv::rvvreinterpret<T>(result_bits);
         }
 
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvmxor(lhs, rhs);
+            return detail_rvv::rvvmxor(lhs, rhs);
         }
 
         // bitwise_not
         template <class A, class T, detail::enable_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
-            return detail::rvvnot(arg);
+            return detail_rvv::rvvnot(arg);
         }
 
-        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
-            const auto arg_bits = detail::rvv_to_unsigned_batch(arg);
-            const auto result_bits = detail::rvvnot(arg_bits);
-            return detail::rvvreinterpret<T>(result_bits);
+            const auto arg_bits = detail_rvv::rvv_to_unsigned_batch(arg);
+            const auto result_bits = detail_rvv::rvvnot(arg_bits);
+            return detail_rvv::rvvreinterpret<T>(result_bits);
         }
 
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
         {
-            return detail::rvvmnot(arg);
+            return detail_rvv::rvvmnot(arg);
         }
 
         /**********
          * Shifts *
          **********/
 
-        namespace detail
+        namespace detail_rvv
         {
             XSIMD_RVV_OVERLOAD_INTS(rvvsll_splat, (__riscv_vsll), , vec(vec, size_t))
             XSIMD_RVV_OVERLOAD_INTS(rvvsll, (__riscv_vsll), , vec(vec, uvec))
@@ -964,13 +904,13 @@ namespace xsimd
         {
             constexpr size_t size = sizeof(typename batch<T, A>::value_type) * 8;
             assert(0 <= n && static_cast<size_t>(n) < size && "index in bounds");
-            return detail::rvvsll_splat(arg, n);
+            return detail_rvv::rvvsll_splat(arg, n);
         }
 
         template <class A, class T, detail::enable_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvsll(lhs, detail::rvv_to_unsigned_batch<A, T>(rhs));
+            return detail_rvv::rvvsll(lhs, detail_rvv::rvv_to_unsigned_batch<A, T>(rhs));
         }
 
         // bitwise_rshift
@@ -979,20 +919,20 @@ namespace xsimd
         {
             constexpr size_t size = sizeof(typename batch<T, A>::value_type) * 8;
             assert(0 <= n && static_cast<size_t>(n) < size && "index in bounds");
-            return detail::rvvsr_splat(arg, n);
+            return detail_rvv::rvvsr_splat(arg, n);
         }
 
         template <class A, class T, detail::enable_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvsr(lhs, detail::rvv_to_unsigned_batch<A, T>(rhs));
+            return detail_rvv::rvvsr(lhs, detail_rvv::rvv_to_unsigned_batch<A, T>(rhs));
         }
 
         /**************
          * Reductions *
          **************/
 
-        namespace detail
+        namespace detail_rvv
         {
             XSIMD_RVV_OVERLOAD3(rvvredsum,
                                 (__riscv_vredsum),
@@ -1019,38 +959,38 @@ namespace xsimd
             template <class A, class T>
             XSIMD_INLINE T reduce_scalar(rvv_reg_t<T, types::detail::rvv_width_m1> const& arg)
             {
-                return detail::rvvmv_lane0(rvv_reg_t<T, A::width>(arg.get_bytes(), types::detail::XSIMD_RVV_BITCAST));
+                return detail_rvv::rvvmv_lane0(rvv_reg_t<T, A::width>(arg.get_bytes(), types::detail::XSIMD_RVV_BITCAST));
             }
         }
         // reduce_add
-        template <class A, class T, class V = typename batch<T, A>::value_type, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, class V = typename batch<T, A>::value_type, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE V reduce_add(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
-            const auto zero = detail::broadcast<T, types::detail::rvv_width_m1>(T(0));
-            const auto r = detail::rvvredsum(arg, zero);
-            return detail::reduce_scalar<A, T>(r);
+            const auto zero = detail_rvv::broadcast<T, types::detail::rvv_width_m1>(T(0));
+            const auto r = detail_rvv::rvvredsum(arg, zero);
+            return detail_rvv::reduce_scalar<A, T>(r);
         }
 
         // reduce_max
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE T reduce_max(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
-            const auto lowest = detail::broadcast<T, types::detail::rvv_width_m1>(std::numeric_limits<T>::lowest());
-            const auto r = detail::rvvredmax(arg, lowest);
-            return detail::reduce_scalar<A, T>(r);
+            const auto lowest = detail_rvv::broadcast<T, types::detail::rvv_width_m1>(std::numeric_limits<T>::lowest());
+            const auto r = detail_rvv::rvvredmax(arg, lowest);
+            return detail_rvv::reduce_scalar<A, T>(r);
         }
 
         // reduce_min
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE T reduce_min(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
-            const auto max = detail::broadcast<T, types::detail::rvv_width_m1>(std::numeric_limits<T>::max());
-            const auto r = detail::rvvredmin(arg, max);
-            return detail::reduce_scalar<A, T>(r);
+            const auto max = detail_rvv::broadcast<T, types::detail::rvv_width_m1>(std::numeric_limits<T>::max());
+            const auto r = detail_rvv::rvvredmin(arg, max);
+            return detail_rvv::reduce_scalar<A, T>(r);
         }
 
         // haddp
-        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> haddp(const batch<T, A>* row, requires_arch<rvv>) noexcept
         {
             constexpr std::size_t size = batch<T, A>::size;
@@ -1068,64 +1008,64 @@ namespace xsimd
          ***************/
 
         // eq
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvmseq(lhs, rhs);
+            return detail_rvv::rvvmseq(lhs, rhs);
         }
 
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            const auto neq_result = detail::rvvmxor(lhs, rhs);
-            return detail::rvvmnot(neq_result);
+            const auto neq_result = detail_rvv::rvvmxor(lhs, rhs);
+            return detail_rvv::rvvmnot(neq_result);
         }
 
         // neq
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvmsne(lhs, rhs);
+            return detail_rvv::rvvmsne(lhs, rhs);
         }
 
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvmxor(lhs, rhs);
+            return detail_rvv::rvvmxor(lhs, rhs);
         }
 
         // lt
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvmslt(lhs, rhs);
+            return detail_rvv::rvvmslt(lhs, rhs);
         }
 
         // le
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvmsle(lhs, rhs);
+            return detail_rvv::rvvmsle(lhs, rhs);
         }
 
         // gt
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvmsgt(lhs, rhs);
+            return detail_rvv::rvvmsgt(lhs, rhs);
         }
 
         // ge
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            return detail::rvvmsge(lhs, rhs);
+            return detail_rvv::rvvmsge(lhs, rhs);
         }
 
         /*************
          * Selection *
          *************/
-        namespace detail
+        namespace detail_rvv
         {
             XSIMD_RVV_OVERLOAD(rvvcompress, (__riscv_vcompress_tu), , vec(vec, vec, bvec))
         }
@@ -1134,13 +1074,13 @@ namespace xsimd
         XSIMD_INLINE batch<T, A> compress(batch<T, A> const& x, batch_bool<T, A> const& mask, requires_arch<rvv>) noexcept
         {
             auto zero = broadcast<A>(T(0), rvv {});
-            return detail::rvvcompress(zero, x, mask);
+            return detail_rvv::rvvcompress(zero, x, mask);
         }
 
         /***************
          * Permutation *
          ***************/
-        namespace detail
+        namespace detail_rvv
         {
             XSIMD_RVV_OVERLOAD(rvvrgather, (__riscv_vrgather), , vec(vec, uvec))
             XSIMD_RVV_OVERLOAD(rvvslideup, (__riscv_vslideup), , vec(vec, vec, size_t))
@@ -1153,7 +1093,7 @@ namespace xsimd
         {
             static_assert(batch<T, A>::size == sizeof...(idx), "invalid swizzle indices");
             const batch<I, A> indices { idx... };
-            return detail::rvvrgather(arg, indices);
+            return detail_rvv::rvvrgather(arg, indices);
         }
 
         template <class A, class T, class I, I... idx>
@@ -1172,18 +1112,18 @@ namespace xsimd
 
         // extract_pair
 
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, size_t n, requires_arch<rvv>) noexcept
         {
-            const auto tmp = detail::rvvslidedown(rhs, n);
-            return detail::rvvslideup(tmp, lhs, lhs.size - n);
+            const auto tmp = detail_rvv::rvvslidedown(rhs, n);
+            return detail_rvv::rvvslideup(tmp, lhs, lhs.size - n);
         }
 
         // select
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<rvv>) noexcept
         {
-            return detail::rvvmerge(b, a, cond);
+            return detail_rvv::rvvmerge(b, a, cond);
         }
 
         template <class A, class T, bool... b>
@@ -1193,29 +1133,29 @@ namespace xsimd
         }
 
         // zip_lo
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            const auto index = detail::vindex<A, as_unsigned_integer_t<T>, 0, -1>();
-            const auto mask = detail::pmask8<T, A::width>(0xaa);
-            return detail::rvvmerge(detail::rvvrgather(lhs, index),
-                                    detail::rvvrgather(rhs, index),
-                                    mask);
+            const auto index = detail_rvv::vindex<A, as_unsigned_integer_t<T>, 0, -1>();
+            const auto mask = detail_rvv::pmask8<T, A::width>(0xaa);
+            return detail_rvv::rvvmerge(detail_rvv::rvvrgather(lhs, index),
+                                        detail_rvv::rvvrgather(rhs, index),
+                                        mask);
         }
 
         // zip_hi
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<rvv>) noexcept
         {
-            const auto index = detail::vindex<A, as_unsigned_integer_t<T>, batch<T, A>::size / 2, -1>();
-            const auto mask = detail::pmask8<T, A::width>(0xaa);
-            return detail::rvvmerge(detail::rvvrgather(lhs, index),
-                                    detail::rvvrgather(rhs, index),
-                                    mask);
+            const auto index = detail_rvv::vindex<A, as_unsigned_integer_t<T>, batch<T, A>::size / 2, -1>();
+            const auto mask = detail_rvv::pmask8<T, A::width>(0xaa);
+            return detail_rvv::rvvmerge(detail_rvv::rvvrgather(lhs, index),
+                                        detail_rvv::rvvrgather(rhs, index),
+                                        mask);
         }
 
         // store_complex
-        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE void store_complex_aligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<rvv>) noexcept
         {
             const auto lo = zip_lo(src.real(), src.imag());
@@ -1225,7 +1165,7 @@ namespace xsimd
             store_aligned(buf + lo.size, hi, rvv {});
         }
 
-        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE void store_complex_unaligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<rvv>) noexcept
         {
             store_complex_aligned(dst, src, rvv {});
@@ -1235,7 +1175,7 @@ namespace xsimd
          * Floating-point arithmetic *
          *****************************/
 
-        namespace detail
+        namespace detail_rvv
         {
             XSIMD_RVV_OVERLOAD_FLOATS(rvvfsqrt, (__riscv_vfsqrt), , vec(vec))
             XSIMD_RVV_OVERLOAD_FLOATS(rvvfrec7, (__riscv_vfrec7), , vec(vec))
@@ -1243,26 +1183,26 @@ namespace xsimd
         }
 
         // rsqrt
-        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> rsqrt(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
-            auto approx = detail::rvvfrsqrt7(arg);
+            auto approx = detail_rvv::rvvfrsqrt7(arg);
             approx = approx * (1.5 - (0.5 * arg * approx * approx));
             return approx;
         }
 
         // sqrt
-        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> sqrt(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
-            return detail::rvvfsqrt(arg);
+            return detail_rvv::rvvfsqrt(arg);
         }
 
         // reciprocal
-        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> reciprocal(const batch<T, A>& arg, requires_arch<rvv>) noexcept
         {
-            return detail::rvvfrec7(arg);
+            return detail_rvv::rvvfrec7(arg);
         }
 
         /******************************
@@ -1270,7 +1210,7 @@ namespace xsimd
          ******************************/
 
         // fast_cast
-        namespace detail
+        namespace detail_rvv
         {
             XSIMD_RVV_OVERLOAD2(rvvfcvt_rtz, // truncating conversion, like C.
                                 (__riscv_vfcvt_rtz_x),
@@ -1329,85 +1269,106 @@ namespace xsimd
             using U = as_unsigned_integer_t<T>;
             const auto values = set(batch<U, rvv> {}, rvv {}, static_cast<U>(args)...);
             const auto zero = broadcast<A>(U(0), rvv {});
-            detail::rvv_bool_t<T> result = detail::rvvmsne(values, zero);
+            detail_rvv::rvv_bool_t<T> result = detail_rvv::rvvmsne(values, zero);
             return result;
         }
 
         // first
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE T first(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
-            return detail::rvvmv_lane0(arg);
+            return detail_rvv::rvvmv_lane0(arg);
         }
 
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE std::complex<T> first(batch<std::complex<T>, A> const& arg, requires_arch<rvv>) noexcept
         {
-            return std::complex<T> { detail::rvvmv_lane0(arg.real()), detail::rvvmv_lane0(arg.imag()) };
+            return std::complex<T> { detail_rvv::rvvmv_lane0(arg.real()), detail_rvv::rvvmv_lane0(arg.imag()) };
         }
 
         // insert
-        template <class A, class T, size_t I, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, size_t I, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& arg, T val, index<I>, requires_arch<rvv>) noexcept
         {
-            const auto mask = detail::pmask<T, A::width>(uint64_t(1) << I);
-            return detail::rvvmerge_splat(arg, val, mask);
+            const auto mask = detail_rvv::pmask<T, A::width>(uint64_t(1) << I);
+            return detail_rvv::rvvmerge_splat(arg, val, mask);
         }
 
         // get
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE T get(batch<T, A> const& arg, size_t i, requires_arch<rvv>) noexcept
         {
-            const auto tmp = detail::rvvslidedown(arg, i);
-            return detail::rvvmv_lane0(tmp);
+            const auto tmp = detail_rvv::rvvslidedown(arg, i);
+            return detail_rvv::rvvmv_lane0(tmp);
         }
 
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE std::complex<T> get(batch<std::complex<T>, A> const& arg, size_t i, requires_arch<rvv>) noexcept
         {
-            const auto tmpr = detail::rvvslidedown(arg.real(), i);
-            const auto tmpi = detail::rvvslidedown(arg.imag(), i);
-            return std::complex<T> { detail::rvvmv_lane0(tmpr), detail::rvvmv_lane0(tmpi) };
+            const auto tmpr = detail_rvv::rvvslidedown(arg.real(), i);
+            const auto tmpi = detail_rvv::rvvslidedown(arg.imag(), i);
+            return std::complex<T> { detail_rvv::rvvmv_lane0(tmpr), detail_rvv::rvvmv_lane0(tmpi) };
+        }
+
+        // get (compile-time index): skip the slidedown when I == 0; lane 0 maps straight to the scalar move.
+        template <class A, size_t I, class T, detail::enable_arithmetic_t<T> = 0>
+        XSIMD_INLINE T get(batch<T, A> const& arg, index<I>, requires_arch<rvv>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0)
+            {
+                return detail_rvv::rvvmv_lane0(arg);
+            }
+            return get(arg, I, rvv {});
+        }
+
+        template <class A, size_t I, class T, detail::enable_arithmetic_t<T> = 0>
+        XSIMD_INLINE std::complex<T> get(batch<std::complex<T>, A> const& arg, index<I>, requires_arch<rvv>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0)
+            {
+                return std::complex<T> { detail_rvv::rvvmv_lane0(arg.real()), detail_rvv::rvvmv_lane0(arg.imag()) };
+            }
+            return get(arg, I, rvv {});
         }
 
         // all
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
         {
-            return detail::rvvcpop(arg) == batch_bool<T, A>::size;
+            return detail_rvv::rvvcpop(arg) == batch_bool<T, A>::size;
         }
 
         // any
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
         {
-            return detail::rvvcpop(arg) > 0;
+            return detail_rvv::rvvcpop(arg) > 0;
         }
 
         // bitwise_cast
-        template <class A, class T, class R, detail::rvv_enable_all_t<T> = 0, detail::rvv_enable_all_t<R> = 0>
+        template <class A, class T, class R, detail::enable_arithmetic_t<T> = 0, detail::enable_arithmetic_t<R> = 0>
         XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<rvv>) noexcept
         {
-            return detail::rvv_reg_t<R, A::width>(arg.data.get_bytes(), types::detail::XSIMD_RVV_BITCAST);
+            return detail_rvv::rvv_reg_t<R, A::width>(arg.data.get_bytes(), types::detail::XSIMD_RVV_BITCAST);
         }
 
         // batch_bool_cast
-        template <class A, class T_out, class T_in, detail::rvv_enable_all_t<T_in> = 0>
+        template <class A, class T_out, class T_in, detail::enable_arithmetic_t<T_in> = 0>
         XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& arg, batch_bool<T_out, A> const&, requires_arch<rvv>) noexcept
         {
-            using intermediate_t = typename detail::rvv_bool_t<T_out>;
+            using intermediate_t = typename detail_rvv::rvv_bool_t<T_out>;
             return intermediate_t(arg.data);
         }
 
         // from_bool
-        template <class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<rvv>) noexcept
         {
             const auto zero = broadcast<A>(T(0), rvv {});
-            return detail::rvvmerge_splat(zero, T(1), arg);
+            return detail_rvv::rvvmerge_splat(zero, T(1), arg);
         }
 
-        namespace detail
+        namespace detail_rvv
         {
             template <size_t Width>
             XSIMD_INLINE vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i)
@@ -1438,31 +1399,31 @@ namespace xsimd
         }
 
         // slide_left
-        template <size_t N, class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <size_t N, class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
             const auto zero = broadcast<A>(uint8_t(0), rvv {});
             const auto bytes = arg.data.get_bytes();
-            return detail::rvvreinterpret<T>(detail::rvvslideup(zero, bytes, N));
+            return detail_rvv::rvvreinterpret<T>(detail_rvv::rvvslideup(zero, bytes, N));
         }
 
         // slide_right
-        template <size_t N, class A, class T, detail::rvv_enable_all_t<T> = 0>
+        template <size_t N, class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
-            using reg_t = detail::rvv_reg_t<T, A::width>;
+            using reg_t = detail_rvv::rvv_reg_t<T, A::width>;
             const auto bytes = arg.data.get_bytes();
-            return reg_t(detail::rvvslidedownbytes<A::width>(bytes, N), types::detail::XSIMD_RVV_BITCAST);
+            return reg_t(detail_rvv::rvvslidedownbytes<A::width>(bytes, N), types::detail::XSIMD_RVV_BITCAST);
         }
 
         // isnan
-        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> isnan(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
             return !(arg == arg);
         }
 
-        namespace detail
+        namespace detail_rvv
         {
             template <class T>
             using rvv_as_signed_integer_t = as_signed_integer_t<as_unsigned_integer_t<T>>;
@@ -1481,29 +1442,29 @@ namespace xsimd
         }
 
         // nearbyint_as_int
-        template <class A, class T, class U = detail::rvv_as_signed_integer_t<T>>
+        template <class A, class T, class U = detail_rvv::rvv_as_signed_integer_t<T>>
         XSIMD_INLINE batch<U, A> nearbyint_as_int(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
             // Reference rounds ties to nearest even
-            return detail::rvvfcvt_default(arg);
+            return detail_rvv::rvvfcvt_default(arg);
         }
 
         // round
-        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> round(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
             // Round ties away from zero.
             const auto mask = abs(arg) < constants::maxflint<batch<T, A>>();
-            return select(mask, to_float(detail::rvvfcvt_afz(arg)), arg, rvv {});
+            return select(mask, to_float(detail_rvv::rvvfcvt_afz(arg)), arg, rvv {});
         }
 
         // nearbyint
-        template <class A, class T, detail::rvv_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> nearbyint(batch<T, A> const& arg, requires_arch<rvv>) noexcept
         {
             // Round according to current rounding mode.
             const auto mask = abs(arg) < constants::maxflint<batch<T, A>>();
-            return select(mask, to_float(detail::rvvfcvt_default(arg)), arg, rvv {});
+            return select(mask, to_float(detail_rvv::rvvfcvt_default(arg)), arg, rvv {});
         }
 
         // mask
@@ -1516,12 +1477,12 @@ namespace xsimd
             XSIMD_IF_CONSTEXPR((8 * sizeof(T)) >= batch_bool<T, A>::size)
             {
                 // (A) Easy case: the number of slots fits in T.
-                const auto zero = detail::broadcast<as_unsigned_integer_t<T>, types::detail::rvv_width_m1>(T(0));
-                auto ones = detail::broadcast<as_unsigned_integer_t<T>, A::width>(1);
-                auto iota = detail::rvvid(as_unsigned_integer_t<T> {});
-                auto upowers = detail::rvvsll(ones, iota);
+                const auto zero = detail_rvv::broadcast<as_unsigned_integer_t<T>, types::detail::rvv_width_m1>(T(0));
+                auto ones = detail_rvv::broadcast<as_unsigned_integer_t<T>, A::width>(1);
+                auto iota = detail_rvv::rvvid(as_unsigned_integer_t<T> {});
+                auto upowers = detail_rvv::rvvsll(ones, iota);
                 auto r = __riscv_vredor(self.data.as_mask(), upowers, (typename decltype(zero)::register_type)zero, batch_bool<T, A>::size);
-                return detail::reduce_scalar<A, as_unsigned_integer_t<T>>(r);
+                return detail_rvv::reduce_scalar<A, as_unsigned_integer_t<T>>(r);
             }
             else XSIMD_IF_CONSTEXPR((2 * 8 * sizeof(T)) == batch_bool<T, A>::size)
             {
@@ -1537,20 +1498,20 @@ namespace xsimd
                 };
 
                 // The low part is similar to the approach in (A).
-                const auto zero = detail::broadcast<as_unsigned_integer_t<T>, types::detail::rvv_width_m1>(T(0));
-                auto ones = detail::broadcast<as_unsigned_integer_t<T>, A::width>(1);
-                auto iota = detail::rvvid(as_unsigned_integer_t<T> {});
-                auto upowers = detail::rvvsll(ones, iota);
+                const auto zero = detail_rvv::broadcast<as_unsigned_integer_t<T>, types::detail::rvv_width_m1>(T(0));
+                auto ones = detail_rvv::broadcast<as_unsigned_integer_t<T>, A::width>(1);
+                auto iota = detail_rvv::rvvid(as_unsigned_integer_t<T> {});
+                auto upowers = detail_rvv::rvvsll(ones, iota);
                 auto low_mask = self & make_batch_bool_constant<T, LowerHalf, A>();
                 auto r_low = __riscv_vredor(low_mask.data.as_mask(), upowers, (typename decltype(zero)::register_type)zero, batch_bool<T, A>::size);
 
                 // The high part requires to slide the upower filter to match the high mask.
-                upowers = detail::rvvslideup(upowers, upowers, 8 * sizeof(T));
+                upowers = detail_rvv::rvvslideup(upowers, upowers, 8 * sizeof(T));
                 auto high_mask = self & make_batch_bool_constant<T, UpperHalf, A>();
                 auto r_high = __riscv_vredor(high_mask.data.as_mask(), upowers, (typename decltype(zero)::register_type)zero, batch_bool<T, A>::size);
 
                 // Concatenate the two parts.
-                return (uint64_t)detail::reduce_scalar<A, as_unsigned_integer_t<T>>(r_low) | ((uint64_t)detail::reduce_scalar<A, as_unsigned_integer_t<T>>(r_high) << (8 * sizeof(T)));
+                return (uint64_t)detail_rvv::reduce_scalar<A, as_unsigned_integer_t<T>>(r_low) | ((uint64_t)detail_rvv::reduce_scalar<A, as_unsigned_integer_t<T>>(r_high) << (8 * sizeof(T)));
             }
             else
             {
diff --git a/include/xsimd/arch/xsimd_scalar.hpp b/include/xsimd/arch/xsimd_scalar.hpp
index 6d9f19b31..28d0ba21e 100644
--- a/include/xsimd/arch/xsimd_scalar.hpp
+++ b/include/xsimd/arch/xsimd_scalar.hpp
@@ -12,6 +12,8 @@
 #ifndef XSIMD_SCALAR_HPP
 #define XSIMD_SCALAR_HPP
 
+#include "../config/xsimd_macros.hpp"
+
 #include <cassert>
 #include <cmath>
 #include <complex>
@@ -20,10 +22,8 @@
 #include <limits>
 #include <type_traits>
 
-#include "xsimd/config/xsimd_inline.hpp"
-
 #ifdef XSIMD_ENABLE_XTL_COMPLEX
-#include "xtl/xcomplex.hpp"
+#include <xtl/xcomplex.hpp>
 #endif
 
 #ifdef __APPLE__
@@ -819,7 +819,7 @@ namespace xsimd
         return static_cast<long double>(std::signbit(x));
     }
 
-    template <class T>
+    template <class T, class = std::enable_if_t<std::is_scalar<T>::value>>
     XSIMD_INLINE auto signbit(T const& v) noexcept
     {
         return bitofsign(v);
diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp
index cccba8144..c6cfb5f07 100644
--- a/include/xsimd/arch/xsimd_sse2.hpp
+++ b/include/xsimd/arch/xsimd_sse2.hpp
@@ -12,13 +12,14 @@
 #ifndef XSIMD_SSE2_HPP
 #define XSIMD_SSE2_HPP
 
+#include "../types/xsimd_batch_constant.hpp"
+#include "../types/xsimd_sse2_register.hpp"
+#include "./utils/shifts.hpp"
+
 #include <complex>
 #include <limits>
 #include <type_traits>
 
-#include "../types/xsimd_batch_constant.hpp"
-#include "../types/xsimd_sse2_register.hpp"
-
 namespace xsimd
 {
     template <typename T, class A, bool... Values>
@@ -326,6 +327,36 @@ namespace xsimd
             return bitwise_lshift<shift>(self, common {});
         }
 
+        // bitwise_lshift multiple (constant): per-lane compile-time shift amounts
+        // Missing implementations are dispatched to the `batch` overload in xsimd_api.
+        template <class A, class T, T... Vs, detail::enable_sized_integral_t<T, 2> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_lshift(
+            batch<T, A> const& self, batch_constant<T, A, Vs...> shifts, requires_arch<sse2> req) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(utils::all_equals(shifts))
+            {
+                return bitwise_lshift<shifts.get(0), A>(self, req); // uniform shift: forward to the compile-time single-shift overload
+            }
+            constexpr auto mults = batch_constant<T, A, static_cast<T>(1u << Vs)...>(); // per-lane multipliers 2^Vs
+            return _mm_mullo_epi16(self, mults.as_batch()); // low 16 bits of x * 2^k are x << k
+        }
+
+        template <class A, class T, T... Vs, detail::enable_sized_integral_t<T, 1> = 0>
+        XSIMD_INLINE batch<T, A> bitwise_lshift(
+            batch<T, A> const& self, batch_constant<T, A, Vs...> shifts, requires_arch<sse2> req) noexcept
+        {
+            using uint_t = std::make_unsigned_t<T>;
+
+            XSIMD_IF_CONSTEXPR(utils::all_equals(shifts))
+            {
+                return bitwise_lshift<shifts.get(0), A>(self, req);
+            }
+            return bitwise_cast<T>(
+                utils::bitwise_lshift_as_twice_larger<uint_t>(
+                    bitwise_cast<uint_t>(self),
+                    batch_constant<uint_t, A, static_cast<uint_t>(Vs)...> {}));
+        }
+
         // bitwise_not
         template <class A>
         XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<sse2>) noexcept
@@ -685,6 +716,7 @@ namespace xsimd
                 __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
                 __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); //  2^52
                 __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); //  2^84 + 2^52
+                detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)");
                 return _mm_add_pd(f, _mm_castsi128_pd(xL));
             }
 
@@ -699,6 +731,7 @@ namespace xsimd
                 __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
                 __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); //  2^52
                 __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); //  3*2^67 + 2^52
+                detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)");
                 return _mm_add_pd(f, _mm_castsi128_pd(xL));
             }
 
@@ -1400,11 +1433,70 @@ namespace xsimd
 
         // mul
         template <class A>
+        XSIMD_INLINE batch<uint8_t, A> mul(batch<uint8_t, A> const& self, batch<uint8_t, A> const& other, requires_arch<sse2>) noexcept
+        {
+            // Low byte of an N-bit*N-bit product is bitwise identical for
+            // signed and unsigned operands. Split into even/odd bytes inside
+            // each 16-bit lane, do two 16-bit mullos, then re-interleave.
+            __m128i mask = _mm_set1_epi16(0x00FF);
+            __m128i a_even = _mm_and_si128(self, mask);
+            __m128i b_even = _mm_and_si128(other, mask);
+            __m128i a_odd = _mm_srli_epi16(self, 8);
+            __m128i b_odd = _mm_srli_epi16(other, 8);
+            __m128i p_even = _mm_and_si128(_mm_mullo_epi16(a_even, b_even), mask);
+            __m128i p_odd = _mm_slli_epi16(_mm_mullo_epi16(a_odd, b_odd), 8);
+            return _mm_or_si128(p_even, p_odd);
+        }
+        template <class A>
+        XSIMD_INLINE batch<int8_t, A> mul(batch<int8_t, A> const& self, batch<int8_t, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return bitwise_cast<int8_t>(mul(bitwise_cast<uint8_t>(self), bitwise_cast<uint8_t>(other), sse2 {}));
+        }
+        template <class A>
         XSIMD_INLINE batch<int16_t, A> mul(batch<int16_t, A> const& self, batch<int16_t, A> const& other, requires_arch<sse2>) noexcept
         {
             return _mm_mullo_epi16(self, other);
         }
 
+        // mul_hi
+        template <class A>
+        XSIMD_INLINE batch<int8_t, A> mul_hi(batch<int8_t, A> const& self, batch<int8_t, A> const& other, requires_arch<sse2>) noexcept
+        {
+            // Sign-extend bytes to 16-bit (unpack-with-self followed by srai 8
+            // duplicates the byte then arithmetic-shifts the sign in), do the
+            // 16x16->16 multiply, then take the high byte of each product.
+            __m128i a_lo = _mm_srai_epi16(_mm_unpacklo_epi8(self, self), 8);
+            __m128i a_hi = _mm_srai_epi16(_mm_unpackhi_epi8(self, self), 8);
+            __m128i b_lo = _mm_srai_epi16(_mm_unpacklo_epi8(other, other), 8);
+            __m128i b_hi = _mm_srai_epi16(_mm_unpackhi_epi8(other, other), 8);
+            __m128i p_lo = _mm_srai_epi16(_mm_mullo_epi16(a_lo, b_lo), 8);
+            __m128i p_hi = _mm_srai_epi16(_mm_mullo_epi16(a_hi, b_hi), 8);
+            // results already lie in [-128, 127], so packs is exact (no saturation kicks in).
+            return _mm_packs_epi16(p_lo, p_hi);
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint8_t, A> mul_hi(batch<uint8_t, A> const& self, batch<uint8_t, A> const& other, requires_arch<sse2>) noexcept
+        {
+            __m128i zero = _mm_setzero_si128(); // interleaving with zero bytes zero-extends each byte to a 16-bit lane
+            __m128i a_lo = _mm_unpacklo_epi8(self, zero); // low 8 bytes of self, widened
+            __m128i a_hi = _mm_unpackhi_epi8(self, zero); // high 8 bytes of self, widened
+            __m128i b_lo = _mm_unpacklo_epi8(other, zero);
+            __m128i b_hi = _mm_unpackhi_epi8(other, zero);
+            __m128i p_lo = _mm_srli_epi16(_mm_mullo_epi16(a_lo, b_lo), 8); // 255 * 255 fits in 16 bits; keep the high byte
+            __m128i p_hi = _mm_srli_epi16(_mm_mullo_epi16(a_hi, b_hi), 8);
+            return _mm_packus_epi16(p_lo, p_hi); // every lane is <= 0xFF, so the unsigned saturating pack is exact
+        }
+        template <class A>
+        XSIMD_INLINE batch<int16_t, A> mul_hi(batch<int16_t, A> const& self, batch<int16_t, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_mulhi_epi16(self, other);
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint16_t, A> mul_hi(batch<uint16_t, A> const& self, batch<uint16_t, A> const& other, requires_arch<sse2>) noexcept
+        {
+            return _mm_mulhi_epu16(self, other);
+        }
+
         // nearbyint_as_int
         template <class A>
         XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
@@ -1927,6 +2019,23 @@ namespace xsimd
             return _mm_storeu_pd(mem, self);
         }
 
+        // store_stream: non-temporal stores (movntps / movntdq / movntpd); mem must be 16-byte aligned
+        template <class A>
+        XSIMD_INLINE void store_stream(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
+        {
+            _mm_stream_ps(mem, self);
+        }
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value, void>>
+        XSIMD_INLINE void store_stream(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
+        {
+            _mm_stream_si128((__m128i*)mem, self); // one 128-bit store whatever the integer width
+        }
+        template <class A>
+        XSIMD_INLINE void store_stream(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
+        {
+            _mm_stream_pd(mem, self);
+        }
+
         // sub
         template <class A>
         XSIMD_INLINE batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
@@ -2226,6 +2335,57 @@ namespace xsimd
             }
         }
 
+        // get (must appear after first and swizzle so it can delegate through the xsimd API)
+        namespace detail
+        {
+            // broadcast lane index I across a batch_constant<IdxT, A, I, I, ..., I> matching batch<T, A>::size
+            template <class T, class A, size_t I, size_t... Is>
+            XSIMD_INLINE auto broadcast_lane_index(std::index_sequence<Is...>) noexcept
+                -> batch_constant<as_unsigned_integer_t<T>, A, static_cast<as_unsigned_integer_t<T>>(Is * 0 + I)...>
+            {
+                return {};
+            }
+
+            template <class T, class A, size_t I>
+            XSIMD_INLINE auto broadcast_lane_index() noexcept
+                -> decltype(broadcast_lane_index<T, A, I>(std::make_index_sequence<batch<T, A>::size> {}))
+            {
+                return {};
+            }
+        }
+
+        template <class A, size_t I, class T>
+        XSIMD_INLINE typename std::enable_if<std::is_integral<T>::value && sizeof(T) <= 2, T>::type
+        get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0)
+            {
+                return first(self, A {});
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return static_cast<T>(_mm_extract_epi16(self, I));
+            }
+            else
+            {
+                // SSE2 has no pextrb; byte-lane shift + movd is the shortest path for I>0.
+                return static_cast<T>(_mm_cvtsi128_si32(_mm_srli_si128(self, I)) & 0xFF);
+            }
+        }
+
+        template <class A, size_t I, class T>
+        XSIMD_INLINE typename std::enable_if<(std::is_integral<T>::value && sizeof(T) >= 4) || std::is_floating_point<T>::value, T>::type
+        get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<sse2>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0)
+            {
+                return first(self, A {});
+            }
+            else
+            {
+                return first(swizzle(self, detail::broadcast_lane_index<T, A, I>(), A {}), A {});
+            }
+        }
     }
 }
 
diff --git a/include/xsimd/arch/xsimd_sse3.hpp b/include/xsimd/arch/xsimd_sse3.hpp
index 3ece0ae3b..a860e4d61 100644
--- a/include/xsimd/arch/xsimd_sse3.hpp
+++ b/include/xsimd/arch/xsimd_sse3.hpp
@@ -13,6 +13,7 @@
 #define XSIMD_SSE3_HPP
 
 #include "../types/xsimd_sse3_register.hpp"
+
 #include <type_traits>
 
 namespace xsimd
diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp
index 030fb29db..bc2b0f3de 100644
--- a/include/xsimd/arch/xsimd_sse4_1.hpp
+++ b/include/xsimd/arch/xsimd_sse4_1.hpp
@@ -12,11 +12,11 @@
 #ifndef XSIMD_SSE4_1_HPP
 #define XSIMD_SSE4_1_HPP
 
-#include <type_traits>
-
 #include "../types/xsimd_sse4_1_register.hpp"
 #include "./common/xsimd_common_cast.hpp"
 
+#include <type_traits>
+
 namespace xsimd
 {
 
@@ -41,6 +41,15 @@ namespace xsimd
             return _mm_ceil_pd(self);
         }
 
+        // bitwise_lshift multiple (constant)
+        template <class A, uint32_t... Vs>
+        XSIMD_INLINE batch<uint32_t, A> bitwise_lshift(
+            batch<uint32_t, A> const& self, batch_constant<uint32_t, A, Vs...>, requires_arch<sse4_1>) noexcept
+        {
+            constexpr auto mults = batch_constant<uint32_t, A, static_cast<uint32_t>(1u << Vs)...>();
+            return _mm_mullo_epi32(self, mults.as_batch());
+        }
+
         // fast_cast
         namespace detail
         {
@@ -53,6 +62,7 @@ namespace xsimd
                 xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); //  3*2^67
                 __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0x88); //  2^52
                 __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); //  3*2^67 + 2^52
+                detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)");
                 return _mm_add_pd(f, _mm_castsi128_pd(xL));
             }
 
@@ -64,6 +74,7 @@ namespace xsimd
                 xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); //  2^84
                 __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); //  2^52
                 __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); //  2^84 + 2^52
+                detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)");
                 return _mm_add_pd(f, _mm_castsi128_pd(xL));
             }
         }
@@ -94,6 +105,41 @@ namespace xsimd
             return _mm_floor_pd(self);
         }
 
+        // get: extract lane I of an integral batch with the SSE4.1 extract intrinsics
+        template <class A, size_t I, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<sse4_1>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(I == 0)
+            {
+                return first(self, sse2 {}); // lane 0: reuse the sse2 `first` kernel
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                return static_cast<T>(_mm_extract_epi8(self, I)); // pextrb
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+            {
+                return static_cast<T>(_mm_extract_epi16(self, I)); // pextrw
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+            {
+                return static_cast<T>(_mm_extract_epi32(self, I)); // pextrd
+            }
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+            {
+#if defined(__x86_64__)
+                return static_cast<T>(_mm_extract_epi64(self, I)); // pextrq, 64-bit targets only
+#else
+                return get(self, ::xsimd::index<I> {}, sse2 {}); // no 64-bit extract here: defer to the sse2 kernel
+#endif
+            }
+            else
+            {
+                assert(false && "unsupported arch/op combination");
+                return {};
+            }
+        }
+
         // insert
         template <class A, class T, size_t I, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse4_1>) noexcept
@@ -217,6 +263,23 @@ namespace xsimd
             }
         }
 
+        // load_stream: non-temporal loads via _mm_stream_load_si128 (movntdqa); mem must be 16-byte aligned
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value, void>>
+        XSIMD_INLINE batch<T, A> load_stream(T const* mem, convert<T>, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_stream_load_si128((__m128i*)mem); // the C-style cast also drops const from mem
+        }
+        template <class A>
+        XSIMD_INLINE batch<float, A> load_stream(float const* mem, convert<float>, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)mem)); // integer streaming load, bits reinterpreted as float (no conversion)
+        }
+        template <class A>
+        XSIMD_INLINE batch<double, A> load_stream(double const* mem, convert<double>, requires_arch<sse4_1>) noexcept
+        {
+            return _mm_castsi128_pd(_mm_stream_load_si128((__m128i*)mem)); // integer streaming load, bits reinterpreted as double (no conversion)
+        }
+
         // min
         template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
         XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
@@ -296,6 +359,41 @@ namespace xsimd
             }
         }
 
+        // mul_hi
+        template <class A>
+        XSIMD_INLINE batch<int32_t, A> mul_hi(batch<int32_t, A> const& self, batch<int32_t, A> const& other, requires_arch<sse4_1>) noexcept
+        {
+            __m128i even = _mm_mul_epi32(self, other); // 64-bit products in lanes 0,2
+            __m128i odd = _mm_mul_epi32(_mm_srli_epi64(self, 32), _mm_srli_epi64(other, 32));
+            // hi halves in the low 32 of each 64 lane of (even>>32), and in the high 32 of odd
+            __m128i even_hi = _mm_srli_epi64(even, 32);
+            // blend: 32-bit lanes {even_hi[0], odd_hi[1], even_hi[2], odd_hi[3]}
+            return _mm_blend_epi16(even_hi, odd, 0xCC);
+        }
+        template <class A>
+        XSIMD_INLINE batch<uint32_t, A> mul_hi(batch<uint32_t, A> const& self, batch<uint32_t, A> const& other, requires_arch<sse4_1>) noexcept
+        {
+            __m128i even = _mm_mul_epu32(self, other); // 64-bit products of 32-bit lanes 0,2
+            __m128i odd = _mm_mul_epu32(_mm_srli_epi64(self, 32), _mm_srli_epi64(other, 32)); // 64-bit products of 32-bit lanes 1,3
+            __m128i even_hi = _mm_srli_epi64(even, 32); // high halves of the even products moved down to 32-bit lanes 0,2
+            return _mm_blend_epi16(even_hi, odd, 0xCC); // 0xCC takes 32-bit lanes 1,3 from odd, which already hold its high halves
+        }
+
+        template <class A>
+        XSIMD_INLINE batch<uint64_t, A> mul_hi(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<sse4_1>) noexcept
+        {
+            return detail::mulhi_u64_core<A>(self, other,
+                                             [](batch<uint64_t, A> a, batch<uint64_t, A> b)
+                                             { return batch<uint64_t, A>(_mm_mul_epu32(a, b)); });
+        }
+        template <class A>
+        XSIMD_INLINE batch<int64_t, A> mul_hi(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<sse4_1>) noexcept
+        {
+            return detail::mulhi_i64_core<A>(self, other,
+                                             [](batch<uint64_t, A> a, batch<uint64_t, A> b)
+                                             { return batch<uint64_t, A>(_mm_mul_epu32(a, b)); });
+        }
+
         // nearbyint
         template <class A>
         XSIMD_INLINE batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
diff --git a/include/xsimd/arch/xsimd_sse4_2.hpp b/include/xsimd/arch/xsimd_sse4_2.hpp
index 5265182f9..1619cf70a 100644
--- a/include/xsimd/arch/xsimd_sse4_2.hpp
+++ b/include/xsimd/arch/xsimd_sse4_2.hpp
@@ -12,10 +12,10 @@
 #ifndef XSIMD_SSE4_2_HPP
 #define XSIMD_SSE4_2_HPP
 
-#include <limits>
-
 #include "../types/xsimd_sse4_2_register.hpp"
 
+#include <limits>
+
 namespace xsimd
 {
 
diff --git a/include/xsimd/arch/xsimd_ssse3.hpp b/include/xsimd/arch/xsimd_ssse3.hpp
index 85e03288d..bae3fb9c4 100644
--- a/include/xsimd/arch/xsimd_ssse3.hpp
+++ b/include/xsimd/arch/xsimd_ssse3.hpp
@@ -12,12 +12,12 @@
 #ifndef XSIMD_SSSE3_HPP
 #define XSIMD_SSSE3_HPP
 
-#include <cstddef>
-#include <type_traits>
-
 #include "../types/xsimd_ssse3_register.hpp"
 #include "../types/xsimd_utils.hpp"
 
+#include <cstddef>
+#include <type_traits>
+
 namespace xsimd
 {
 
diff --git a/include/xsimd/arch/xsimd_sve.hpp b/include/xsimd/arch/xsimd_sve.hpp
index 4a45e09aa..2a46947bf 100644
--- a/include/xsimd/arch/xsimd_sve.hpp
+++ b/include/xsimd/arch/xsimd_sve.hpp
@@ -13,10 +13,18 @@
 #ifndef XSIMD_SVE_HPP
 #define XSIMD_SVE_HPP
 
+#include "../config/xsimd_config.hpp"
+#include "../config/xsimd_macros.hpp"
+#include "../types/xsimd_sve_register.hpp"
+
 #include <complex>
 #include <type_traits>
 
-#include "../types/xsimd_sve_register.hpp"
+// Define an inline namespace carrying the explicit SVE vector size to avoid ODR violations
+// when dynamically dispatching between different SVE sizes.
+// While most code is safe from ODR violations as the size is already encoded in the
+// register (and hence batch) types, utilities can easily fall prey to this issue.
+#define XSIMD_SVE_NAMESPACE XSIMD_CONCAT(sve, XSIMD_SVE_BITS)
 
 namespace xsimd
 {
@@ -25,106 +33,93 @@ namespace xsimd
 
     namespace kernel
     {
-        namespace detail
-        {
-            using xsimd::index;
-            using xsimd::types::detail::sve_vector_type;
-
-            // predicate creation
-            XSIMD_INLINE svbool_t sve_ptrue_impl(index<1>) noexcept { return svptrue_b8(); }
-            XSIMD_INLINE svbool_t sve_ptrue_impl(index<2>) noexcept { return svptrue_b16(); }
-            XSIMD_INLINE svbool_t sve_ptrue_impl(index<4>) noexcept { return svptrue_b32(); }
-            XSIMD_INLINE svbool_t sve_ptrue_impl(index<8>) noexcept { return svptrue_b64(); }
-
-            template <class T>
-            svbool_t sve_ptrue() noexcept { return sve_ptrue_impl(index<sizeof(T)> {}); }
-
-            // predicate loading
-            template <bool M0, bool M1>
-            svbool_t sve_pmask() noexcept { return svdupq_b64(M0, M1); }
-            template <bool M0, bool M1, bool M2, bool M3>
-            svbool_t sve_pmask() noexcept { return svdupq_b32(M0, M1, M2, M3); }
-            template <bool M0, bool M1, bool M2, bool M3, bool M4, bool M5, bool M6, bool M7>
-            svbool_t sve_pmask() noexcept { return svdupq_b16(M0, M1, M2, M3, M4, M5, M6, M7); }
-            template <bool M0, bool M1, bool M2, bool M3, bool M4, bool M5, bool M6, bool M7,
-                      bool M8, bool M9, bool M10, bool M11, bool M12, bool M13, bool M14, bool M15>
-            svbool_t sve_pmask() noexcept { return svdupq_b8(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11, M12, M13, M14, M15); }
-
-            // count active lanes in a predicate
-            XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<1>) noexcept { return svcntp_b8(p, p); }
-            XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<2>) noexcept { return svcntp_b16(p, p); }
-            XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<4>) noexcept { return svcntp_b32(p, p); }
-            XSIMD_INLINE uint64_t sve_pcount_impl(svbool_t p, index<8>) noexcept { return svcntp_b64(p, p); }
-
-            template <class T>
-            XSIMD_INLINE uint64_t sve_pcount(svbool_t p) noexcept { return sve_pcount_impl(p, index<sizeof(T)> {}); }
-
-            // enable for signed integers
-            template <class T>
-            using sve_enable_signed_int_t = std::enable_if_t<std::is_integral<T>::value && std::is_signed<T>::value, int>;
-
-            // enable for unsigned integers
-            template <class T>
-            using sve_enable_unsigned_int_t = std::enable_if_t<std::is_integral<T>::value && !std::is_signed<T>::value, int>;
-
-            // enable for floating points
-            template <class T>
-            using sve_enable_floating_point_t = std::enable_if_t<std::is_floating_point<T>::value, int>;
-
-            // enable for signed integers or floating points
-            template <class T>
-            using sve_enable_signed_int_or_floating_point_t = std::enable_if_t<std::is_signed<T>::value, int>;
-
-            // enable for all SVE supported types
-            template <class T>
-            using sve_enable_all_t = std::enable_if_t<std::is_arithmetic<T>::value, int>;
-        } // namespace detail
+        namespace detail_sve
+        {
+            inline namespace XSIMD_SVE_NAMESPACE
+            {
+                using xsimd::index;
+                using xsimd::types::detail::sve_vector_type;
+
+                // predicate creation
+                XSIMD_INLINE svbool_t ptrue_impl(index<1>) noexcept { return svptrue_b8(); }
+                XSIMD_INLINE svbool_t ptrue_impl(index<2>) noexcept { return svptrue_b16(); }
+                XSIMD_INLINE svbool_t ptrue_impl(index<4>) noexcept { return svptrue_b32(); }
+                XSIMD_INLINE svbool_t ptrue_impl(index<8>) noexcept { return svptrue_b64(); }
+
+                template <class T>
+                XSIMD_INLINE svbool_t ptrue() noexcept { return ptrue_impl(index<sizeof(T)> {}); }
+
+                // predicate loading
+                template <bool M0, bool M1>
+                XSIMD_INLINE svbool_t pmask() noexcept { return svdupq_b64(M0, M1); }
+                template <bool M0, bool M1, bool M2, bool M3>
+                XSIMD_INLINE svbool_t pmask() noexcept { return svdupq_b32(M0, M1, M2, M3); }
+                template <bool M0, bool M1, bool M2, bool M3, bool M4, bool M5, bool M6, bool M7>
+                XSIMD_INLINE svbool_t pmask() noexcept { return svdupq_b16(M0, M1, M2, M3, M4, M5, M6, M7); }
+                template <bool M0, bool M1, bool M2, bool M3, bool M4, bool M5, bool M6, bool M7,
+                          bool M8, bool M9, bool M10, bool M11, bool M12, bool M13, bool M14, bool M15>
+                XSIMD_INLINE svbool_t pmask() noexcept { return svdupq_b8(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11, M12, M13, M14, M15); }
+
+                // count active lanes in a predicate
+                XSIMD_INLINE uint64_t pcount_impl(svbool_t p, index<1>) noexcept { return svcntp_b8(p, p); }
+                XSIMD_INLINE uint64_t pcount_impl(svbool_t p, index<2>) noexcept { return svcntp_b16(p, p); }
+                XSIMD_INLINE uint64_t pcount_impl(svbool_t p, index<4>) noexcept { return svcntp_b32(p, p); }
+                XSIMD_INLINE uint64_t pcount_impl(svbool_t p, index<8>) noexcept { return svcntp_b64(p, p); }
+
+                template <class T>
+                XSIMD_INLINE uint64_t pcount(svbool_t p) noexcept { return pcount_impl(p, index<sizeof(T)> {}); }
+
+                // enable for signed integers or floating points
+                template <class T>
+                using enable_signed_int_or_floating_point_t = std::enable_if_t<std::is_signed<T>::value, int>;
+
+                // `sizeless_t<T>` is the matching sizeless SVE type. xsimd stores SVE
+                // vectors as fixed-size attributed types (arm_sve_vector_bits),
+                // which clang treats as implicitly convertible to every sizeless
+                // SVE type — including multi-vector tuples — making the overloaded
+                // svreinterpret_*/svsel/etc. intrinsics ambiguous. Static-casting
+                // to `sizeless_t<T>` first collapses the overload set to the single
+                // 1-vector candidate.
+                template <class T>
+                using sizeless_t = xsimd::types::detail::sizeless_sve_vector_type<T>;
+            } // namespace XSIMD_SVE_NAMESPACE
+        } // namespace detail_sve
 
         /*********
          * Load *
          *********/
 
-        namespace detail
-        {
-            // "char" is not allowed in SVE load/store operations
-            using sve_fix_char_t_impl = std::conditional_t<std::is_signed<char>::value, int8_t, uint8_t>;
-
-            template <class T>
-            using sve_fix_char_t = std::conditional_t<std::is_same<char, std::decay_t<T>>::value,
-                                                      sve_fix_char_t_impl, T>;
-        }
-
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<sve>) noexcept
         {
-            return svld1(detail::sve_ptrue<T>(), reinterpret_cast<detail::sve_fix_char_t<T> const*>(src));
+            return svld1(detail_sve::ptrue<T>(), reinterpret_cast<map_to_sized_type_t<T> const*>(src));
         }
 
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<sve>) noexcept
         {
             return load_aligned<A>(src, convert<T>(), sve {});
         }
 
         // load_masked
-        template <class A, class T, bool... Values, class Mode, detail::sve_enable_all_t<T> = 0>
-        XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<float, A, Values...> mask, Mode, requires_arch<sve>) noexcept
+        template <class A, class T, bool... Values, class Mode, detail::enable_arithmetic_t<T> = 0>
+        XSIMD_INLINE batch<T, A> load_masked(T const* mem, batch_bool_constant<float, A, Values...>, Mode, requires_arch<sve>) noexcept
         {
-            return svld1(detail::sve_pmask<Values...>(), reinterpret_cast<detail::sve_fix_char_t<T> const*>(mem));
+            return svld1(detail_sve::pmask<Values...>(), reinterpret_cast<map_to_sized_type_t<T> const*>(mem));
         }
 
         // load_complex
-        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<std::complex<T>, A> load_complex_aligned(std::complex<T> const* mem, convert<std::complex<T>>, requires_arch<sve>) noexcept
         {
             const T* buf = reinterpret_cast<const T*>(mem);
-            const auto tmp = svld2(detail::sve_ptrue<T>(), buf);
+            const auto tmp = svld2(detail_sve::ptrue<T>(), buf);
             const auto real = svget2(tmp, 0);
             const auto imag = svget2(tmp, 1);
             return batch<std::complex<T>, A> { real, imag };
         }
 
-        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<std::complex<T>, A> load_complex_unaligned(std::complex<T> const* mem, convert<std::complex<T>>, requires_arch<sve>) noexcept
         {
             return load_complex_aligned<A>(mem, convert<std::complex<T>> {}, sve {});
@@ -134,20 +129,20 @@ namespace xsimd
          * Store *
          *********/
 
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<sve>) noexcept
         {
-            svst1(detail::sve_ptrue<T>(), reinterpret_cast<detail::sve_fix_char_t<T>*>(dst), src);
+            svst1(detail_sve::ptrue<T>(), reinterpret_cast<map_to_sized_type_t<T>*>(dst), src);
         }
 
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<sve>) noexcept
         {
             store_aligned<A>(dst, src, sve {});
         }
 
         // store_complex
-        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE void store_complex_aligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<sve>) noexcept
         {
             using v2type = std::conditional_t<(sizeof(T) == 4), svfloat32x2_t, svfloat64x2_t>;
@@ -155,10 +150,10 @@ namespace xsimd
             tmp = svset2(tmp, 0, src.real());
             tmp = svset2(tmp, 1, src.imag());
             T* buf = reinterpret_cast<T*>(dst);
-            svst2(detail::sve_ptrue<T>(), buf, tmp);
+            svst2(detail_sve::ptrue<T>(), buf, tmp);
         }
 
-        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE void store_complex_unaligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<sve>) noexcept
         {
             store_complex_aligned(dst, src, sve {});
@@ -168,24 +163,24 @@ namespace xsimd
          * scatter/gather *
          ******************/
 
-        namespace detail
+        namespace detail_sve
         {
             template <class T, class U>
-            using sve_enable_sg_t = std::enable_if_t<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>;
+            using enable_sg_t = std::enable_if_t<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>;
         }
 
         // scatter
-        template <class A, class T, class U, detail::sve_enable_sg_t<T, U> = 0>
+        template <class A, class T, class U, detail_sve::enable_sg_t<T, U> = 0>
         XSIMD_INLINE void scatter(batch<T, A> const& src, T* dst, batch<U, A> const& index, kernel::requires_arch<sve>) noexcept
         {
-            svst1_scatter_index(detail::sve_ptrue<T>(), dst, index.data, src.data);
+            svst1_scatter_index(detail_sve::ptrue<T>(), dst, index.data, src.data);
         }
 
         // gather
-        template <class A, class T, class U, detail::sve_enable_sg_t<T, U> = 0>
+        template <class A, class T, class U, detail_sve::enable_sg_t<T, U> = 0>
         XSIMD_INLINE batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index, kernel::requires_arch<sve>) noexcept
         {
-            return svld1_gather_index(detail::sve_ptrue<T>(), src, index.data);
+            return svld1_gather_index(detail_sve::ptrue<T>(), src, index.data);
         }
 
         /********************
@@ -253,7 +248,7 @@ namespace xsimd
             return svdup_n_f64(arg);
         }
 
-        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<sve>) noexcept
         {
             return broadcast<sve>(val, sve {});
@@ -264,10 +259,10 @@ namespace xsimd
          **************/
 
         // add
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svadd_x(detail::sve_ptrue<T>(), lhs, rhs);
+            return svadd_x(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         // sadd
@@ -278,10 +273,10 @@ namespace xsimd
         }
 
         // sub
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svsub_x(detail::sve_ptrue<T>(), lhs, rhs);
+            return svsub_x(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         // ssub
@@ -292,100 +287,107 @@ namespace xsimd
         }
 
         // mul
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svmul_x(detail::sve_ptrue<T>(), lhs, rhs);
+            return svmul_x(detail_sve::ptrue<T>(), lhs, rhs);
+        }
+
+        // mul_hi: high half of the widened lane-wise product (svmulh), integral lanes only
+        template <class A, class T, detail::enable_integral_t<T> = 0>
+        XSIMD_INLINE batch<T, A> mul_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+        {
+            return svmulh_x(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         // div
         template <class A, class T, std::enable_if_t<sizeof(T) >= 4, int> = 0>
         XSIMD_INLINE batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svdiv_x(detail::sve_ptrue<T>(), lhs, rhs);
+            return svdiv_x(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         // max
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svmax_x(detail::sve_ptrue<T>(), lhs, rhs);
+            return svmax_x(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         // min
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svmin_x(detail::sve_ptrue<T>(), lhs, rhs);
+            return svmin_x(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         // neg
         template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
         {
-            return svreinterpret_u8(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s8(arg)));
+            return svreinterpret_u8(svneg_x(detail_sve::ptrue<T>(), svreinterpret_s8(static_cast<detail_sve::sizeless_t<T>>(arg))));
         }
 
         template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
         {
-            return svreinterpret_u16(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s16(arg)));
+            return svreinterpret_u16(svneg_x(detail_sve::ptrue<T>(), svreinterpret_s16(static_cast<detail_sve::sizeless_t<T>>(arg))));
         }
 
         template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
         {
-            return svreinterpret_u32(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s32(arg)));
+            return svreinterpret_u32(svneg_x(detail_sve::ptrue<T>(), svreinterpret_s32(static_cast<detail_sve::sizeless_t<T>>(arg))));
         }
 
         template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
         {
-            return svreinterpret_u64(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s64(arg)));
+            return svreinterpret_u64(svneg_x(detail_sve::ptrue<T>(), svreinterpret_s64(static_cast<detail_sve::sizeless_t<T>>(arg))));
         }
 
-        template <class A, class T, detail::sve_enable_signed_int_or_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_signed_numeral_t<T> = 0>
         XSIMD_INLINE batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
         {
-            return svneg_x(detail::sve_ptrue<T>(), arg);
+            return svneg_x(detail_sve::ptrue<T>(), arg);
         }
 
         // abs
-        template <class A, class T, detail::sve_enable_unsigned_int_t<T> = 0>
+        template <class A, class T, detail::enable_unsigned_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<sve>) noexcept
         {
             return arg;
         }
 
-        template <class A, class T, detail::sve_enable_signed_int_or_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_signed_numeral_t<T> = 0>
         XSIMD_INLINE batch<T, A> abs(batch<T, A> const& arg, requires_arch<sve>) noexcept
         {
-            return svabs_x(detail::sve_ptrue<T>(), arg);
+            return svabs_x(detail_sve::ptrue<T>(), arg);
         }
 
         // fma: x * y + z
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
         {
-            return svmad_x(detail::sve_ptrue<T>(), x, y, z);
+            return svmad_x(detail_sve::ptrue<T>(), x, y, z);
         }
 
         // fnma: z - x * y
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
         {
-            return svmsb_x(detail::sve_ptrue<T>(), x, y, z);
+            return svmsb_x(detail_sve::ptrue<T>(), x, y, z);
         }
 
         // fms: x * y - z
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
         {
             return -fnma(x, y, z, sve {});
         }
 
         // fnms: - x * y - z
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
         {
             return -fma(x, y, z, sve {});
@@ -399,191 +401,194 @@ namespace xsimd
         template <class A, class T, detail::enable_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svand_x(detail::sve_ptrue<T>(), lhs, rhs);
+            return svand_x(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         template <class A>
         XSIMD_INLINE batch<float, A> bitwise_and(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
         {
-            const auto lhs_bits = svreinterpret_u32(lhs);
-            const auto rhs_bits = svreinterpret_u32(rhs);
-            const auto result_bits = svand_x(detail::sve_ptrue<float>(), lhs_bits, rhs_bits);
+            const auto lhs_bits = svreinterpret_u32(static_cast<detail_sve::sizeless_t<float>>(lhs));
+            const auto rhs_bits = svreinterpret_u32(static_cast<detail_sve::sizeless_t<float>>(rhs));
+            const auto result_bits = svand_x(detail_sve::ptrue<float>(), lhs_bits, rhs_bits);
             return svreinterpret_f32(result_bits);
         }
 
         template <class A>
         XSIMD_INLINE batch<double, A> bitwise_and(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
         {
-            const auto lhs_bits = svreinterpret_u64(lhs);
-            const auto rhs_bits = svreinterpret_u64(rhs);
-            const auto result_bits = svand_x(detail::sve_ptrue<double>(), lhs_bits, rhs_bits);
+            const auto lhs_bits = svreinterpret_u64(static_cast<detail_sve::sizeless_t<double>>(lhs));
+            const auto rhs_bits = svreinterpret_u64(static_cast<detail_sve::sizeless_t<double>>(rhs));
+            const auto result_bits = svand_x(detail_sve::ptrue<double>(), lhs_bits, rhs_bits);
             return svreinterpret_f64(result_bits);
         }
 
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svand_z(detail::sve_ptrue<T>(), lhs, rhs);
+            return svand_z(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         // bitwise_andnot
         template <class A, class T, detail::enable_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svbic_x(detail::sve_ptrue<T>(), lhs, rhs);
+            return svbic_x(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         template <class A>
         XSIMD_INLINE batch<float, A> bitwise_andnot(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
         {
-            const auto lhs_bits = svreinterpret_u32(lhs);
-            const auto rhs_bits = svreinterpret_u32(rhs);
-            const auto result_bits = svbic_x(detail::sve_ptrue<float>(), lhs_bits, rhs_bits);
+            const auto lhs_bits = svreinterpret_u32(static_cast<detail_sve::sizeless_t<float>>(lhs));
+            const auto rhs_bits = svreinterpret_u32(static_cast<detail_sve::sizeless_t<float>>(rhs));
+            const auto result_bits = svbic_x(detail_sve::ptrue<float>(), lhs_bits, rhs_bits);
             return svreinterpret_f32(result_bits);
         }
 
         template <class A>
         XSIMD_INLINE batch<double, A> bitwise_andnot(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
         {
-            const auto lhs_bits = svreinterpret_u64(lhs);
-            const auto rhs_bits = svreinterpret_u64(rhs);
-            const auto result_bits = svbic_x(detail::sve_ptrue<double>(), lhs_bits, rhs_bits);
+            const auto lhs_bits = svreinterpret_u64(static_cast<detail_sve::sizeless_t<double>>(lhs));
+            const auto rhs_bits = svreinterpret_u64(static_cast<detail_sve::sizeless_t<double>>(rhs));
+            const auto result_bits = svbic_x(detail_sve::ptrue<double>(), lhs_bits, rhs_bits);
             return svreinterpret_f64(result_bits);
         }
 
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svbic_z(detail::sve_ptrue<T>(), lhs, rhs);
+            return svbic_z(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         // bitwise_or
         template <class A, class T, detail::enable_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svorr_x(detail::sve_ptrue<T>(), lhs, rhs);
+            return svorr_x(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         template <class A>
         XSIMD_INLINE batch<float, A> bitwise_or(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
         {
-            const auto lhs_bits = svreinterpret_u32(lhs);
-            const auto rhs_bits = svreinterpret_u32(rhs);
-            const auto result_bits = svorr_x(detail::sve_ptrue<float>(), lhs_bits, rhs_bits);
+            const auto lhs_bits = svreinterpret_u32(static_cast<detail_sve::sizeless_t<float>>(lhs));
+            const auto rhs_bits = svreinterpret_u32(static_cast<detail_sve::sizeless_t<float>>(rhs));
+            const auto result_bits = svorr_x(detail_sve::ptrue<float>(), lhs_bits, rhs_bits);
             return svreinterpret_f32(result_bits);
         }
 
         template <class A>
         XSIMD_INLINE batch<double, A> bitwise_or(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
         {
-            const auto lhs_bits = svreinterpret_u64(lhs);
-            const auto rhs_bits = svreinterpret_u64(rhs);
-            const auto result_bits = svorr_x(detail::sve_ptrue<double>(), lhs_bits, rhs_bits);
+            const auto lhs_bits = svreinterpret_u64(static_cast<detail_sve::sizeless_t<double>>(lhs));
+            const auto rhs_bits = svreinterpret_u64(static_cast<detail_sve::sizeless_t<double>>(rhs));
+            const auto result_bits = svorr_x(detail_sve::ptrue<double>(), lhs_bits, rhs_bits);
             return svreinterpret_f64(result_bits);
         }
 
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svorr_z(detail::sve_ptrue<T>(), lhs, rhs);
+            return svorr_z(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         // bitwise_xor
         template <class A, class T, detail::enable_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return sveor_x(detail::sve_ptrue<T>(), lhs, rhs);
+            return sveor_x(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         template <class A>
         XSIMD_INLINE batch<float, A> bitwise_xor(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
         {
-            const auto lhs_bits = svreinterpret_u32(lhs);
-            const auto rhs_bits = svreinterpret_u32(rhs);
-            const auto result_bits = sveor_x(detail::sve_ptrue<float>(), lhs_bits, rhs_bits);
+            const auto lhs_bits = svreinterpret_u32(static_cast<detail_sve::sizeless_t<float>>(lhs));
+            const auto rhs_bits = svreinterpret_u32(static_cast<detail_sve::sizeless_t<float>>(rhs));
+            const auto result_bits = sveor_x(detail_sve::ptrue<float>(), lhs_bits, rhs_bits);
             return svreinterpret_f32(result_bits);
         }
 
         template <class A>
         XSIMD_INLINE batch<double, A> bitwise_xor(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
         {
-            const auto lhs_bits = svreinterpret_u64(lhs);
-            const auto rhs_bits = svreinterpret_u64(rhs);
-            const auto result_bits = sveor_x(detail::sve_ptrue<double>(), lhs_bits, rhs_bits);
+            const auto lhs_bits = svreinterpret_u64(static_cast<detail_sve::sizeless_t<double>>(lhs));
+            const auto rhs_bits = svreinterpret_u64(static_cast<detail_sve::sizeless_t<double>>(rhs));
+            const auto result_bits = sveor_x(detail_sve::ptrue<double>(), lhs_bits, rhs_bits);
             return svreinterpret_f64(result_bits);
         }
 
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return sveor_z(detail::sve_ptrue<T>(), lhs, rhs);
+            return sveor_z(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         // bitwise_not
         template <class A, class T, detail::enable_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<sve>) noexcept
         {
-            return svnot_x(detail::sve_ptrue<T>(), arg);
+            return svnot_x(detail_sve::ptrue<T>(), arg);
         }
 
         template <class A>
         XSIMD_INLINE batch<float, A> bitwise_not(batch<float, A> const& arg, requires_arch<sve>) noexcept
         {
-            const auto arg_bits = svreinterpret_u32(arg);
-            const auto result_bits = svnot_x(detail::sve_ptrue<float>(), arg_bits);
+            const auto arg_bits = svreinterpret_u32(static_cast<detail_sve::sizeless_t<float>>(arg));
+            const auto result_bits = svnot_x(detail_sve::ptrue<float>(), arg_bits);
             return svreinterpret_f32(result_bits);
         }
 
         template <class A>
         XSIMD_INLINE batch<double, A> bitwise_not(batch<double, A> const& arg, requires_arch<sve>) noexcept
         {
-            const auto arg_bits = svreinterpret_u64(arg);
-            const auto result_bits = svnot_x(detail::sve_ptrue<double>(), arg_bits);
+            const auto arg_bits = svreinterpret_u64(static_cast<detail_sve::sizeless_t<double>>(arg));
+            const auto result_bits = svnot_x(detail_sve::ptrue<double>(), arg_bits);
             return svreinterpret_f64(result_bits);
         }
 
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
         {
-            return svnot_z(detail::sve_ptrue<T>(), arg);
+            return svnot_z(detail_sve::ptrue<T>(), arg);
         }
 
         /**********
          * Shifts *
          **********/
 
-        namespace detail
+        namespace detail_sve
         {
-            template <class A, class T, class U>
-            XSIMD_INLINE batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<1>) noexcept
+            inline namespace XSIMD_SVE_NAMESPACE
             {
-                return svreinterpret_u8(arg);
-            }
+                template <class A, class T, class U>
+                XSIMD_INLINE batch<U, A> to_unsigned_batch_impl(batch<T, A> const& arg, index<1>) noexcept
+                {
+                    return svreinterpret_u8(static_cast<sizeless_t<T>>(arg));
+                }
 
-            template <class A, class T, class U>
-            XSIMD_INLINE batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<2>) noexcept
-            {
-                return svreinterpret_u16(arg);
-            }
+                template <class A, class T, class U>
+                XSIMD_INLINE batch<U, A> to_unsigned_batch_impl(batch<T, A> const& arg, index<2>) noexcept
+                {
+                    return svreinterpret_u16(static_cast<sizeless_t<T>>(arg));
+                }
 
-            template <class A, class T, class U>
-            XSIMD_INLINE batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<4>) noexcept
-            {
-                return svreinterpret_u32(arg);
-            }
+                template <class A, class T, class U>
+                XSIMD_INLINE batch<U, A> to_unsigned_batch_impl(batch<T, A> const& arg, index<4>) noexcept
+                {
+                    return svreinterpret_u32(static_cast<sizeless_t<T>>(arg));
+                }
 
-            template <class A, class T, class U>
-            XSIMD_INLINE batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<8>) noexcept
-            {
-                return svreinterpret_u64(arg);
-            }
+                template <class A, class T, class U>
+                XSIMD_INLINE batch<U, A> to_unsigned_batch_impl(batch<T, A> const& arg, index<8>) noexcept
+                {
+                    return svreinterpret_u64(static_cast<sizeless_t<T>>(arg));
+                }
 
-            template <class A, class T, class U = as_unsigned_integer_t<T>>
-            XSIMD_INLINE batch<U, A> sve_to_unsigned_batch(batch<T, A> const& arg) noexcept
-            {
-                return sve_to_unsigned_batch_impl<A, T, U>(arg, index<sizeof(T)> {});
-            }
-        } // namespace detail
+                template <class A, class T, class U = as_unsigned_integer_t<T>>
+                XSIMD_INLINE batch<U, A> to_unsigned_batch(batch<T, A> const& arg) noexcept
+                {
+                    return to_unsigned_batch_impl<A, T, U>(arg, index<sizeof(T)> {});
+                }
+            } // namespace XSIMD_SVE_NAMESPACE
+        } // namespace detail_sve
 
         // bitwise_lshift
         template <class A, class T, detail::enable_integral_t<T> = 0>
@@ -591,42 +596,42 @@ namespace xsimd
         {
             constexpr std::size_t size = sizeof(typename batch<T, A>::value_type) * 8;
             assert(0 <= n && static_cast<std::size_t>(n) < size && "index in bounds");
-            return svlsl_x(detail::sve_ptrue<T>(), arg, n);
+            return svlsl_x(detail_sve::ptrue<T>(), arg, n);
         }
 
         template <class A, class T, detail::enable_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svlsl_x(detail::sve_ptrue<T>(), lhs, detail::sve_to_unsigned_batch<A, T>(rhs));
+            return svlsl_x(detail_sve::ptrue<T>(), lhs, detail_sve::to_unsigned_batch<A, T>(rhs));
         }
 
         // bitwise_rshift
-        template <class A, class T, detail::sve_enable_unsigned_int_t<T> = 0>
+        template <class A, class T, detail::enable_unsigned_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& arg, int n, requires_arch<sve>) noexcept
         {
             constexpr std::size_t size = sizeof(typename batch<T, A>::value_type) * 8;
             assert(0 <= n && static_cast<std::size_t>(n) < size && "index in bounds");
-            return svlsr_x(detail::sve_ptrue<T>(), arg, static_cast<T>(n));
+            return svlsr_x(detail_sve::ptrue<T>(), arg, static_cast<T>(n));
         }
 
-        template <class A, class T, detail::sve_enable_unsigned_int_t<T> = 0>
+        template <class A, class T, detail::enable_unsigned_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svlsr_x(detail::sve_ptrue<T>(), lhs, rhs);
+            return svlsr_x(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
-        template <class A, class T, detail::sve_enable_signed_int_t<T> = 0>
+        template <class A, class T, detail::enable_signed_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& arg, int n, requires_arch<sve>) noexcept
         {
             constexpr std::size_t size = sizeof(typename batch<T, A>::value_type) * 8;
             assert(0 <= n && static_cast<std::size_t>(n) < size && "index in bounds");
-            return svasr_x(detail::sve_ptrue<T>(), arg, static_cast<as_unsigned_integer_t<T>>(n));
+            return svasr_x(detail_sve::ptrue<T>(), arg, static_cast<as_unsigned_integer_t<T>>(n));
         }
 
-        template <class A, class T, detail::sve_enable_signed_int_t<T> = 0>
+        template <class A, class T, detail::enable_signed_integral_t<T> = 0>
         XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svasr_x(detail::sve_ptrue<T>(), lhs, detail::sve_to_unsigned_batch<A, T>(rhs));
+            return svasr_x(detail_sve::ptrue<T>(), lhs, detail_sve::to_unsigned_batch<A, T>(rhs));
         }
 
         /**************
@@ -634,29 +639,29 @@ namespace xsimd
          **************/
 
         // reduce_add
-        template <class A, class T, class V = typename batch<T, A>::value_type, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, class V = typename batch<T, A>::value_type, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE V reduce_add(batch<T, A> const& arg, requires_arch<sve>) noexcept
         {
             // sve integer reduction results are promoted to 64 bits
-            return static_cast<V>(svaddv(detail::sve_ptrue<T>(), arg));
+            return static_cast<V>(svaddv(detail_sve::ptrue<T>(), arg));
         }
 
         // reduce_max
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE T reduce_max(batch<T, A> const& arg, requires_arch<sve>) noexcept
         {
-            return svmaxv(detail::sve_ptrue<T>(), arg);
+            return svmaxv(detail_sve::ptrue<T>(), arg);
         }
 
         // reduce_min
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE T reduce_min(batch<T, A> const& arg, requires_arch<sve>) noexcept
         {
-            return svminv(detail::sve_ptrue<T>(), arg);
+            return svminv(detail_sve::ptrue<T>(), arg);
         }
 
         // haddp
-        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> haddp(const batch<T, A>* row, requires_arch<sve>) noexcept
         {
             constexpr std::size_t size = batch<T, A>::size;
@@ -665,7 +670,7 @@ namespace xsimd
             {
                 sums[i] = reduce_add(row[i], sve {});
             }
-            return svld1(detail::sve_ptrue<T>(), sums);
+            return svld1(detail_sve::ptrue<T>(), sums);
         }
 
         /***************
@@ -673,58 +678,58 @@ namespace xsimd
          ***************/
 
         // eq
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svcmpeq(detail::sve_ptrue<T>(), lhs, rhs);
+            return svcmpeq(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            const auto neq_result = sveor_z(detail::sve_ptrue<T>(), lhs, rhs);
-            return svnot_z(detail::sve_ptrue<T>(), neq_result);
+            const auto neq_result = sveor_z(detail_sve::ptrue<T>(), lhs, rhs);
+            return svnot_z(detail_sve::ptrue<T>(), neq_result);
         }
 
         // neq
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svcmpne(detail::sve_ptrue<T>(), lhs, rhs);
+            return svcmpne(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return sveor_z(detail::sve_ptrue<T>(), lhs, rhs);
+            return sveor_z(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         // lt
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svcmplt(detail::sve_ptrue<T>(), lhs, rhs);
+            return svcmplt(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         // le
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svcmple(detail::sve_ptrue<T>(), lhs, rhs);
+            return svcmple(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         // gt
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svcmpgt(detail::sve_ptrue<T>(), lhs, rhs);
+            return svcmpgt(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         // ge
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
-            return svcmpge(detail::sve_ptrue<T>(), lhs, rhs);
+            return svcmpge(detail_sve::ptrue<T>(), lhs, rhs);
         }
 
         /***************
@@ -732,7 +737,7 @@ namespace xsimd
          ***************/
 
         //  rotate_left
-        template <size_t N, class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <size_t N, class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> rotate_left(batch<T, A> const& a, requires_arch<sve>) noexcept
         {
             return svext(a, a, N);
@@ -777,55 +782,58 @@ namespace xsimd
          *************/
 
         // extract_pair
-        namespace detail
+        namespace detail_sve
         {
-            template <class A, class T>
-            XSIMD_INLINE batch<T, A> sve_extract_pair(batch<T, A> const&, batch<T, A> const& /*rhs*/, std::size_t, std::index_sequence<>) noexcept
+            inline namespace XSIMD_SVE_NAMESPACE
             {
-                assert(false && "extract_pair out of bounds");
-                return batch<T, A> {};
-            }
-
-            template <class A, class T, size_t I, size_t... Is>
-            XSIMD_INLINE batch<T, A> sve_extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, std::index_sequence<I, Is...>) noexcept
-            {
-                if (n == I)
-                {
-                    return svext(rhs, lhs, I);
-                }
-                else
+                template <class A, class T>
+                XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& /*rhs*/, std::size_t, std::index_sequence<>) noexcept
                 {
-                    return sve_extract_pair(lhs, rhs, n, std::index_sequence<Is...>());
+                    assert(false && "extract_pair out of bounds");
+                    return batch<T, A> {};
                 }
-            }
 
-            template <class A, class T, size_t... Is>
-            XSIMD_INLINE batch<T, A> sve_extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, std::index_sequence<0, Is...>) noexcept
-            {
-                if (n == 0)
+                template <class A, class T, size_t I, size_t... Is>
+                XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, std::index_sequence<I, Is...>) noexcept
                 {
-                    return rhs;
+                    if (n == I)
+                    {
+                        return svext(rhs, lhs, I);
+                    }
+                    else
+                    {
+                        return extract_pair(lhs, rhs, n, std::index_sequence<Is...>());
+                    }
                 }
-                else
+
+                template <class A, class T, size_t... Is>
+                XSIMD_INLINE batch<T, A> extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, std::index_sequence<0, Is...>) noexcept
                 {
-                    return sve_extract_pair(lhs, rhs, n, std::index_sequence<Is...>());
+                    if (n == 0)
+                    {
+                        return rhs;
+                    }
+                    else
+                    {
+                        return extract_pair(lhs, rhs, n, std::index_sequence<Is...>());
+                    }
                 }
-            }
-        }
+            } // namespace XSIMD_SVE_NAMESPACE
+        } // namespace detail_sve
 
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, requires_arch<sve>) noexcept
         {
             constexpr std::size_t size = batch<T, A>::size;
             assert(n < size && "index in bounds");
-            return detail::sve_extract_pair_impl(lhs, rhs, n, std::make_index_sequence<size>());
+            return detail_sve::extract_pair_impl(lhs, rhs, n, std::make_index_sequence<size>());
         }
 
         // select
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<sve>) noexcept
         {
-            return svsel(cond, a, b);
+            return svsel(cond, static_cast<detail_sve::sizeless_t<T>>(a), static_cast<detail_sve::sizeless_t<T>>(b));
         }
 
         template <class A, class T, bool... b>
@@ -835,14 +843,14 @@ namespace xsimd
         }
 
         // zip_lo
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
             return svzip1(lhs, rhs);
         }
 
         // zip_hi
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
         {
             return svzip2(lhs, rhs);
@@ -853,21 +861,21 @@ namespace xsimd
          *****************************/
 
         // rsqrt
-        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> rsqrt(batch<T, A> const& arg, requires_arch<sve>) noexcept
         {
             return svrsqrte(arg);
         }
 
         // sqrt
-        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> sqrt(batch<T, A> const& arg, requires_arch<sve>) noexcept
         {
-            return svsqrt_x(detail::sve_ptrue<T>(), arg);
+            return svsqrt_x(detail_sve::ptrue<T>(), arg);
         }
 
         // reciprocal
-        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> reciprocal(const batch<T, A>& arg, requires_arch<sve>) noexcept
         {
             return svrecpe(arg);
@@ -878,44 +886,47 @@ namespace xsimd
          ******************************/
 
         // fast_cast
-        namespace detail
+        namespace detail_sve
         {
-            template <class A, class T, detail::enable_sized_integral_t<T, 4> = 0>
-            XSIMD_INLINE batch<float, A> fast_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
+            inline namespace XSIMD_SVE_NAMESPACE
             {
-                return svcvt_f32_x(detail::sve_ptrue<T>(), arg);
-            }
+                template <class A, class T, detail::enable_sized_integral_t<T, 4> = 0>
+                XSIMD_INLINE batch<float, A> fast_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
+                {
+                    return svcvt_f32_x(detail_sve::ptrue<T>(), arg);
+                }
 
-            template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
-            XSIMD_INLINE batch<double, A> fast_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
-            {
-                return svcvt_f64_x(detail::sve_ptrue<T>(), arg);
-            }
+                template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+                XSIMD_INLINE batch<double, A> fast_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
+                {
+                    return svcvt_f64_x(detail_sve::ptrue<T>(), arg);
+                }
 
-            template <class A>
-            XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& arg, batch<int32_t, A> const&, requires_arch<sve>) noexcept
-            {
-                return svcvt_s32_x(detail::sve_ptrue<float>(), arg);
-            }
+                template <class A>
+                XSIMD_INLINE batch<int32_t, A> fast_cast(batch<float, A> const& arg, batch<int32_t, A> const&, requires_arch<sve>) noexcept
+                {
+                    return svcvt_s32_x(detail_sve::ptrue<float>(), arg);
+                }
 
-            template <class A>
-            XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<float, A> const& arg, batch<uint32_t, A> const&, requires_arch<sve>) noexcept
-            {
-                return svcvt_u32_x(detail::sve_ptrue<float>(), arg);
-            }
+                template <class A>
+                XSIMD_INLINE batch<uint32_t, A> fast_cast(batch<float, A> const& arg, batch<uint32_t, A> const&, requires_arch<sve>) noexcept
+                {
+                    return svcvt_u32_x(detail_sve::ptrue<float>(), arg);
+                }
 
-            template <class A>
-            XSIMD_INLINE batch<int64_t, A> fast_cast(batch<double, A> const& arg, batch<int64_t, A> const&, requires_arch<sve>) noexcept
-            {
-                return svcvt_s64_x(detail::sve_ptrue<double>(), arg);
-            }
+                template <class A>
+                XSIMD_INLINE batch<int64_t, A> fast_cast(batch<double, A> const& arg, batch<int64_t, A> const&, requires_arch<sve>) noexcept
+                {
+                    return svcvt_s64_x(detail_sve::ptrue<double>(), arg);
+                }
 
-            template <class A>
-            XSIMD_INLINE batch<uint64_t, A> fast_cast(batch<double, A> const& arg, batch<uint64_t, A> const&, requires_arch<sve>) noexcept
-            {
-                return svcvt_u64_x(detail::sve_ptrue<double>(), arg);
-            }
-        }
+                template <class A>
+                XSIMD_INLINE batch<uint64_t, A> fast_cast(batch<double, A> const& arg, batch<uint64_t, A> const&, requires_arch<sve>) noexcept
+                {
+                    return svcvt_u64_x(detail_sve::ptrue<double>(), arg);
+                }
+            } // namespace XSIMD_SVE_NAMESPACE
+        } // namespace detail_sve
 
         /*********
          * Miscs *
@@ -925,246 +936,255 @@ namespace xsimd
         template <class A, class T, class... Args>
         XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<sve>, Args... args) noexcept
         {
-            return detail::sve_vector_type<T> { args... };
+            return detail_sve::sve_vector_type<T> { args... };
         }
 
         template <class A, class T, class... Args>
         XSIMD_INLINE batch<std::complex<T>, A> set(batch<std::complex<T>, A> const&, requires_arch<sve>,
                                                    Args... args_complex) noexcept
         {
-            return batch<std::complex<T>>(detail::sve_vector_type<T> { args_complex.real()... },
-                                          detail::sve_vector_type<T> { args_complex.imag()... });
+            return batch<std::complex<T>>(detail_sve::sve_vector_type<T> { args_complex.real()... },
+                                          detail_sve::sve_vector_type<T> { args_complex.imag()... });
         }
 
         template <class A, class T, class... Args>
         XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<sve>, Args... args) noexcept
         {
             using U = as_unsigned_integer_t<T>;
-            const auto values = detail::sve_vector_type<U> { static_cast<U>(args)... };
+            const auto values = detail_sve::sve_vector_type<U> { static_cast<U>(args)... };
             const auto zero = broadcast<A, U>(static_cast<U>(0), sve {});
-            return svcmpne(detail::sve_ptrue<T>(), values, zero);
+            return svcmpne(detail_sve::ptrue<T>(), values, zero);
         }
 
         // insert
-        namespace detail
+        namespace detail_sve
         {
-            // generate index sequence (iota)
-            XSIMD_INLINE svuint8_t sve_iota_impl(index<1>) noexcept { return svindex_u8(0, 1); }
-            XSIMD_INLINE svuint16_t sve_iota_impl(index<2>) noexcept { return svindex_u16(0, 1); }
-            XSIMD_INLINE svuint32_t sve_iota_impl(index<4>) noexcept { return svindex_u32(0, 1); }
-            XSIMD_INLINE svuint64_t sve_iota_impl(index<8>) noexcept { return svindex_u64(0, 1); }
-
-            template <class T, class V = sve_vector_type<as_unsigned_integer_t<T>>>
-            XSIMD_INLINE V sve_iota() noexcept { return sve_iota_impl(index<sizeof(T)> {}); }
-        } // namespace detail
-
-        template <class A, class T, size_t I, detail::sve_enable_all_t<T> = 0>
+            inline namespace XSIMD_SVE_NAMESPACE
+            {
+                // generate index sequence (iota)
+                XSIMD_INLINE svuint8_t iota_impl(index<1>) noexcept { return svindex_u8(0, 1); }
+                XSIMD_INLINE svuint16_t iota_impl(index<2>) noexcept { return svindex_u16(0, 1); }
+                XSIMD_INLINE svuint32_t iota_impl(index<4>) noexcept { return svindex_u32(0, 1); }
+                XSIMD_INLINE svuint64_t iota_impl(index<8>) noexcept { return svindex_u64(0, 1); }
+
+                template <class T, class V = sve_vector_type<as_unsigned_integer_t<T>>>
+                XSIMD_INLINE V iota() noexcept { return iota_impl(index<sizeof(T)> {}); }
+            } // namespace XSIMD_SVE_NAMESPACE
+        } // namespace detail_sve
+
+        template <class A, class T, size_t I, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> insert(batch<T, A> const& arg, T val, index<I>, requires_arch<sve>) noexcept
         {
             // create a predicate with only the I-th lane activated
-            const auto iota = detail::sve_iota<T>();
-            const auto index_predicate = svcmpeq(detail::sve_ptrue<T>(), iota, static_cast<as_unsigned_integer_t<T>>(I));
-            return svsel(index_predicate, broadcast<A, T>(val, sve {}), arg);
+            const auto iota = detail_sve::iota<T>();
+            const auto index_predicate = svcmpeq(detail_sve::ptrue<T>(), iota, static_cast<as_unsigned_integer_t<T>>(I));
+            return svsel(index_predicate, static_cast<detail_sve::sizeless_t<T>>(broadcast<A, T>(val, sve {})), static_cast<detail_sve::sizeless_t<T>>(arg));
         }
 
         // first
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<sve>) noexcept
         {
             return self.data[0];
         }
 
         // all
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE bool all(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
         {
-            return detail::sve_pcount<T>(arg) == batch_bool<T, A>::size;
+            return detail_sve::pcount<T>(arg) == batch_bool<T, A>::size;
         }
 
         // any
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE bool any(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
         {
             return svptest_any(arg, arg);
         }
 
         // bitwise_cast
-        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 1> = 0>
+        template <class A, class T, class R, detail::enable_arithmetic_t<T> = 0, detail::enable_sized_unsigned_t<R, 1> = 0>
         XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
         {
-            return svreinterpret_u8(arg);
+            return svreinterpret_u8(static_cast<detail_sve::sizeless_t<T>>(arg));
         }
 
-        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 1> = 0>
+        template <class A, class T, class R, detail::enable_arithmetic_t<T> = 0, detail::enable_sized_signed_t<R, 1> = 0>
         XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
         {
-            return svreinterpret_s8(arg);
+            return svreinterpret_s8(static_cast<detail_sve::sizeless_t<T>>(arg));
         }
 
-        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 2> = 0>
+        template <class A, class T, class R, detail::enable_arithmetic_t<T> = 0, detail::enable_sized_unsigned_t<R, 2> = 0>
         XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
         {
-            return svreinterpret_u16(arg);
+            return svreinterpret_u16(static_cast<detail_sve::sizeless_t<T>>(arg));
         }
 
-        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 2> = 0>
+        template <class A, class T, class R, detail::enable_arithmetic_t<T> = 0, detail::enable_sized_signed_t<R, 2> = 0>
         XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
         {
-            return svreinterpret_s16(arg);
+            return svreinterpret_s16(static_cast<detail_sve::sizeless_t<T>>(arg));
         }
 
-        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 4> = 0>
+        template <class A, class T, class R, detail::enable_arithmetic_t<T> = 0, detail::enable_sized_unsigned_t<R, 4> = 0>
         XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
         {
-            return svreinterpret_u32(arg);
+            return svreinterpret_u32(static_cast<detail_sve::sizeless_t<T>>(arg));
         }
 
-        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 4> = 0>
+        template <class A, class T, class R, detail::enable_arithmetic_t<T> = 0, detail::enable_sized_signed_t<R, 4> = 0>
         XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
         {
-            return svreinterpret_s32(arg);
+            return svreinterpret_s32(static_cast<detail_sve::sizeless_t<T>>(arg));
         }
 
-        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 8> = 0>
+        template <class A, class T, class R, detail::enable_arithmetic_t<T> = 0, detail::enable_sized_unsigned_t<R, 8> = 0>
         XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
         {
-            return svreinterpret_u64(arg);
+            return svreinterpret_u64(static_cast<detail_sve::sizeless_t<T>>(arg));
         }
 
-        template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 8> = 0>
+        template <class A, class T, class R, detail::enable_arithmetic_t<T> = 0, detail::enable_sized_signed_t<R, 8> = 0>
         XSIMD_INLINE batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
         {
-            return svreinterpret_s64(arg);
+            return svreinterpret_s64(static_cast<detail_sve::sizeless_t<T>>(arg));
         }
 
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<float, A> bitwise_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
         {
-            return svreinterpret_f32(arg);
+            return svreinterpret_f32(static_cast<detail_sve::sizeless_t<T>>(arg));
         }
 
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
         {
-            return svreinterpret_f64(arg);
+            return svreinterpret_f64(static_cast<detail_sve::sizeless_t<T>>(arg));
         }
 
         // batch_bool_cast
-        template <class A, class T_out, class T_in, detail::sve_enable_all_t<T_in> = 0>
+        template <class A, class T_out, class T_in, detail::enable_arithmetic_t<T_in> = 0>
         XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& arg, batch_bool<T_out, A> const&, requires_arch<sve>) noexcept
         {
             return arg.data;
         }
 
         // from_bool
-        template <class A, class T, detail::sve_enable_all_t<T> = 0>
+        template <class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
         {
             return select(arg, batch<T, A>(1), batch<T, A>(0));
         }
 
         // slide_left
-        namespace detail
+        namespace detail_sve
         {
-            template <size_t N>
-            struct sve_slider_left
+            inline namespace XSIMD_SVE_NAMESPACE
             {
-                template <class A, class T>
-                XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
+                template <size_t N>
+                struct slider_left
                 {
-                    using u8_vector = batch<uint8_t, A>;
-                    const auto left = svdup_n_u8(0);
-                    const auto right = bitwise_cast(arg, u8_vector {}, sve {}).data;
-                    const u8_vector result(svext(left, right, u8_vector::size - N));
-                    return bitwise_cast(result, batch<T, A> {}, sve {});
-                }
-            };
-
-            template <>
-            struct sve_slider_left<0>
-            {
-                template <class A, class T>
-                XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
+                    template <class A, class T>
+                    XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
+                    {
+                        using u8_vector = batch<uint8_t, A>;
+                        const auto left = svdup_n_u8(0);
+                        const auto right = bitwise_cast(arg, u8_vector {}, sve {}).data;
+                        const u8_vector result(svext(left, right, u8_vector::size - N));
+                        return bitwise_cast(result, batch<T, A> {}, sve {});
+                    }
+                };
+
+                template <>
+                struct slider_left<0>
                 {
-                    return arg;
-                }
-            };
-        } // namespace detail
-
-        template <size_t N, class A, class T, detail::sve_enable_all_t<T> = 0>
+                    template <class A, class T>
+                    XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
+                    {
+                        return arg;
+                    }
+                };
+            } // namespace XSIMD_SVE_NAMESPACE
+        } // namespace detail_sve
+
+        template <size_t N, class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& arg, requires_arch<sve>) noexcept
         {
-            return detail::sve_slider_left<N>()(arg);
+            return detail_sve::slider_left<N>()(arg);
         }
 
         // slide_right
-        namespace detail
+        namespace detail_sve
         {
-            template <size_t N>
-            struct sve_slider_right
+            inline namespace XSIMD_SVE_NAMESPACE
             {
-                template <class A, class T>
-                XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
+                template <size_t N>
+                struct slider_right
                 {
-                    using u8_vector = batch<uint8_t, A>;
-                    const auto left = bitwise_cast(arg, u8_vector {}, sve {}).data;
-                    const auto right = svdup_n_u8(0);
-                    const u8_vector result(svext(left, right, N));
-                    return bitwise_cast(result, batch<T, A> {}, sve {});
-                }
-            };
-
-            template <>
-            struct sve_slider_right<batch<uint8_t, sve>::size>
-            {
-                template <class A, class T>
-                XSIMD_INLINE batch<T, A> operator()(batch<T, A> const&) noexcept
+                    template <class A, class T>
+                    XSIMD_INLINE batch<T, A> operator()(batch<T, A> const& arg) noexcept
+                    {
+                        using u8_vector = batch<uint8_t, A>;
+                        const auto left = bitwise_cast(arg, u8_vector {}, sve {}).data;
+                        const auto right = svdup_n_u8(0);
+                        const u8_vector result(svext(left, right, N));
+                        return bitwise_cast(result, batch<T, A> {}, sve {});
+                    }
+                };
+
+                template <>
+                struct slider_right<batch<uint8_t, sve>::size>
                 {
-                    return batch<T, A> {};
-                }
-            };
-        } // namespace detail
-
-        template <size_t N, class A, class T, detail::sve_enable_all_t<T> = 0>
+                    template <class A, class T>
+                    XSIMD_INLINE batch<T, A> operator()(batch<T, A> const&) noexcept
+                    {
+                        return batch<T, A> {};
+                    }
+                };
+            } // namespace XSIMD_SVE_NAMESPACE
+        } // namespace detail_sve
+
+        template <size_t N, class A, class T, detail::enable_arithmetic_t<T> = 0>
         XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& arg, requires_arch<sve>) noexcept
         {
-            return detail::sve_slider_right<N>()(arg);
+            return detail_sve::slider_right<N>()(arg);
         }
 
         // isnan
-        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch_bool<T, A> isnan(batch<T, A> const& arg, requires_arch<sve>) noexcept
         {
             return !(arg == arg);
         }
 
         // nearbyint
-        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> nearbyint(batch<T, A> const& arg, requires_arch<sve>) noexcept
         {
-            return svrintx_x(detail::sve_ptrue<T>(), arg);
+            return svrintx_x(detail_sve::ptrue<T>(), arg);
         }
 
         // nearbyint_as_int
         template <class A>
         XSIMD_INLINE batch<int32_t, A> nearbyint_as_int(batch<float, A> const& arg, requires_arch<sve>) noexcept
         {
-            const auto nearest = svrintx_x(detail::sve_ptrue<float>(), arg);
-            return svcvt_s32_x(detail::sve_ptrue<float>(), nearest);
+            const auto nearest = svrintx_x(detail_sve::ptrue<float>(), arg);
+            return svcvt_s32_x(detail_sve::ptrue<float>(), nearest);
         }
 
         template <class A>
         XSIMD_INLINE batch<int64_t, A> nearbyint_as_int(batch<double, A> const& arg, requires_arch<sve>) noexcept
         {
-            const auto nearest = svrintx_x(detail::sve_ptrue<double>(), arg);
-            return svcvt_s64_x(detail::sve_ptrue<double>(), nearest);
+            const auto nearest = svrintx_x(detail_sve::ptrue<double>(), arg);
+            return svcvt_s64_x(detail_sve::ptrue<double>(), nearest);
         }
 
         // ldexp
-        template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+        template <class A, class T, detail::enable_floating_point_t<T> = 0>
         XSIMD_INLINE batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& exp, requires_arch<sve>) noexcept
         {
-            return svscale_x(detail::sve_ptrue<T>(), x, exp);
+            return svscale_x(detail_sve::ptrue<T>(), x, exp);
         }
 
     } // namespace kernel
diff --git a/include/xsimd/arch/xsimd_vsx.hpp b/include/xsimd/arch/xsimd_vsx.hpp
index c07d3eab7..4a184136d 100644
--- a/include/xsimd/arch/xsimd_vsx.hpp
+++ b/include/xsimd/arch/xsimd_vsx.hpp
@@ -12,15 +12,15 @@
 #ifndef XSIMD_VSX_HPP
 #define XSIMD_VSX_HPP
 
-#include <complex>
-#include <limits>
-#include <type_traits>
-
+#include "../types/xsimd_batch_fwd.hpp"
 #include "../types/xsimd_vsx_register.hpp"
 #include "./common/xsimd_common_cast.hpp"
 
 #include <endian.h>
 
+#include <complex>
+#include <type_traits>
+
 namespace xsimd
 {
     template <typename T, class A, bool... Values>
@@ -34,6 +34,37 @@ namespace xsimd
 
     namespace kernel
     {
+        // builtin_t<T> - the scalar type as it would be used for a vector intrinsic.
+        // VSX vector intrinsics do not support long, unsigned long, and char.
+        // The builtin_t<T> alias maps the incoming type to the one that
+        // has to be used with the intrinsics.
+        template <typename T>
+        struct builtin_scalar
+        {
+            using type = T;
+        };
+
+        template <>
+        struct builtin_scalar<unsigned long>
+        {
+            using type = unsigned long long;
+        };
+
+        template <>
+        struct builtin_scalar<long>
+        {
+            using type = long long;
+        };
+
+        template <>
+        struct builtin_scalar<char>
+        {
+            using type = typename std::conditional<std::is_signed<char>::value, signed char, unsigned char>::type;
+        };
+
+        template <typename T>
+        using builtin_t = typename builtin_scalar<T>::type;
+
         template <class A, class T>
         XSIMD_INLINE batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<common>) noexcept;
         template <class A, class T>
@@ -219,7 +250,7 @@ namespace xsimd
         template <class A, class T, class = std::enable_if_t<std::is_scalar<T>::value>>
         XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<vsx>) noexcept
         {
-            return vec_splats(val);
+            return vec_splats(static_cast<builtin_t<T>>(val));
         }
 
         // ceil
@@ -422,18 +453,18 @@ namespace xsimd
             return ~vec_cmpeq(self.data, self.data);
         }
 
-        // load_aligned
+        // load_unaligned (the pointer cast keeps constness and only maps T to builtin_t<T>)
         template <class A, class T, class = std::enable_if_t<std::is_scalar<T>::value>>
-        XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<vsx>) noexcept
+        XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<vsx>) noexcept
         {
-            return vec_ld(0, reinterpret_cast<const typename batch<T, A>::register_type*>(mem));
+            return (typename batch<T, A>::register_type)vec_xl(0, (builtin_t<T> const*)mem);
         }
 
-        // load_unaligned
+        // load_aligned
         template <class A, class T, class = std::enable_if_t<std::is_scalar<T>::value>>
-        XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<vsx>) noexcept
+        XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<vsx>) noexcept
         {
-            return vec_vsx_ld(0, (typename batch<T, A>::register_type const*)mem);
+            return load_unaligned<A>(mem, kernel::convert<T> {}, vsx {});
         }
 
         // load_complex
@@ -606,8 +637,18 @@ namespace xsimd
         }
 
         // round
-        template <class A, class T, class = std::enable_if_t<std::is_floating_point<T>::value>>
-        XSIMD_INLINE batch<T, A> round(batch<T, A> const& self, requires_arch<vsx>) noexcept
+        // float: vec_round maps to vrfin (wrong rounding mode), so use xvrspi when available; double: vec_round maps to xvrdpi, which does the right thing
+#if defined(__has_builtin)
+#if __has_builtin(__builtin_vsx_xvrspi)
+        template <class A>
+        XSIMD_INLINE batch<float, A> round(batch<float, A> const& self, requires_arch<vsx>) noexcept
+        {
+            return __builtin_vsx_xvrspi(self.data);
+        }
+#endif
+#endif
+        template <class A>
+        XSIMD_INLINE batch<double, A> round(batch<double, A> const& self, requires_arch<vsx>) noexcept
         {
             return vec_round(self.data);
         }
@@ -749,14 +790,14 @@ namespace xsimd
         template <class A, class T, class = std::enable_if_t<std::is_scalar<T>::value>>
         XSIMD_INLINE void store_aligned(T* mem, batch<T, A> const& self, requires_arch<vsx>) noexcept
         {
-            return vec_st(self.data, 0, reinterpret_cast<typename batch<T, A>::register_type*>(mem));
+            vec_xst((typename batch<T, A>::register_type)self.data, 0, (builtin_t<T>*)mem);
         }
 
         // store_unaligned
         template <class A, class T, class = std::enable_if_t<std::is_scalar<T>::value>>
         XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<vsx>) noexcept
         {
-            return vec_vsx_st(self.data, 0, reinterpret_cast<typename batch<T, A>::register_type*>(mem));
+            store_aligned<A>(mem, self, vsx {});
         }
 
         // sub
diff --git a/include/xsimd/arch/xsimd_vxe.hpp b/include/xsimd/arch/xsimd_vxe.hpp
new file mode 100644
index 000000000..c419daa52
--- /dev/null
+++ b/include/xsimd/arch/xsimd_vxe.hpp
@@ -0,0 +1,787 @@
+/***************************************************************************
+ * Copyright (c) Andreas Krebbel                                            *
+ * Based on xsimd_vsx.hpp                                                   *
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_VXE_HPP
+#define XSIMD_VXE_HPP
+
+#include "../types/xsimd_vxe_register.hpp"
+
+#include <type_traits>
+
+namespace xsimd
+{
+    namespace kernel
+    {
+        using namespace types;
+        using v1ti = __int128 __attribute__((vector_size(16)));
+        using v4sf = float __attribute__((vector_size(16)));
+        using v2df = double __attribute__((vector_size(16)));
+        using uv2di = unsigned long long int __attribute__((vector_size(16)));
+        using v2di = long long int __attribute__((vector_size(16)));
+        using uv4si = unsigned int __attribute__((vector_size(16)));
+        using v4si = int __attribute__((vector_size(16)));
+        using uv8hi = unsigned short int __attribute__((vector_size(16)));
+        using v8hi = short int __attribute__((vector_size(16)));
+        using uv16qi = unsigned char __attribute__((vector_size(16)));
+        using v16qi = signed char __attribute__((vector_size(16)));
+
+        // builtin_t<T> - the scalar type as it would be used for a vector intrinsic.
+        // VXE vector intrinsics do not support long, unsigned long, and char.
+        // The builtin_t<T> alias maps the incoming type to the one that
+        // has to be used with the intrinsics.
+        template <typename T>
+        struct builtin_scalar
+        {
+            using type = T;
+        };
+
+        template <>
+        struct builtin_scalar<unsigned long>
+        {
+            using type = unsigned long long;
+        };
+
+        template <>
+        struct builtin_scalar<long>
+        {
+            using type = long long;
+        };
+
+        template <>
+        struct builtin_scalar<char>
+        {
+            using type = typename std::conditional<std::is_signed<char>::value, signed char, unsigned char>::type;
+        };
+
+        template <typename T>
+        using builtin_t = typename builtin_scalar<T>::type;
+
+        // bitwise_cast
+        template <class A, class T_in, class T_out>
+        XSIMD_INLINE batch<T_out, A> bitwise_cast(batch<T_in, A> const& self, batch<T_out, A> const&, requires_arch<vxe>) noexcept
+        {
+            return (typename batch<T_out, A>::register_type)(self.data);
+        }
+
+        // batch_bool_cast
+        template <class A, class T_out, class T_in>
+        XSIMD_INLINE batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<vxe>) noexcept
+        {
+            return (typename batch_bool<T_out, A>::register_type)self.data;
+        }
+
+        // load
+
+        // load_unaligned (the pointer cast keeps constness and only maps T to builtin_t<T>)
+        template <class A, class T, class = std::enable_if_t<std::is_scalar<T>::value>>
+        XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<vxe>) noexcept
+        {
+            return (typename batch<T, A>::register_type)vec_xl(0, (builtin_t<T> const*)mem);
+        }
+
+        // load_aligned
+        template <class A, class T, class = std::enable_if_t<std::is_scalar<T>::value>>
+        XSIMD_INLINE batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<vxe>) noexcept
+        {
+            return load_unaligned<A>(mem, kernel::convert<T> {}, vxe {});
+        }
+
+        // load_complex
+        namespace detail
+        {
+            template <class A>
+            XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<vxe>) noexcept
+            {
+                // De-interleave real and imaginary parts
+                // hi = [r0, i0, r1, i1], lo = [r2, i2, r3, i3]
+                // We need: real = [r0, r1, r2, r3], imag = [i0, i1, i2, i3]
+                // vec_perm byte indices 0-15 select from hi, 16-31 select from lo
+                uv16qi perm_real = (uv16qi) { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
+                uv16qi perm_imag = (uv16qi) { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
+                v4sf real = vec_perm((v4sf)hi.data, (v4sf)lo.data, perm_real);
+                v4sf imag = vec_perm((v4sf)hi.data, (v4sf)lo.data, perm_imag);
+                return { batch<float, A>(real), batch<float, A>(imag) };
+            }
+
+            template <class A>
+            XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<vxe>) noexcept
+            {
+                // De-interleave: hi = [r0, i0], lo = [r1, i1]
+                // We need: real = [r0, r1], imag = [i0, i1]
+                // vec_perm byte indices 0-15 select from hi, 16-31 select from lo
+                uv16qi perm_real = (uv16qi) { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 };
+                uv16qi perm_imag = (uv16qi) { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 };
+                v2df real = vec_perm((v2df)hi.data, (v2df)lo.data, perm_real);
+                v2df imag = vec_perm((v2df)hi.data, (v2df)lo.data, perm_imag);
+                return { batch<double, A>(real), batch<double, A>(imag) };
+            }
+
+            template <class A>
+            XSIMD_INLINE batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<vxe>) noexcept
+            {
+                uv16qi perm = (uv16qi) { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 };
+                return batch<float, A>(vec_perm((v4sf)self.real().data, (v4sf)self.imag().data, perm));
+            }
+
+            template <class A>
+            XSIMD_INLINE batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<vxe>) noexcept
+            {
+                uv16qi perm = (uv16qi) { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 };
+                return batch<float, A>(vec_perm((v4sf)self.real().data, (v4sf)self.imag().data, perm));
+            }
+
+            template <class A>
+            XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<vxe>) noexcept
+            {
+                uv16qi perm = (uv16qi) { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 };
+                return batch<double, A>(vec_perm((v2df)self.real().data, (v2df)self.imag().data, perm));
+            }
+
+            template <class A>
+            XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<vxe>) noexcept
+            {
+                uv16qi perm = (uv16qi) { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 };
+                return batch<double, A>(vec_perm((v2df)self.real().data, (v2df)self.imag().data, perm));
+            }
+        }
+
+        // store
+        template <class A, class T>
+        XSIMD_INLINE void store_aligned(T* dst, batch<T, A> const& src, requires_arch<vxe>) noexcept
+        {
+            vec_xst(src.data, 0, (builtin_t<T>*)dst);
+        }
+
+        template <class A, class T>
+        XSIMD_INLINE void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<vxe>) noexcept
+        {
+            store_aligned<A>(dst, src, vxe {});
+        }
+
+        // set
+        template <class A, class T, class... Values>
+        XSIMD_INLINE batch<T, A> set(batch<T, A> const&, requires_arch<vxe>, Values... values) noexcept
+        {
+            static_assert(sizeof...(Values) == batch<T, A>::size, "consistent init");
+            return typename batch<T, A>::register_type { values... };
+        }
+
+        template <class A, class T, class... Values, class = std::enable_if_t<std::is_scalar<T>::value>>
+        XSIMD_INLINE batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<vxe>, Values... values) noexcept
+        {
+            static_assert(sizeof...(Values) == batch_bool<T, A>::size, "consistent init");
+            return typename batch_bool<T, A>::register_type { static_cast<decltype(std::declval<typename batch_bool<T, A>::register_type>()[0])>(values ? -1LL : 0LL)... };
+        }
+        // first
+        template <class A, class T, class = std::enable_if_t<std::is_scalar<T>::value>>
+        XSIMD_INLINE T first(batch<T, A> const& self, requires_arch<vxe>) noexcept
+        {
+            return self.data[0];
+        }
+        // insert
+        template <class A, class T, size_t I, class = std::enable_if_t<std::is_scalar<T>::value>>
+        XSIMD_INLINE batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<vxe>) noexcept
+        {
+            // vec_insert on float is broken with clang
+            batch<T, A> out(self);
+            out.data[I] = val;
+            return out;
+        }
+
+        // eq
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return self.data == other.data;
+        }
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return self.data == other.data;
+        }
+
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return self.data < other.data;
+        }
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return self.data <= other.data;
+        }
+
+        // neq
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return ~(self.data == other.data);
+        }
+
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return bitwise_xor(self, other);
+        }
+
+        // sub
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return self.data - other.data;
+        }
+
+        // broadcast
+        template <class A, class T, class = std::enable_if_t<std::is_scalar<T>::value>>
+        XSIMD_INLINE batch<T, A> broadcast(T val, requires_arch<vxe>) noexcept
+        {
+            return vec_splats(static_cast<builtin_t<T>>(val));
+        }
+
+        // abs (restricted to signed T, i.e. signed integers and floating point types)
+        template <class A, class T, class = std::enable_if_t<std::is_signed<T>::value>>
+        XSIMD_INLINE batch<T, A> abs(batch<T, A> const& self, requires_arch<vxe>) noexcept
+        {
+            return vec_abs(self.data);
+        }
+        // bitwise_and
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return (typename batch<T, A>::register_type)((v4si)self.data & (v4si)other.data);
+        }
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return self.data & other.data;
+        }
+
+        // bitwise_or
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return (typename batch<T, A>::register_type)((v4si)self.data | (v4si)other.data);
+        }
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return self.data | other.data;
+        }
+
+        // bitwise_xor
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return (typename batch<T, A>::register_type)((v4si)self.data ^ (v4si)other.data);
+        }
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return self.data ^ other.data;
+        }
+
+        // bitwise_not
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<vxe>) noexcept
+        {
+            // ~ operator does not work on floating point vectors
+            return (typename batch<T, A>::register_type)(~(v4si)self.data);
+        }
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<vxe>) noexcept
+        {
+            return ~self.data;
+        }
+
+        // bitwise_andnot
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return (typename batch<T, A>::register_type)((v4si)self.data & ~(v4si)other.data);
+        }
+        template <class A, class T>
+        XSIMD_INLINE batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return self.data & ~other.data;
+        }
+
+        // div
+        template <class A, class T, class = std::enable_if_t<std::is_floating_point<T>::value>>
+        XSIMD_INLINE batch<T, A> div(batch<T, A> const& self, batch<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return self.data / other.data;
+        }
+
+        // neg (unary minus flips the sign, so neg(+0.0) is -0.0; 0 - x would give +0.0)
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> neg(batch<T, A> const& self, requires_arch<vxe>) noexcept
+        {
+            return -self.data;
+        }
+
+        // add
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return self.data + other.data;
+        }
+
+        // all
+        template <class A, class T>
+        XSIMD_INLINE bool all(batch_bool<T, A> const& self, requires_arch<vxe>) noexcept
+        {
+            return ((v1ti)self.data)[0] == -1;
+        }
+
+        // any
+        template <class A, class T>
+        XSIMD_INLINE bool any(batch_bool<T, A> const& self, requires_arch<vxe>) noexcept
+        {
+            return ((v1ti)self.data)[0] != 0;
+        }
+        // avgr
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return vec_avg(self.data, other.data);
+        }
+
+        // max
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return vec_max(self.data, other.data);
+        }
+        // min
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return vec_min(self.data, other.data);
+        }
+        // fma
+        template <class A, class T, class = std::enable_if_t<std::is_floating_point<T>::value>>
+        XSIMD_INLINE batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<vxe>) noexcept
+        {
+            return vec_madd(x.data, y.data, z.data);
+        }
+        // fms
+        template <class A, class T, class = std::enable_if_t<std::is_floating_point<T>::value>>
+        XSIMD_INLINE batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<vxe>) noexcept
+        {
+            return vec_msub(x.data, y.data, z.data);
+        }
+
+        // mul
+        template <class A, class T, class = std::enable_if_t<std::is_floating_point<T>::value>>
+        XSIMD_INLINE batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return self.data * other.data;
+        }
+        // haddp: lane i of the result is the sum of the four lanes of r[i]
+        template <class A>
+        XSIMD_INLINE batch<float, A> haddp(batch<float, A> const* r, requires_arch<vxe>) noexcept
+        {
+            v4sf lo01, hi01, lo23, hi23, sum01, sum23, sumeven, sumodd;
+            lo01 = vec_mergel(r[0].data, r[1].data); // { r[0][2], r[1][2], r[0][3], r[1][3] }
+            hi01 = vec_mergeh(r[0].data, r[1].data); // { r[0][0], r[1][0], r[0][1], r[1][1] }
+            lo23 = vec_mergel(r[2].data, r[3].data); // { r[2][2], r[3][2], r[2][3], r[3][3] }
+            hi23 = vec_mergeh(r[2].data, r[3].data); // { r[2][0], r[3][0], r[2][1], r[3][1] }
+            sum01 = lo01 + hi01; // { r[0][0] + r[0][2], r[1][0] + r[1][2], r[0][1] + r[0][3], r[1][1] + r[1][3] }
+            sum23 = lo23 + hi23; // { r[2][0] + r[2][2], r[3][0] + r[3][2], r[2][1] + r[2][3], r[3][1] + r[3][3] }
+            sumeven = (v4sf)vec_mergeh((v2di)sum01, (v2di)sum23); // { r[0][0] + r[0][2], r[1][0] + r[1][2], r[2][0] + r[2][2], r[3][0] + r[3][2] }
+            sumodd = (v4sf)vec_mergel((v2di)sum01, (v2di)sum23); // { r[0][1] + r[0][3], r[1][1] + r[1][3], r[2][1] + r[2][3], r[3][1] + r[3][3] }
+            return sumeven + sumodd;
+        }
+        template <class A>
+        XSIMD_INLINE batch<double, A> haddp(batch<double, A> const* row, requires_arch<vxe>) noexcept
+        {
+            return vec_mergeh(row[0].data, row[1].data) + vec_mergel(row[0].data, row[1].data);
+        }
+
+        // reduce_add
+        template <class A>
+        XSIMD_INLINE float reduce_add(batch<float, A> const& self, requires_arch<vxe>) noexcept
+        {
+            v4sf shifted_64 = vec_sld(self.data, self.data, 8);
+            v4sf added_1 = self.data + shifted_64;
+            v4sf shifted_32 = vec_sld(added_1, added_1, 4);
+            return (added_1 + shifted_32)[0];
+        }
+
+        template <class A>
+        XSIMD_INLINE double reduce_add(batch<double, A> const& self, requires_arch<vxe>) noexcept
+        {
+            return (self.data + vec_sld(self.data, self.data, 8))[0];
+        }
+
+        template <class A>
+        XSIMD_INLINE uint64_t reduce_add(batch<uint64_t, A> const& self, requires_arch<vxe>) noexcept
+        {
+            uv2di shifted = vec_sld((uv2di)self.data, (uv2di)self.data, 8);
+            uv2di sum = (uv2di)self.data + shifted;
+            return (uint64_t)sum[0];
+        }
+        template <class A>
+        XSIMD_INLINE int64_t reduce_add(batch<int64_t, A> const& self, requires_arch<vxe>) noexcept
+        {
+            v2di shifted = vec_sld((v2di)self.data, (v2di)self.data, 8);
+            v2di sum = (v2di)self.data + shifted;
+            return (int64_t)sum[0];
+        }
+        // Generic integral reduce_add.
+        // vec_sld(v, v, n) rotates the register left by n bytes, so adding the rotated
+        // copy folds the lanes pairwise. Starting with an 8-byte rotation and halving
+        // the distance down to sizeof(T) leaves the total in every lane; lane 0 is
+        // returned. Stopping at the element width keeps 8-byte integers that do not
+        // match the int64_t/uint64_t overloads above (e.g. long long) out of the
+        // byte-wise path, which would mix partial elements into the result.
+        template <class A, class T, class = std::enable_if_t<std::is_integral<T>::value>>
+        XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<vxe>) noexcept
+        {
+            using t = typename batch<T, A>::register_type;
+
+            // 8-byte step: required for every element width
+            t acc = self.data + vec_sld(self.data, self.data, 8);
+
+            // 4-byte step: elements of 4 bytes or less
+            XSIMD_IF_CONSTEXPR(sizeof(T) <= 4)
+            {
+                acc = acc + vec_sld(acc, acc, 4);
+            }
+            // 2-byte step: elements of 2 bytes or less
+            XSIMD_IF_CONSTEXPR(sizeof(T) <= 2)
+            {
+                acc = acc + vec_sld(acc, acc, 2);
+            }
+            // 1-byte step: single byte elements only
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+            {
+                acc = acc + vec_sld(acc, acc, 1);
+            }
+
+            // every lane now holds the total
+            return acc[0];
+        }
+
+        // select
+        template <class A, class T>
+        XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<vxe>) noexcept
+        {
+            return vec_sel(false_br.data, true_br.data, cond.data);
+        }
+        template <class A, class T, bool... Values, class = std::enable_if_t<std::is_scalar<T>::value>>
+        XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<vxe>) noexcept
+        {
+            return select(batch_bool<T, A> { Values... }, true_br, false_br, vxe {});
+        }
+
+        // slide_left: N is a byte count; N equal to the register size yields zero. NOTE(review): vec_sll presumably takes a bit count (0-7) - verify that a byte shift such as vec_slb is not required for 8 * N
+        template <size_t N, class A, class T>
+        XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<vxe>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(N == batch<T, A>::size * sizeof(T))
+            {
+                return batch<T, A>(0);
+            }
+            else
+            {
+                auto shift_count = vec_splats((uint8_t)(8 * N));
+                return vec_sll(x.data, shift_count);
+            }
+        }
+
+        // slide_right: N is a byte count; N equal to the register size yields zero. NOTE(review): vec_srl presumably takes a bit count (0-7) - verify that a byte shift such as vec_srb is not required for 8 * N
+        template <size_t N, class A, class T>
+        XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<vxe>) noexcept
+        {
+            XSIMD_IF_CONSTEXPR(N == batch<T, A>::size * sizeof(T))
+            {
+                return batch<T, A>(0);
+            }
+            else
+            {
+                auto shift_count = vec_splats((uint8_t)(8 * N));
+                return vec_srl(x.data, shift_count);
+            }
+        }
+
+        // sqrt
+        template <class A, class T, class = std::enable_if_t<std::is_floating_point<T>::value>>
+        XSIMD_INLINE batch<T, A> sqrt(batch<T, A> const& val, requires_arch<vxe>) noexcept
+        {
+            return vec_sqrt(val.data);
+        }
+
+        // rsqrt
+        template <class A, class T, class = std::enable_if_t<std::is_floating_point<T>::value>>
+        XSIMD_INLINE batch<T, A> rsqrt(batch<T, A> const& val, requires_arch<vxe>) noexcept
+        {
+            return batch<T, A>(T(1)) / sqrt(val, vxe {});
+        }
+
+        // shuffle
+        template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
+        XSIMD_INLINE batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3>, requires_arch<vxe>) noexcept
+        {
+            return vec_perm(x.data, y.data,
+                            (__vector unsigned char) {
+                                4 * I0 + 0, 4 * I0 + 1, 4 * I0 + 2, 4 * I0 + 3,
+                                4 * I1 + 0, 4 * I1 + 1, 4 * I1 + 2, 4 * I1 + 3,
+                                4 * I2 + 0, 4 * I2 + 1, 4 * I2 + 2, 4 * I2 + 3,
+                                4 * I3 + 0, 4 * I3 + 1, 4 * I3 + 2, 4 * I3 + 3 });
+        }
+
+        template <class A, class ITy, ITy I0, ITy I1>
+        XSIMD_INLINE batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1>, requires_arch<vxe>) noexcept
+        {
+            return vec_perm(x.data, y.data,
+                            (__vector unsigned char) {
+                                8 * I0 + 0,
+                                8 * I0 + 1,
+                                8 * I0 + 2,
+                                8 * I0 + 3,
+                                8 * I0 + 4,
+                                8 * I0 + 5,
+                                8 * I0 + 6,
+                                8 * I0 + 7,
+                                8 * I1 + 0,
+                                8 * I1 + 1,
+                                8 * I1 + 2,
+                                8 * I1 + 3,
+                                8 * I1 + 4,
+                                8 * I1 + 5,
+                                8 * I1 + 6,
+                                8 * I1 + 7,
+                            });
+        }
+
+        // swizzle
+        // 16 x 8bit
+        template <class A, uint8_t... Values>
+        XSIMD_INLINE batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, Values...>, requires_arch<vxe>) noexcept
+        {
+            static_assert(sizeof...(Values) == batch<uint8_t, A>::size, "consistent init");
+            uv16qi perm = (uv16qi) { Values... };
+            return vec_perm(self.data, self.data, perm);
+        }
+        template <class A, uint8_t... Values>
+        XSIMD_INLINE batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, Values...>, requires_arch<vxe>) noexcept
+        {
+            static_assert(sizeof...(Values) == batch<int8_t, A>::size, "consistent init");
+            uv16qi perm = (uv16qi) { Values... };
+            return vec_perm(self.data, self.data, perm);
+        }
+
+        // 8 x 16 bit
+        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<vxe>) noexcept
+        {
+            uv16qi perm = (uv16qi) {
+                2 * V0, 2 * V0 + 1,
+                2 * V1, 2 * V1 + 1,
+                2 * V2, 2 * V2 + 1,
+                2 * V3, 2 * V3 + 1,
+                2 * V4, 2 * V4 + 1,
+                2 * V5, 2 * V5 + 1,
+                2 * V6, 2 * V6 + 1,
+                2 * V7, 2 * V7 + 1
+            };
+            return vec_perm(self.data, self.data, perm);
+        }
+        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+        XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<vxe>) noexcept
+        {
+            uv16qi perm = (uv16qi) {
+                2 * V0, 2 * V0 + 1,
+                2 * V1, 2 * V1 + 1,
+                2 * V2, 2 * V2 + 1,
+                2 * V3, 2 * V3 + 1,
+                2 * V4, 2 * V4 + 1,
+                2 * V5, 2 * V5 + 1,
+                2 * V6, 2 * V6 + 1,
+                2 * V7, 2 * V7 + 1
+            };
+            return vec_perm(self.data, self.data, perm);
+        }
+
+        // 4 x 32 bit
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        XSIMD_INLINE batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<vxe>) noexcept
+        {
+            uv16qi perm = (uv16qi) {
+                4 * V0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3,
+                4 * V1, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3,
+                4 * V2, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3,
+                4 * V3, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3
+            };
+            return vec_perm(self.data, self.data, perm);
+        }
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        XSIMD_INLINE batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<vxe>) noexcept
+        {
+            uv16qi perm = (uv16qi) {
+                4 * V0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3,
+                4 * V1, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3,
+                4 * V2, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3,
+                4 * V3, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3
+            };
+            return vec_perm(self.data, self.data, perm);
+        }
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        XSIMD_INLINE batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<vxe>) noexcept
+        {
+            uv16qi perm = (uv16qi) {
+                4 * V0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3,
+                4 * V1, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3,
+                4 * V2, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3,
+                4 * V3, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3
+            };
+            return vec_perm(self.data, self.data, perm);
+        }
+
+        // 2 x 64 bit
+        template <class A, uint64_t V0, uint64_t V1>
+        XSIMD_INLINE batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<vxe>) noexcept
+        {
+            using out = typename batch<uint64_t, A>::register_type;
+            uv16qi perm = (uv16qi) {
+                8 * V0 + 0,
+                8 * V0 + 1,
+                8 * V0 + 2,
+                8 * V0 + 3,
+                8 * V0 + 4,
+                8 * V0 + 5,
+                8 * V0 + 6,
+                8 * V0 + 7,
+                8 * V1 + 0,
+                8 * V1 + 1,
+                8 * V1 + 2,
+                8 * V1 + 3,
+                8 * V1 + 4,
+                8 * V1 + 5,
+                8 * V1 + 6,
+                8 * V1 + 7,
+            };
+            return (out)vec_perm((uv2di)self.data, (uv2di)self.data, perm);
+        }
+        template <class A, uint64_t V0, uint64_t V1>
+        XSIMD_INLINE batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<vxe>) noexcept
+        {
+            using out = typename batch<int64_t, A>::register_type;
+            uv16qi perm = (uv16qi) {
+                8 * V0 + 0,
+                8 * V0 + 1,
+                8 * V0 + 2,
+                8 * V0 + 3,
+                8 * V0 + 4,
+                8 * V0 + 5,
+                8 * V0 + 6,
+                8 * V0 + 7,
+                8 * V1 + 0,
+                8 * V1 + 1,
+                8 * V1 + 2,
+                8 * V1 + 3,
+                8 * V1 + 4,
+                8 * V1 + 5,
+                8 * V1 + 6,
+                8 * V1 + 7,
+            };
+            return (out)vec_perm((v2di)self.data, (v2di)self.data, perm);
+        }
+        template <class A, uint64_t V0, uint64_t V1>
+        XSIMD_INLINE batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<vxe>) noexcept
+        {
+            uv16qi perm = (uv16qi) {
+                8 * V0 + 0,
+                8 * V0 + 1,
+                8 * V0 + 2,
+                8 * V0 + 3,
+                8 * V0 + 4,
+                8 * V0 + 5,
+                8 * V0 + 6,
+                8 * V0 + 7,
+                8 * V1 + 0,
+                8 * V1 + 1,
+                8 * V1 + 2,
+                8 * V1 + 3,
+                8 * V1 + 4,
+                8 * V1 + 5,
+                8 * V1 + 6,
+                8 * V1 + 7,
+            };
+            return vec_perm(self.data, self.data, perm);
+        }
+        // zip_hi
+        template <class A, class T, class = std::enable_if_t<std::is_scalar<T>::value>>
+        XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return vec_mergel(self.data, other.data);
+        }
+
+        // zip_lo
+        template <class A, class T, class = std::enable_if_t<std::is_scalar<T>::value>>
+        XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<vxe>) noexcept
+        {
+            return vec_mergeh(self.data, other.data);
+        }
+        // bitwise_rshift
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<vxe>) noexcept
+        {
+            return self.data >> other;
+        }
+        // bitwise_lshift
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<vxe>) noexcept
+        {
+            return self.data << other;
+        }
+
+        // isnan
+        template <class A, class T, class = std::enable_if_t<std::is_floating_point<T>::value>>
+        XSIMD_INLINE batch_bool<T, A> isnan(batch<T, A> const& self, requires_arch<vxe>) noexcept
+        {
+            return ~vec_cmpeq(self.data, self.data);
+        }
+
+        // ceil
+        template <class A, class T, class = std::enable_if_t<std::is_floating_point<T>::value>>
+        XSIMD_INLINE batch<T, A> ceil(batch<T, A> const& self, requires_arch<vxe>) noexcept
+        {
+            return vec_ceil(self.data);
+        }
+
+        // floor
+        template <class A, class T, class = std::enable_if_t<std::is_floating_point<T>::value>>
+        XSIMD_INLINE batch<T, A> floor(batch<T, A> const& self, requires_arch<vxe>) noexcept
+        {
+            return vec_floor(self.data);
+        }
+        // round
+        // vec_round rounds ties to even instead of away from zero, so the vfi builtin is used directly (overload only defined when the compiler provides it)
+#if defined __has_builtin && __has_builtin(__builtin_s390_vfi)
+        template <class A, class T, class = std::enable_if_t<std::is_floating_point<T>::value>>
+        XSIMD_INLINE batch<T, A> round(batch<T, A> const& self, requires_arch<vxe>) noexcept
+        {
+            return __builtin_s390_vfi(self.data, 4, 1); // NOTE(review): presumably 4 = suppress inexact exception, 1 = round to nearest with ties away from zero -- confirm
+        }
+#endif
+        // trunc
+        template <class A, class T, class = std::enable_if_t<std::is_floating_point<T>::value>>
+        XSIMD_INLINE batch<T, A> trunc(batch<T, A> const& self, requires_arch<vxe>) noexcept
+        {
+            return vec_trunc(self.data);
+        }
+    }
+}
+#endif
diff --git a/include/xsimd/arch/xsimd_wasm.hpp b/include/xsimd/arch/xsimd_wasm.hpp
index 9e437dfb1..b657bbd6b 100644
--- a/include/xsimd/arch/xsimd_wasm.hpp
+++ b/include/xsimd/arch/xsimd_wasm.hpp
@@ -13,11 +13,11 @@
 #ifndef XSIMD_WASM_HPP
 #define XSIMD_WASM_HPP
 
-#include <type_traits>
-
 #include "../types/xsimd_wasm_register.hpp"
 #include "./common/xsimd_common_cast.hpp"
 
+#include <type_traits>
+
 namespace xsimd
 {
     template <typename T, class A, bool... Values>
diff --git a/include/xsimd/config/xsimd_arch.hpp b/include/xsimd/config/xsimd_arch.hpp
index d69ff7560..b3995912e 100644
--- a/include/xsimd/config/xsimd_arch.hpp
+++ b/include/xsimd/config/xsimd_arch.hpp
@@ -12,14 +12,14 @@
 #ifndef XSIMD_ARCH_HPP
 #define XSIMD_ARCH_HPP
 
-#include <initializer_list>
-#include <type_traits>
-#include <utility>
-
 #include "../types/xsimd_all_registers.hpp"
 #include "./xsimd_config.hpp"
 #include "./xsimd_cpuid.hpp"
 
+#include <initializer_list>
+#include <type_traits>
+#include <utility>
+
 namespace xsimd
 {
 
@@ -51,8 +51,8 @@ namespace xsimd
 
         template <class T, class Ty, class... Tys>
         struct contains<T, Ty, Tys...>
-            : std::conditional<std::is_same<Ty, T>::value, std::true_type,
-                               contains<T, Tys...>>::type
+            : std::conditional_t<std::is_same<Ty, T>::value, std::true_type,
+                                 contains<T, Tys...>>
         {
         };
 
@@ -162,8 +162,8 @@ namespace xsimd
     } // namespace detail
 
     using all_x86_architectures = arch_list<
-        avx512vnni<avx512vbmi2>, avx512vbmi2, avx512vbmi, avx512ifma, avx512pf, avx512vnni<avx512bw>, avx512bw, avx512er, avx512dq, avx512cd, avx512f,
-        avxvnni, fma3<avx2>, avx2, fma3<avx>, avx, fma4, fma3<sse4_2>,
+        avx512vnni<avx512vbmi2>, avx512vbmi2, avx512vbmi, avx512ifma, avx512pf, avx512vnni<avx512bw>, avx512bw, avx512er, avx512dq, avx512vl, avx512cd, avx512f,
+        avxvnni, avx512vl_256, fma3<avx2>, avx2, fma3<avx>, avx, avx512vl_128, avx2_128, avx_128, fma4, fma3<sse4_2>,
         sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>;
 
     using all_sve_architectures = arch_list<detail::sve<512>, detail::sve<256>, detail::sve<128>>;
@@ -172,7 +172,8 @@ namespace xsimd
     using all_power_architectures = arch_list<vsx>;
     using all_riscv_architectures = all_rvv_architectures;
     using all_wasm_architectures = arch_list<wasm>;
-    using all_architectures = typename detail::join<all_power_architectures, all_riscv_architectures, all_wasm_architectures, all_arm_architectures, all_x86_architectures>::type;
+    using all_s390x_architectures = arch_list<vxe>;
+    using all_architectures = typename detail::join<all_power_architectures, all_riscv_architectures, all_wasm_architectures, all_arm_architectures, all_x86_architectures, all_s390x_architectures>::type;
 
     using supported_architectures = typename detail::supported<all_architectures>::type;
 
@@ -180,6 +181,7 @@ namespace xsimd
     using arm_arch = typename detail::supported<all_arm_architectures>::type::best;
     using power_arch = typename detail::supported<all_power_architectures>::type::best;
     using riscv_arch = typename detail::supported<all_riscv_architectures>::type::best;
+    using s390x_arch = typename detail::supported<all_s390x_architectures>::type::best;
     using best_arch = typename supported_architectures::best;
 
 #ifdef XSIMD_DEFAULT_ARCH
diff --git a/include/xsimd/config/xsimd_config.hpp b/include/xsimd/config/xsimd_config.hpp
index e81dd8053..e3887c276 100644
--- a/include/xsimd/config/xsimd_config.hpp
+++ b/include/xsimd/config/xsimd_config.hpp
@@ -13,7 +13,7 @@
 #define XSIMD_CONFIG_HPP
 
 #define XSIMD_VERSION_MAJOR 14
-#define XSIMD_VERSION_MINOR 0
+#define XSIMD_VERSION_MINOR 2
 #define XSIMD_VERSION_PATCH 0
 
 #if defined(__GNUC__) && defined(__BYTE_ORDER__)
@@ -27,12 +27,75 @@
 #define XSIMD_LITTLE_ENDIAN
 #endif
 
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Normalized C++ version number, equivalent to __cplusplus but also works with
+ * MSVC which requires /Zc:__cplusplus to set it correctly (otherwise it's always
+ * 199711L). Use this instead of __cplusplus throughout the codebase.
+ */
+#if defined(_MSC_VER) && !defined(__clang__)
+#define XSIMD_CPP_VERSION _MSVC_LANG
+#else
+#define XSIMD_CPP_VERSION __cplusplus
+#endif
+
 /**
  * high level free functions
  *
  * @defgroup xsimd_config_macro Instruction Set Detection
  */
 
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if the target is the x86 architecture family.
+ */
+#if defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)
+#define XSIMD_TARGET_X86 1
+#else
+#define XSIMD_TARGET_X86 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if GNU-style inline assembly is available, to 0 otherwise.
+ */
+/* Use __clang__ || __GNUC__ for GNU-style inline asm. clang-cl runs in
+ * MSVC-compatibility mode and does not define __GNUC__ by default, but it
+ * still defines __clang__. Clang documents asm/__asm__ support and broad
+ * GCC-extension compatibility:
+ * https://clang.llvm.org/docs/LanguageExtensions.html
+ * Clang only emits __GNUC__ when GNUCVersion != 0:
+ * https://raw.githubusercontent.com/llvm/llvm-project/main/clang/lib/Frontend/InitPreprocessor.cpp
+ * and GNUCVersion defaults to 0:
+ * https://raw.githubusercontent.com/llvm/llvm-project/main/clang/include/clang/Basic/LangOptions.def
+ */
+#if defined(__clang__) || defined(__GNUC__)
+#define XSIMD_WITH_INLINE_ASM 1
+#else
+#define XSIMD_WITH_INLINE_ASM 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 when the compiler is allowed to reassociate floating-point
+ * operations (e.g. -ffast-math, -fassociative-math).  Detected
+ * automatically from __FAST_MATH__ (GCC/Clang) and __ASSOCIATIVE_MATH__
+ * (GCC).  Clang does not define a macro for standalone
+ * -fassociative-math; users should define XSIMD_REASSOCIATIVE_MATH=1
+ * manually in that case.
+ */
+#ifndef XSIMD_REASSOCIATIVE_MATH
+#if defined(__FAST_MATH__) || defined(__ASSOCIATIVE_MATH__)
+#define XSIMD_REASSOCIATIVE_MATH 1
+#else
+#define XSIMD_REASSOCIATIVE_MATH 0
+#endif
+#endif
+
 /**
  * @ingroup xsimd_config_macro
  *
@@ -244,6 +307,17 @@
 #define XSIMD_WITH_AVX512CD 0
 #endif
 
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if AVX512VL is available at compile-time, to 0 otherwise.
+ */
+#ifdef __AVX512VL__
+#define XSIMD_WITH_AVX512VL XSIMD_WITH_AVX512CD
+#else
+#define XSIMD_WITH_AVX512VL 0
+#endif
+
 /**
  * @ingroup xsimd_config_macro
  *
@@ -346,12 +420,23 @@
 /**
  * @ingroup xsimd_config_macro
  *
- * Set to 1 if NEON64 is available at compile-time, to 0 otherwise.
+ * Set to 1 if the target is in the ARM architecture family in 64 bits, to 0 otherwise
  */
 #if defined(__aarch64__) || defined(_M_ARM64)
-#define XSIMD_WITH_NEON64 1
+#define XSIMD_TARGET_ARM64 1
 #else
-#define XSIMD_WITH_NEON64 0
+#define XSIMD_TARGET_ARM64 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if the target is in the ARM architecture family, to 0 otherwise
+ */
+#if defined(__arm__) || defined(_M_ARM) || XSIMD_TARGET_ARM64
+#define XSIMD_TARGET_ARM 1
+#else
+#define XSIMD_TARGET_ARM 0
 #endif
 
 /**
@@ -359,12 +444,26 @@
  *
  * Set to 1 if NEON is available at compile-time, to 0 otherwise.
  */
-#if (defined(__ARM_NEON) && __ARM_ARCH >= 7) || XSIMD_WITH_NEON64
+#if (defined(__ARM_NEON) && (__ARM_ARCH >= 7)) || XSIMD_TARGET_ARM64
 #define XSIMD_WITH_NEON 1
 #else
 #define XSIMD_WITH_NEON 0
 #endif
 
+// Neon is always available on Arm64, though it is theoretically possible to compile
+// without it, such as -march=armv8-a+nosimd.
+// Note that MSVC may never define __ARM_NEON even when available.
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if NEON64 is available at compile-time, to 0 otherwise.
+ */
+#if XSIMD_TARGET_ARM64
+#define XSIMD_WITH_NEON64 1
+#else
+#define XSIMD_WITH_NEON64 0
+#endif
+
 /**
  * @ingroup xsimd_config_macro
  *
@@ -389,6 +488,17 @@
 #define XSIMD_SVE_BITS 0
 #endif
 
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if the target is the RISC-V architecture family.
+ */
+#ifdef __riscv
+#define XSIMD_TARGET_RISCV 1
+#else
+#define XSIMD_TARGET_RISCV 0
+#endif
+
 /**
  * @ingroup xsimd_config_macro
  *
@@ -413,6 +523,17 @@
 #define XSIMD_WITH_WASM 0
 #endif
 
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if the target is in the PowerPC architecture family, to 0 otherwise
+ */
+#if defined(__powerpc__) || defined(__powerpc64__) || defined(_ARCH_PPC) || defined(_ARCH_PPC64)
+#define XSIMD_TARGET_PPC 1
+#else
+#define XSIMD_TARGET_PPC 0
+#endif
+
 /**
  * @ingroup xsimd_config_macro
  *
@@ -424,6 +545,29 @@
 #define XSIMD_WITH_VSX 0
 #endif
 
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if the target is in the IBM Z architecture family, to 0 otherwise
+ */
+#if defined(__s390x__)
+#define XSIMD_TARGET_S390X 1
+#else
+#define XSIMD_TARGET_S390X 0
+#endif
+
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if s390x VXE is available at compile-time, to 0 otherwise.
+ * Float vectors have been introduced with VXE included with IBM z14.
+ */
+#if defined(__VEC__) && __VEC__ >= 10304 && __ARCH__ >= 12
+#define XSIMD_WITH_VXE 1
+#else
+#define XSIMD_WITH_VXE 0
+#endif
+
 // Workaround for MSVC compiler
 #ifdef _MSC_VER
 
@@ -482,8 +626,19 @@
 
 #endif
 
-#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_AVXVNNI && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_AVX512ER && !XSIMD_WITH_AVX512PF && !XSIMD_WITH_AVX512IFMA && !XSIMD_WITH_AVX512VBMI && !XSIMD_WITH_AVX512VBMI2 && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_RVV && !XSIMD_WITH_WASM && !XSIMD_WITH_VSX && !XSIMD_WITH_EMULATED
+#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_AVXVNNI && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512VL && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_AVX512ER && !XSIMD_WITH_AVX512PF && !XSIMD_WITH_AVX512IFMA && !XSIMD_WITH_AVX512VBMI && !XSIMD_WITH_AVX512VBMI2 && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_RVV && !XSIMD_WITH_WASM && !XSIMD_WITH_VSX && !XSIMD_WITH_EMULATED && !XSIMD_WITH_VXE
 #define XSIMD_NO_SUPPORTED_ARCHITECTURE
 #endif
 
+/**
+ * @ingroup xsimd_config_macro
+ *
+ * Set to 1 if the target is Linux (excluding Android API levels below 18), where getauxval is expected to be available, to 0 otherwise
+ */
+#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
+#define XSIMD_HAVE_LINUX_GETAUXVAL 1
+#else
+#define XSIMD_HAVE_LINUX_GETAUXVAL 0
+#endif
+
 #endif
diff --git a/include/xsimd/config/xsimd_cpu_features.hpp b/include/xsimd/config/xsimd_cpu_features.hpp
new file mode 100644
index 000000000..5dcc00416
--- /dev/null
+++ b/include/xsimd/config/xsimd_cpu_features.hpp
@@ -0,0 +1,50 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_CPU_FEATURES_HPP
+#define XSIMD_CPU_FEATURES_HPP
+
+#include "./xsimd_cpu_features_arm.hpp"
+#include "./xsimd_cpu_features_ppc.hpp"
+#include "./xsimd_cpu_features_riscv.hpp"
+#include "./xsimd_cpu_features_s390x.hpp"
+#include "./xsimd_cpu_features_x86.hpp"
+
+namespace xsimd
+{
+
+    /**
+     * Cross-platform CPU feature detection class.
+     *
+     * All member functions are safe to call on all platforms.
+     *
+     * @warning This class is *not* thread safe.
+     * Its internal lazy querying structure makes even `const` member functions prone to data races.
+     * The structure is also generally not appropriate for directly branching (e.g. on
+     * ``cpu_features::avx2``) because it includes a branch that the compiler cannot optimize.
+     * The current appropriate way to use this class for dynamic dispatching is to store the
+     * result of the function calls (e.g. @ref cpu_features) into (static) constants.
+     * This is done in @ref xsimd::available_architectures.
+     *
+     * @see xsimd::dispatch
+     * @see xsimd::available_architectures
+     */
+    class cpu_features : public s390x_cpu_features,
+                         public ppc_cpu_features,
+                         public riscv_cpu_features,
+                         public arm_cpu_features,
+                         public x86_cpu_features
+    {
+    };
+
+}
+
+#endif
diff --git a/include/xsimd/config/xsimd_cpu_features_arm.hpp b/include/xsimd/config/xsimd_cpu_features_arm.hpp
new file mode 100644
index 000000000..57f1c56e3
--- /dev/null
+++ b/include/xsimd/config/xsimd_cpu_features_arm.hpp
@@ -0,0 +1,132 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ***************************************************************************/
+
+#ifndef XSIMD_CPU_FEATURES_ARM_HPP
+#define XSIMD_CPU_FEATURES_ARM_HPP
+
+#include "./xsimd_config.hpp"
+#include "./xsimd_getauxval.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if XSIMD_TARGET_ARM && XSIMD_HAVE_LINUX_GETAUXVAL
+// HWCAP_XXX masks to use on getauxval results.
+// Header does not exist on all architectures and masks are architecture
+// specific.
+#include <asm/hwcap.h>
+#endif // XSIMD_TARGET_ARM && XSIMD_HAVE_LINUX_GETAUXVAL
+
+namespace xsimd
+{
+
+    namespace detail
+    {
+        using arm_reg64_t = std::uint64_t;
+
+        /**
+         * Return the SVE vector length in bytes for the current thread.
+         *
+         * The SVE vector length can be restricted.
+         * Contrary to `svcntb`, this does not need to be compiled with SVE, which
+         * should not be done in a dynamic dispatch jump function.
+         *
+         * Safety: It is the user's responsibility to first make sure that SVE is
+         * available.
+         */
+        inline arm_reg64_t arm_rdvl_unsafe();
+    }
+
+    /**
+     * An opinionated CPU feature detection utility for ARM.
+     *
+     * Combines compile-time knowledge with runtime detection when available.
+     * On Linux, runtime detection uses getauxval to query the auxiliary vector.
+     * On other platforms, only compile-time information is used.
+     *
+     * This is well defined on all architectures.
+     * It will always return false on non-ARM architectures.
+     */
+    class arm_cpu_features : private linux_hwcap_backend_default
+    {
+    public:
+        inline bool neon() const noexcept;
+        inline bool neon64() const noexcept;
+        inline bool sve() const noexcept;
+        inline std::size_t sve_size_bytes() const noexcept;
+        inline bool i8mm() const noexcept;
+    };
+
+    /********************
+     *  Implementation  *
+     ********************/
+
+    namespace detail
+    {
+#if XSIMD_TARGET_ARM64 && (defined(__GNUC__) || defined(__clang__))
+        __attribute__((target("arch=armv8-a+sve"))) inline arm_reg64_t arm_rdvl_unsafe()
+        {
+            arm_reg64_t vl;
+            __asm__ volatile("rdvl %0, #1" : "=r"(vl));
+            return vl;
+        }
+#else
+        inline arm_reg64_t arm_rdvl_unsafe() { return 0; }
+#endif
+    }
+
+    inline bool arm_cpu_features::neon() const noexcept
+    {
+#if XSIMD_TARGET_ARM && !XSIMD_TARGET_ARM64 && XSIMD_HAVE_LINUX_GETAUXVAL
+        return hwcap().has_feature(HWCAP_NEON);
+#else
+        return static_cast<bool>(XSIMD_WITH_NEON);
+#endif
+    }
+
+    inline bool arm_cpu_features::neon64() const noexcept
+    {
+        return static_cast<bool>(XSIMD_WITH_NEON64);
+    }
+
+    inline bool arm_cpu_features::sve() const noexcept
+    {
+#if XSIMD_TARGET_ARM64 && XSIMD_HAVE_LINUX_GETAUXVAL
+        return hwcap().has_feature(HWCAP_SVE);
+#else
+        return false;
+#endif
+    }
+
+    inline std::size_t arm_cpu_features::sve_size_bytes() const noexcept
+    {
+        if (sve())
+        {
+            return detail::arm_rdvl_unsafe();
+        }
+        return 0;
+    }
+
+    inline bool arm_cpu_features::i8mm() const noexcept
+    {
+#if XSIMD_TARGET_ARM64 && XSIMD_HAVE_LINUX_GETAUXVAL
+#ifdef HWCAP2_I8MM
+        return hwcap2().has_feature(HWCAP2_I8MM);
+#else
+        // HWCAP2_I8MM is possibly missing from the headers of older Linux distributions: fall back to a hard-coded mask
+        return hwcap2().has_feature(1 << 13);
+#endif
+#else // not 64-bit ARM, or no getauxval: no runtime query is performed
+        return false;
+#endif
+    }
+}
+#endif
diff --git a/include/xsimd/config/xsimd_cpu_features_ppc.hpp b/include/xsimd/config/xsimd_cpu_features_ppc.hpp
new file mode 100644
index 000000000..510124617
--- /dev/null
+++ b/include/xsimd/config/xsimd_cpu_features_ppc.hpp
@@ -0,0 +1,54 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ***************************************************************************/
+
+#ifndef XSIMD_CPU_FEATURES_PPC_HPP
+#define XSIMD_CPU_FEATURES_PPC_HPP
+
+#include "./xsimd_config.hpp"
+#include "./xsimd_getauxval.hpp"
+
+namespace xsimd
+{
+    /**
+     * An opinionated CPU feature detection utility for PowerPC.
+     *
+     * On Linux, runtime detection uses getauxval to query the auxiliary vector.
+     * On other platforms, only compile-time information is used.
+     *
+     * This is well defined on all architectures.
+     * It will always return false on non-PowerPC architectures.
+     */
+    class ppc_cpu_features : private linux_hwcap_backend_default
+    {
+    public:
+        inline bool vsx() const noexcept;
+    };
+
+    /********************
+     *  Implementation  *
+     ********************/
+
+    inline bool ppc_cpu_features::vsx() const noexcept
+    {
+#if XSIMD_TARGET_PPC && XSIMD_HAVE_LINUX_GETAUXVAL
+#ifdef PPC_FEATURE_HAS_VSX
+        return hwcap().has_feature(PPC_FEATURE_HAS_VSX);
+#else
+        // PPC_FEATURE_HAS_VSX is possibly missing from the headers of older Linux distributions: fall back to a hard-coded mask
+        return hwcap().has_feature(0x00000080);
+#endif
+#else // not PowerPC, or no getauxval: rely on compile-time knowledge only
+        return XSIMD_WITH_VSX;
+#endif
+    }
+}
+
+#endif
diff --git a/include/xsimd/config/xsimd_cpu_features_riscv.hpp b/include/xsimd/config/xsimd_cpu_features_riscv.hpp
new file mode 100644
index 000000000..d1d2ba7d6
--- /dev/null
+++ b/include/xsimd/config/xsimd_cpu_features_riscv.hpp
@@ -0,0 +1,95 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ***************************************************************************/
+
+#ifndef XSIMD_CPU_FEATURES_RISCV_HPP
+#define XSIMD_CPU_FEATURES_RISCV_HPP
+
+#include "./xsimd_config.hpp"
+#include "./xsimd_getauxval.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if XSIMD_TARGET_RISCV && XSIMD_HAVE_LINUX_GETAUXVAL
+// HWCAP_XXX masks to use on getauxval results.
+// Header does not exist on all architectures and masks are architecture
+// specific.
+#include <asm/hwcap.h>
+#endif // XSIMD_TARGET_RISCV && XSIMD_HAVE_LINUX_GETAUXVAL
+
+namespace xsimd
+{
+    namespace detail
+    {
+        using riscv_reg64_t = std::uint64_t;
+
+        /**
+         * Return the RVV vector length in bytes.
+         *
+         * This does not need to be compiled with RVV, which should not
+         * be done in a dynamic dispatch jump function.
+         *
+         * Safety: It is the user's responsibility to first make sure that RVV is
+         * available.
+         */
+        inline riscv_reg64_t riscv_csrr_unsafe();
+    }
+
+    class riscv_cpu_features : private linux_hwcap_backend_default
+    {
+    public:
+        inline bool rvv() const noexcept;
+        inline std::size_t rvv_size_bytes() const noexcept;
+    };
+
+    /********************
+     *  Implementation  *
+     ********************/
+
+    namespace detail
+    {
+#if XSIMD_TARGET_RISCV && (defined(__GNUC__) || defined(__clang__))
+        __attribute__((target("arch=+v"))) inline riscv_reg64_t riscv_csrr_unsafe()
+        {
+            riscv_reg64_t vlenb;
+            __asm__ volatile("csrr %0, vlenb" : "=r"(vlenb));
+            return vlenb;
+        }
+#else
+        inline riscv_reg64_t riscv_csrr_unsafe() { return 0; }
+#endif
+    }
+
+    inline bool riscv_cpu_features::rvv() const noexcept
+    {
+#if XSIMD_TARGET_RISCV && XSIMD_HAVE_LINUX_GETAUXVAL
+#ifdef HWCAP_V
+        return hwcap().has_feature(HWCAP_V);
+#else
+        // HWCAP_V is possibly missing from the headers of older Linux distributions: fall back to the bit indexed by the letter 'V' counted from 'A'
+        return hwcap().has_feature(1 << ('V' - 'A'));
+#endif
+#else // not RISC-V, or no getauxval: no runtime query is performed
+        return false;
+#endif
+    }
+
+    inline std::size_t riscv_cpu_features::rvv_size_bytes() const noexcept
+    {
+        if (rvv())
+        {
+            return detail::riscv_csrr_unsafe();
+        }
+        return 0;
+    }
+}
+
+#endif
diff --git a/include/xsimd/config/xsimd_cpu_features_s390x.hpp b/include/xsimd/config/xsimd_cpu_features_s390x.hpp
new file mode 100644
index 000000000..e05fa53f3
--- /dev/null
+++ b/include/xsimd/config/xsimd_cpu_features_s390x.hpp
@@ -0,0 +1,56 @@
+/***************************************************************************
+ * Copyright (c) Andreas Krebbel                                            *
+ * Based on xsimd_cpu_features_ppc.hpp                                      *
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_CPU_FEATURES_S390X_HPP
+#define XSIMD_CPU_FEATURES_S390X_HPP
+
+#include "./xsimd_config.hpp"
+#include "./xsimd_getauxval.hpp"
+
+namespace xsimd
+{
+    /**
+     * An opinionated CPU feature detection utility for IBM Z.
+     *
+     * On Linux, runtime detection uses getauxval to query the auxiliary vector.
+     * On other platforms, only compile-time information is used.
+     *
+     * This is well defined on all architectures.
+     * It will always return false on non-IBM Z architectures.
+     */
+    class s390x_cpu_features : private linux_hwcap_backend_default
+    {
+    public:
+        inline bool vxe() const noexcept; // HWCAP_S390_VXE at runtime, else compile-time XSIMD_WITH_VXE
+    };
+
+    /********************
+     *  Implementation  *
+     ********************/
+
+    inline bool s390x_cpu_features::vxe() const noexcept
+    {
+#if XSIMD_TARGET_S390X && XSIMD_HAVE_LINUX_GETAUXVAL
+#ifdef HWCAP_S390_VXE
+        return hwcap().has_feature(HWCAP_S390_VXE);
+#else
+        // HWCAP_S390_VXE is possibly missing on older Linux distributions
+        return hwcap().has_feature(8192); // 8192 == 1 << 13, used in place of HWCAP_S390_VXE
+#endif
+#else
+        return XSIMD_WITH_VXE; // No runtime query here: fall back to the compile-time setting
+#endif
+    }
+}
+
+#endif
diff --git a/include/xsimd/config/xsimd_cpu_features_x86.hpp b/include/xsimd/config/xsimd_cpu_features_x86.hpp
new file mode 100644
index 000000000..0de055e8f
--- /dev/null
+++ b/include/xsimd/config/xsimd_cpu_features_x86.hpp
@@ -0,0 +1,1227 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_CPU_FEATURES_X86_HPP
+#define XSIMD_CPU_FEATURES_X86_HPP
+
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+#if XSIMD_CPP_VERSION >= 201703L
+#include <string_view>
+#endif
+
+#include "../utils/bits.hpp"
+#include "./xsimd_config.hpp"
+
+#if XSIMD_TARGET_X86 && defined(_MSC_VER)
+#include <intrin.h> // Contains the definition of __cpuidex
+#endif
+
+namespace xsimd
+{
+    namespace detail
+    {
+        using x86_reg32_t = std::uint32_t;
+
+        using cpuid_reg_t = std::array<x86_reg32_t, 4>;
+
+        /**
+         * CPU Identification (CPUID) instruction results.
+         *
+         * The CPUID instruction provides detailed information about the processor,
+         * including supported instruction set extensions (SSE, AVX, AVX-512, etc.).
+         * This function is well defined on all architectures but will return all zeros
+         * on all non-x86 architectures.
+         *
+         * @param leaf The value inputted to the EAX register.
+         * @param subleaf The value inputted to the ECX register.
+         *
+         * @see https://en.wikipedia.org/wiki/CPUID
+         */
+        inline cpuid_reg_t x86_cpuid(int leaf, int subleaf = 0) noexcept;
+
+        inline x86_reg32_t x86_xcr0_low() noexcept;
+
+        /** A strongly typed bitset for a 32-bit register. */
+        template <typename E>
+        using x86_reg32_bitset = utils::uint_bitset<E, x86_reg32_t>;
+
+        /** Pairs a register bit-descriptor enum `E` with its index `I` in the CPUID output. */
+        template <typename E, x86_reg32_t I>
+        struct x86_reg_id
+        {
+            static constexpr x86_reg32_t index = I; // Position in the CPUID output array
+            using bits = E; // Enum naming the bits of that register
+
+            static_assert(index < 4, "At most 4 register in CPUID"); // Unsigned: no lower bound needed
+        };
+
+        /** Find the first register id whose `index` equals K in a list of `x86_reg_id`. */
+        template <x86_reg32_t K, typename... reg_ids>
+        struct find_reg_k;
+
+        /** Primary template, used for an empty list: yields a placeholder whose `bits` is `void`. */
+        template <x86_reg32_t K, typename... reg_ids>
+        struct find_reg_k
+        {
+            using type = x86_reg_id<void, 0>;
+        };
+
+        /** Recursive case: yields the head when its index matches, else searches the tail. */
+        template <x86_reg32_t K, typename reg_id_head, typename... reg_id_tail>
+        struct find_reg_k<K, reg_id_head, reg_id_tail...>
+        {
+            using type = std::conditional_t<
+                reg_id_head::index == K,
+                reg_id_head,
+                typename find_reg_k<K, reg_id_tail...>::type>;
+        };
+
+        /**
+         * A class with strongly typed bitfield for `CPUID` registers.
+         *
+         * The class stores a variable number of register (up to four) from the CPUID
+         * output. This is a space optimization to avoid storing many zeros in the
+         * final `x86_cpu_features`.
+         * As a result, some of the type aliases `eax`, `ebx`, `ecx`, `edx` may be `void`.
+         */
+        template <x86_reg32_t leaf_num, x86_reg32_t subleaf_num, typename... reg_ids>
+        class x86_cpuid_regs : private x86_reg32_bitset<typename reg_ids::bits>...
+        {
+        private:
+            static_assert(sizeof...(reg_ids) <= 4, "At most 4 register in CPUID");
+
+            /* Build each stored bitset from the CPUID output register found at its `index`. */
+            constexpr explicit x86_cpuid_regs(const cpuid_reg_t& regs) noexcept
+                : x86_reg32_bitset<typename reg_ids::bits>(regs[reg_ids::index])...
+            {
+            }
+
+        public:
+            static constexpr x86_reg32_t leaf = leaf_num;
+            static constexpr x86_reg32_t subleaf = subleaf_num;
+
+            using eax = typename find_reg_k<0, reg_ids...>::type::bits;
+            using ebx = typename find_reg_k<1, reg_ids...>::type::bits;
+            using ecx = typename find_reg_k<2, reg_ids...>::type::bits;
+            using edx = typename find_reg_k<3, reg_ids...>::type::bits;
+
+            inline static x86_cpuid_regs read()
+            {
+                return x86_cpuid_regs(detail::x86_cpuid(leaf, subleaf));
+            }
+
+            constexpr x86_cpuid_regs() noexcept = default;
+
+            // TODO(C++17) compact version for which this was designed (pack expansion in
+            // using-declarations). The else clause contains a very verbose port.
+#if 0
+            using x86_reg32_bitset<typename reg_ids::bits>::all_bits_set...;
+            using x86_reg32_bitset<typename reg_ids::bits>::get_range...;
+#else
+
+        private:
+            template <int N>
+            struct m_empty_reg // Distinct placeholder enum per register, used when that register is `void`
+            {
+                enum class type {};
+            };
+
+            using eax_or_empty = std::conditional_t<std::is_void<eax>::value, typename m_empty_reg<0>::type, eax>;
+            using ebx_or_empty = std::conditional_t<std::is_void<ebx>::value, typename m_empty_reg<1>::type, ebx>;
+            using ecx_or_empty = std::conditional_t<std::is_void<ecx>::value, typename m_empty_reg<2>::type, ecx>;
+            using edx_or_empty = std::conditional_t<std::is_void<edx>::value, typename m_empty_reg<3>::type, edx>;
+
+        public: // One all_bits_set/get_range pair per register; enable_if removes those of `void` registers
+            template <eax_or_empty... bits, typename T = eax, std::enable_if_t<!std::is_void<T>::value, int> = 0>
+            constexpr bool all_bits_set() const noexcept
+            {
+                return x86_reg32_bitset<eax>::template all_bits_set<bits...>();
+            }
+
+            template <eax_or_empty start, eax_or_empty end, typename T = eax, std::enable_if_t<!std::is_void<T>::value, int> = 0>
+            constexpr x86_reg32_t get_range() const noexcept
+            {
+                return x86_reg32_bitset<eax>::template get_range<start, end>();
+            }
+
+            template <ebx_or_empty... bits, typename T = ebx, std::enable_if_t<!std::is_void<T>::value, int> = 0>
+            constexpr bool all_bits_set() const noexcept
+            {
+                return x86_reg32_bitset<ebx>::template all_bits_set<bits...>();
+            }
+
+            template <ebx_or_empty start, ebx_or_empty end, typename T = ebx, std::enable_if_t<!std::is_void<T>::value, int> = 0>
+            constexpr x86_reg32_t get_range() const noexcept
+            {
+                return x86_reg32_bitset<ebx>::template get_range<start, end>();
+            }
+
+            template <ecx_or_empty... bits, typename T = ecx, std::enable_if_t<!std::is_void<T>::value, int> = 0>
+            constexpr bool all_bits_set() const noexcept
+            {
+                return x86_reg32_bitset<ecx>::template all_bits_set<bits...>();
+            }
+
+            template <ecx_or_empty start, ecx_or_empty end, typename T = ecx, std::enable_if_t<!std::is_void<T>::value, int> = 0>
+            constexpr x86_reg32_t get_range() const noexcept
+            {
+                return x86_reg32_bitset<ecx>::template get_range<start, end>();
+            }
+
+            template <edx_or_empty... bits, typename T = edx, std::enable_if_t<!std::is_void<T>::value, int> = 0>
+            constexpr bool all_bits_set() const noexcept
+            {
+                return x86_reg32_bitset<edx>::template all_bits_set<bits...>();
+            }
+
+            template <edx_or_empty start, edx_or_empty end, typename T = edx, std::enable_if_t<!std::is_void<T>::value, int> = 0>
+            constexpr x86_reg32_t get_range() const noexcept
+            {
+                return x86_reg32_bitset<edx>::template get_range<start, end>();
+            }
+
+#endif // C++17
+        };
+
+        template <bool extended>
+        struct x86_cpuid_highest_func // Reader for CPUID leaf 0x0 / 0x80000000 (selected by `extended`)
+        {
+        private:
+            using x86_reg32_t = detail::x86_reg32_t;
+            using manufacturer_str = std::array<char, 3 * sizeof(x86_reg32_t)>;
+
+        public:
+            static constexpr x86_reg32_t leaf = extended ? 0x80000000 : 0x0;
+
+            inline static x86_cpuid_highest_func read()
+            {
+                auto regs = detail::x86_cpuid(static_cast<int>(leaf)); // Query the range selected by `extended`
+                x86_cpuid_highest_func out {};
+                // Highest function parameter in EAX
+                out.m_highest_leaf = regs[0];
+
+                // Manufacturer string in EBX, EDX, ECX (in that order)
+                char* manuf = out.m_manufacturer_id.data();
+                std::memcpy(manuf + 0 * sizeof(x86_reg32_t), &regs[1], sizeof(x86_reg32_t));
+                std::memcpy(manuf + 1 * sizeof(x86_reg32_t), &regs[3], sizeof(x86_reg32_t));
+                std::memcpy(manuf + 2 * sizeof(x86_reg32_t), &regs[2], sizeof(x86_reg32_t));
+
+                return out;
+            }
+
+            constexpr x86_cpuid_highest_func() noexcept = default;
+
+            /**
+             * Highest available leaf in the CPUID range selected by `extended`.
+             *
+             * This is the highest function parameter (EAX) that can be passed to CPUID.
+             * This is valid in the specified range:
+             *   - if `extended` is `false`, that is below `0x80000000`,
+             *   - if `extended` is `true`, that is at or above `0x80000000`.
+             */
+            constexpr x86_reg32_t highest_leaf() const noexcept
+            {
+                return m_highest_leaf;
+            }
+
+            /**
+             * The manufacturer ID string in a static array.
+             *
+             * This raw character array is case specific and may contain both leading
+             * and trailing whitespaces.
+             * It cannot be assumed to be null terminated.
+             * This is not implemented for all manufacturer when `extended` is `true`.
+             */
+            constexpr manufacturer_str manufacturer_id_raw() const noexcept
+            {
+                return m_manufacturer_id;
+            }
+
+#if XSIMD_CPP_VERSION >= 201703L
+            constexpr std::string_view manufacturer_id() const noexcept
+            {
+                return { m_manufacturer_id.data(), m_manufacturer_id.size() };
+            }
+#endif
+
+        private:
+            manufacturer_str m_manufacturer_id {};
+            x86_reg32_t m_highest_leaf {};
+        };
+    }
+
+    /**
+     * Highest CPUID Function Parameter and Manufacturer ID (EAX=0).
+     *
+     * Returns the highest leaf value supported by CPUID in the standard range
+     * (below 0x80000000), and the processor manufacturer ID string.
+     *
+     * @see https://en.wikipedia.org/wiki/CPUID
+     */
+    using x86_cpuid_leaf0 = detail::x86_cpuid_highest_func<false>;
+
+    /**
+     * Known processor manufacturer ID strings returned by CPUID leaf 0.
+     *
+     * The 12-byte manufacturer ID is stored in EBX, EDX, ECX (in that order).
+     * Some strings are shared across physical CPUs, emulators, and virtual machines.
+     * Obscure, defunct, and soft-core CPUs are not represented; they map to `unknown`.
+     *
+     * @see https://en.wikipedia.org/wiki/CPUID
+     */
+    enum class x86_manufacturer
+    {
+        /**
+         * AMD ("AuthenticAMD", "AMD ISBETTER").
+         *
+         * "AMD ISBETTER" was used by early K5 engineering samples.
+         */
+        amd,
+        /**
+         * Intel ("GenuineIntel", "GenuineIotel").
+         *
+         * "GenuineIotel" is a rare typo variant seen on some chips.
+         */
+        intel,
+        /**
+         * VIA / Centaur ("CentaurHauls", "VIA VIA VIA ").
+         *
+         * Centaur Technology was acquired by VIA in 1999;
+         * older chips report "CentaurHauls", newer ones "VIA VIA VIA ".
+         */
+        via,
+        /** Zhaoxin ("  Shanghai  "). */
+        zhaoxin,
+        /** Hygon ("HygonGenuine"). */
+        hygon,
+        /**
+         * Transmeta ("TransmetaCPU", "GenuineTMx86").
+         *
+         * Two different ID strings were used across product lines.
+         */
+        transmeta,
+        /** MCST Elbrus ("E2K MACHINE "). */
+        elbrus,
+        /** Microsoft Virtual PC / x86-to-ARM ("Virtual CPU "). */
+        microsoft_vpc,
+        /** Unrecognized manufacturer ID string. */
+        unknown,
+    };
+
+    /**
+     * Parse a 12-byte CPUID manufacturer ID into an @ref x86_manufacturer value.
+     *
+     * The input is the raw character array returned by @ref x86_cpuid_leaf0::manufacturer_id_raw.
+     * Unrecognized strings map to @ref x86_manufacturer::unknown.
+     */
+    inline x86_manufacturer x86_parse_manufacturer(const std::array<char, 12>& id) noexcept
+    {
+        auto eq = [&id](const char(&s)[13]) noexcept -> bool // 12 characters plus the literal's terminator
+        {
+            return std::memcmp(id.data(), s, 12) == 0; // Compare the 12 ID bytes only
+        };
+        if (eq("GenuineIntel") || eq("GenuineIotel"))
+            return x86_manufacturer::intel;
+        if (eq("AuthenticAMD") || eq("AMD ISBETTER"))
+            return x86_manufacturer::amd;
+        if (eq("CentaurHauls") || eq("VIA VIA VIA "))
+            return x86_manufacturer::via;
+        if (eq("  Shanghai  "))
+            return x86_manufacturer::zhaoxin;
+        if (eq("HygonGenuine"))
+            return x86_manufacturer::hygon;
+        if (eq("TransmetaCPU") || eq("GenuineTMx86"))
+            return x86_manufacturer::transmeta;
+        if (eq("E2K MACHINE "))
+            return x86_manufacturer::elbrus;
+        if (eq("Virtual CPU "))
+            return x86_manufacturer::microsoft_vpc;
+        return x86_manufacturer::unknown;
+    }
+
+    /** Map an @ref x86_manufacturer value to its lowercase identifier string. */
+    constexpr const char* x86_manufacturer_name(x86_manufacturer m) noexcept
+    {
+        // Enumerators are tested in declaration order; every one of them has
+        // a dedicated label.
+        if (m == x86_manufacturer::amd)
+            return "amd";
+        if (m == x86_manufacturer::intel)
+            return "intel";
+        if (m == x86_manufacturer::via)
+            return "via";
+        if (m == x86_manufacturer::zhaoxin)
+            return "zhaoxin";
+        if (m == x86_manufacturer::hygon)
+            return "hygon";
+        if (m == x86_manufacturer::transmeta)
+            return "transmeta";
+        if (m == x86_manufacturer::elbrus)
+            return "elbrus";
+        if (m == x86_manufacturer::microsoft_vpc)
+            return "microsoft_vpc";
+        if (m == x86_manufacturer::unknown)
+            return "unknown";
+        // Only reachable with a value that is not a declared enumerator.
+        return "invalid";
+    }
+
+    struct x86_cpuid_leaf1_traits // Bit layout description for CPUID leaf 1
+    {
+        static constexpr detail::x86_reg32_t leaf = 1; // CPUID input value for EAX
+        static constexpr detail::x86_reg32_t subleaf = 0; // CPUID input value for ECX
+
+        enum class eax
+        {
+            /* Stepping ID bit range. */
+            stepping_start = 0,
+            stepping_end = 4,
+            /* Model bit range. */
+            model_start = 4,
+            model_end = 8,
+            /* Family ID bit range. */
+            family_id_start = 8,
+            family_id_end = 12,
+            /* Extended Model ID bit range. */
+            ext_model_start = 16,
+            ext_model_end = 20,
+            /* Extended Family ID bit range. */
+            ext_family_start = 20,
+            ext_family_end = 28,
+        };
+        enum class ecx
+        {
+            /* Streaming SIMD Extensions 3. */
+            sse3 = 0,
+            /* Supplemental Streaming SIMD Extensions 3. */
+            ssse3 = 9,
+            /* Fused multiply-add with 3 operands (FMA3). */
+            fma3 = 12,
+            /* Streaming SIMD Extensions 4.1. */
+            sse4_1 = 19,
+            /* Streaming SIMD Extensions 4.2. */
+            sse4_2 = 20,
+            /* Population count instruction (POPCNT). */
+            popcnt = 23,
+            /* Advanced Encryption Standard instruction set. */
+            aes_ni = 25,
+            /* OS has enabled XSAVE/XRSTOR for extended processor state management. */
+            osxsave = 27,
+            /* Advanced Vector Extensions (256-bit SIMD). */
+            avx = 28,
+            /* Half to single floating point conversion. */
+            f16c = 29,
+            /* On-chip random number generator. */
+            rdrnd = 30,
+        };
+        enum class edx
+        {
+            /* Streaming SIMD Extensions 2. */
+            sse2 = 26,
+        };
+
+        using regs_t = detail::x86_cpuid_regs<leaf, subleaf,
+                                              detail::x86_reg_id<eax, 0>,
+                                              detail::x86_reg_id<ecx, 2>,
+                                              detail::x86_reg_id<edx, 3>>; // EBX (index 1) is not stored
+    };
+
+    /**
+     * Processor Info and Feature Bits.
+     *
+     * Utility class that can read and parse the registers for the first leaf level
+     * of the CPUID instruction.
+     * This is well defined on all architectures but will return all false on all
+     * non-x86 architectures.
+     *
+     * @see https://en.wikipedia.org/wiki/CPUID
+     */
+    using x86_cpuid_leaf1 = typename x86_cpuid_leaf1_traits::regs_t;
+
+    struct x86_cpuid_leaf7_traits // Bit layout description for CPUID leaf 7, subleaf 0
+    {
+        static constexpr detail::x86_reg32_t leaf = 7; // CPUID input value for EAX
+        static constexpr detail::x86_reg32_t subleaf = 0; // CPUID input value for ECX
+
+        enum class eax
+        {
+            /* Start bit for the encoding of the highest subleaf available. */
+            highest_subleaf_start = 0,
+            /* End bit for the encoding of the highest subleaf available. */
+            highest_subleaf_end = 32,
+        };
+        enum class ebx
+        {
+            /* Bit Manipulation Instruction Set 1. */
+            bmi1 = 3,
+            /* Advanced Vector Extensions 2 (integer 256-bit SIMD). */
+            avx2 = 5,
+            /* Bit Manipulation Instruction Set 2. */
+            bmi2 = 8,
+            /* AVX-512 Foundation instructions. */
+            avx512f = 16,
+            /* AVX-512 Doubleword and Quadword instructions. */
+            avx512dq = 17,
+            /* Low-level access to the entropy-generating hardware. */
+            rdseed = 18,
+            /* Intel arbitrary precision add carry. */
+            adx = 19,
+            /* AVX-512 Integer Fused Multiply-Add instructions. */
+            avx512ifma = 21,
+            /* AVX-512 Prefetch instructions. */
+            avx512pf = 26,
+            /* AVX-512 Exponential and Reciprocal instructions. */
+            avx512er = 27,
+            /* AVX-512 Conflict Detection instructions. */
+            avx512cd = 28,
+            /* Sha-1 and Sha-256 extension. */
+            sha = 29,
+            /* AVX-512 Byte and Word instructions. */
+            avx512bw = 30,
+            /* AVX-512 Vector Length Extensions for xmm and ymm registers. */
+            avx512vl = 31,
+        };
+        enum class ecx
+        {
+            /* AVX-512 Vector Bit Manipulation instructions. */
+            avx512vbmi = 1,
+            /* AVX-512 Vector Bit Manipulation instructions 2. */
+            avx512vbmi2 = 6,
+            /* Galois Field instructions. */
+            gfni = 8,
+            /* Vector Advanced Encryption Standard instructions. */
+            vaes = 9,
+            /* Carry-less multiplication quadword instruction. */
+            vpclmulqdq = 10,
+            /* AVX-512 Vector Neural Network instructions. */
+            avx512vnni_bw = 11,
+            /* AVX-512 bit algorithm instructions (BITALG). */
+            avx512_bitalg = 12,
+            /* AVX-512 vector population count for doubleword and quadword. */
+            avx512_vpopcntdq = 14,
+        };
+        enum class edx
+        {
+            /* AVX-512 4-register neural network instructions (word variable precision). */
+            avx512_4vnniw = 2,
+            /* AVX-512 4-register multiply-accumulate single precision. */
+            avx512_4fmaps = 3,
+            /* AVX-512 intersect pairs of packed doubleword/quadword integers. */
+            avx512_vp2intersect = 8,
+            /* AVX-512 16-bit floating-point instructions. */
+            avx512_fp16 = 23,
+
+        };
+
+        using regs_t = detail::x86_cpuid_regs<leaf, subleaf,
+                                              detail::x86_reg_id<eax, 0>,
+                                              detail::x86_reg_id<ebx, 1>,
+                                              detail::x86_reg_id<ecx, 2>,
+                                              detail::x86_reg_id<edx, 3>>; // All four output registers are stored
+    };
+
+    /**
+     * Extended Feature Bits (EAX=7, ECX=0).
+     *
+     * Utility class that can read and parse the registers for the extended
+     * feature bits leaf of the CPUID instruction.
+     * This is well defined on all architectures but will return all false on all
+     * non-x86 architectures.
+     *
+     * @see https://en.wikipedia.org/wiki/CPUID
+     */
+    using x86_cpuid_leaf7 = typename x86_cpuid_leaf7_traits::regs_t;
+
+    struct x86_cpuid_leaf7sub1_traits // Bit layout description for CPUID leaf 7, subleaf 1
+    {
+        static constexpr detail::x86_reg32_t leaf = 7; // CPUID input value for EAX
+        static constexpr detail::x86_reg32_t subleaf = 1; // CPUID input value for ECX
+
+        enum class eax
+        {
+            /* AVX (VEX-encoded) Vector Neural Network instructions. */
+            avxvnni = 4,
+            /* AVX-512 BFloat16 instructions. */
+            avx512_bf16 = 5,
+        };
+
+        using regs_t = detail::x86_cpuid_regs<leaf, subleaf,
+                                              detail::x86_reg_id<eax, 0>>; // Only EAX (index 0) is stored
+    };
+
+    /**
+     * Extended Feature Bits (EAX=7, ECX=1).
+     *
+     * Utility class that can read and parse the registers for the extended
+     * feature bits, subleaf 1, of the CPUID instruction.
+     * This is well defined on all architectures but will return all false on all
+     * non-x86 architectures.
+     *
+     * @see https://en.wikipedia.org/wiki/CPUID
+     */
+    using x86_cpuid_leaf7sub1 = typename x86_cpuid_leaf7sub1_traits::regs_t;
+
+    /**
+     * Highest Extended CPUID Function Parameter (EAX=0x80000000).
+     *
+     * Returns the highest leaf value supported by CPUID in the extended range
+     * (at or above 0x80000000), and the processor manufacturer ID string.
+     *
+     * @see https://en.wikipedia.org/wiki/CPUID
+     */
+    using x86_cpuid_leaf80000000 = detail::x86_cpuid_highest_func<true>;
+
+    struct x86_cpuid_leaf80000001_traits // Bit layout description for CPUID leaf 0x80000001
+    {
+        static constexpr detail::x86_reg32_t leaf = 0x80000001; // CPUID input value for EAX
+        static constexpr detail::x86_reg32_t subleaf = 0; // CPUID input value for ECX
+
+        enum class ecx
+        {
+            /* AMD Fused multiply-add with 4 operands (FMA4). */
+            fma4 = 16,
+        };
+
+        using regs_t = detail::x86_cpuid_regs<leaf, subleaf,
+                                              detail::x86_reg_id<ecx, 2>>; // Only ECX (index 2) is stored
+    };
+
+    /**
+     * Extended Processor Info and Feature Bits.
+     *
+     * Utility class that can read and parse the registers for the extended
+     * processor info leaf of the CPUID instruction.
+     * This is well defined on all architectures but will return all false on all
+     * non-x86 architectures.
+     *
+     * @see https://en.wikipedia.org/wiki/CPUID
+     */
+    using x86_cpuid_leaf80000001 = typename x86_cpuid_leaf80000001_traits::regs_t;
+
+    /**
+     * Extended Control Register 0 (XCR0).
+     *
+     * Operating systems can explicitly disable the usage of instruction set (such
+     * as SSE or AVX extensions) by setting an appropriate flag in XCR0 register.
+     * This utility parses such bit values.
+     *
+     * @see https://docs.kernel.org/admin-guide/hw-vuln/gather_data_sampling.html
+     */
+    class x86_xcr0
+    {
+    public:
+        enum class xcr0
+        {
+            /** x87 FPU/MMX support (must be 1). */
+            x87 = 0,
+            /** XSAVE support for MXCSR and XMM registers. */
+            sse = 1,
+            /** AVX enabled and XSAVE support for upper halves of YMM registers. */
+            avx = 2,
+            /** MPX enabled and XSAVE support for BND0-BND3 registers. */
+            bndreg = 3,
+            /** MPX enabled and XSAVE support for BNDCFGU and BNDSTATUS registers. */
+            bndcsr = 4,
+            /** AVX-512 enabled and XSAVE support for opmask registers k0-k7. */
+            opmask = 5,
+            /** AVX-512 enabled and XSAVE support for upper halves of lower ZMM registers. */
+            zmm_hi256 = 6,
+            /** AVX-512 enabled and XSAVE support for upper ZMM registers. */
+            hi16_zmm = 7,
+            /** Saving/restoring Intel Processor Trace state via XSAVE enabled. */
+            processor_trace = 8,
+            /** XSAVE support for PKRU register. */
+            pkru = 9,
+        };
+
+        /**
+         * Create a default value with only SSE enabled.
+         *
+         * AVX and AVX512 strictly require OSXSAVE to be enabled by the OS.
+         * If OSXSAVE is disabled (e.g., via bcdedit /set xsavedisable 1), AVX state won't
+         * be preserved across context switches, so AVX cannot be used.
+         * SSE is therefore the only value safe to assume.
+         */
+        constexpr static x86_xcr0 safe_default() noexcept
+        {
+            x86_reg32_t low = {};
+            low = utils::make_bit_mask(static_cast<x86_reg32_t>(xcr0::sse));
+            return x86_xcr0(low);
+        }
+
+        /**
+         * Read the XCR0 register from the CPU if on the correct architecture.
+         *
+         * This is only safe to call if the OS has enabled XSAVE (OSXSAVE bit of CPUID leaf 1).
+         *
+         * @see x86_cpuid_leaf1::ecx::osxsave
+         */
+        inline static x86_xcr0 read()
+        {
+            assert(x86_cpuid_leaf1::read().all_bits_set<x86_cpuid_leaf1::ecx::osxsave>());
+            return x86_xcr0(detail::x86_xcr0_low());
+        }
+
+        template <xcr0... bits>
+        constexpr bool all_bits_set() const noexcept
+        {
+            return m_low.all_bits_set<bits...>();
+        }
+
+        /** Create a value which returns false to everything. */
+        constexpr x86_xcr0() noexcept = default;
+
+    private:
+        using x86_reg32_t = detail::x86_reg32_t;
+
+        using xcr0_reg_t = detail::x86_reg32_bitset<xcr0>;
+
+        /** Parse a XCR0 value into individual components. */
+        constexpr explicit x86_xcr0(x86_reg32_t low) noexcept
+            : m_low(low)
+        {
+        }
+
+        xcr0_reg_t m_low {};
+    };
+
+    /**
+     * Orchestrator for `CPUID` calls.
+     *
+     * This class orchestrates `CPUID` and `XCR0` calls so that they are made in the appropriate
+     * order. It also implements lazy calling and a caching mechanism around those calls.
+     * Works on all platforms, and returns all zeros on non-`x86` platforms.
+     */
+    class x86_cpu_features_backend_cpuid
+    {
+    public:
+        x86_cpu_features_backend_cpuid() noexcept = default;
+
+        inline x86_xcr0 const& xcr0() const noexcept;
+        inline x86_cpuid_leaf0 const& leaf0() const;
+        inline x86_cpuid_leaf80000000 const& leaf80000000() const;
+        inline x86_cpuid_leaf1 const& leaf1() const;
+        inline x86_cpuid_leaf7 const& leaf7() const;
+        inline x86_cpuid_leaf7sub1 const& leaf7sub1() const;
+        inline x86_cpuid_leaf80000001 const& leaf80000001() const;
+
+    private:
+        enum class status // Bit positions in `m_status`, one per cached value
+        {
+            leaf0_valid = 0,
+            leaf1_valid = 1,
+            leaf7_valid = 2,
+            leaf7sub1_valid = 3,
+            leaf80000000_valid = 4,
+            leaf80000001_valid = 5,
+            xcr0_valid = 6,
+        };
+
+        using status_bitset = utils::uint_bitset<status, std::uint32_t>;
+
+        mutable x86_cpuid_leaf0 m_leaf0 {}; // Caches are mutable: the const accessors fill them lazily
+        mutable x86_cpuid_leaf1 m_leaf1 {};
+        mutable x86_cpuid_leaf7 m_leaf7 {};
+        mutable x86_cpuid_leaf7sub1 m_leaf7sub1 {};
+        mutable x86_cpuid_leaf80000000 m_leaf80000000 {};
+        mutable x86_cpuid_leaf80000001 m_leaf80000001 {};
+        mutable x86_xcr0 m_xcr0 {};
+        mutable status_bitset m_status {}; // Tracks which of the caches above were already populated
+
+        inline bool osxsave() const noexcept;
+
+        /**
+         * Internal utility to lazily read and cache a CPUID leaf.
+         *
+         * @tparam status_id The status bit tracking whether this leaf has been read and cached.
+         * @tparam L The CPUID leaf type (e.g. x86_cpuid_leaf1, x86_cpuid_leaf7).
+         * @param leaf_cache A non-const reference to the class member that stores the leaf
+         *        value. It must be non-const because this function may write to it on first
+         *        call. It is passed explicitly (rather than accessed via `this`) to allow
+         *        factoring the caching logic across different leaf members.
+         * @return A const reference to `leaf_cache`. The non-const input / const-ref output
+         *         asymmetry is intentional: callers must not modify the cached value, but
+         *         this function needs write access to populate it.
+         *
+         * On first call, checks whether the leaf number is within the range advertised as
+         * supported by CPUID (via leaf 0 for the standard range, leaf 0x80000000 for the
+         * extended range). If supported, reads the leaf from the CPU; otherwise leaves
+         * `leaf_cache` at its zero-initialized default (all feature bits false). Either
+         * way, `status_id` is set so subsequent calls return immediately.
+         */
+        template <status status_id, typename L>
+        inline auto const& safe_read_leaf(L& leaf_cache) const;
+    };
+
+    /**
+     * No-Op orchestrator for `CPUID` calls
+     *
+     * This does nothing and returns zero-constructed objects on all calls.
+     * This is meant as an optimization on non `x86` platforms as the
+     * `x86_cpu_features_backend_cpuid` can be slightly large (hundreds of bytes).
+     */
+    class x86_cpu_features_backend_noop
+    {
+    public:
+        constexpr x86_xcr0 xcr0() const noexcept { return {}; }
+        constexpr x86_cpuid_leaf0 leaf0() const { return {}; }
+        constexpr x86_cpuid_leaf80000000 leaf80000000() const { return {}; }
+        constexpr x86_cpuid_leaf1 leaf1() const { return {}; }
+        constexpr x86_cpuid_leaf7 leaf7() const { return {}; }
+        constexpr x86_cpuid_leaf7sub1 leaf7sub1() const { return {}; }
+        constexpr x86_cpuid_leaf80000001 leaf80000001() const { return {}; }
+    };
+
+#if XSIMD_TARGET_X86
+    using x86_cpu_features_backend_default = x86_cpu_features_backend_cpuid;
+#else // !XSIMD_TARGET_X86
+    using x86_cpu_features_backend_default = x86_cpu_features_backend_noop;
+#endif // XSIMD_TARGET_X86
+
+    /**
+     * An opinionated CPU feature detection utility for x86.
+     *
+     * These are high level features that combine multiple registers reads in sequence.
+     * Instead of looking directly at raw CPUID results, this utility also checks that
+     * permissions (e.g. OSXSAVE) are enabled, and otherwise return conservative defaults.
+     *
+     * Well defined on all architectures: always returns false on non-x86 architectures.
+     */
+    class x86_cpu_features : private x86_cpu_features_backend_default
+    {
+    public:
+        x86_cpu_features() noexcept = default;
+
+        inline bool sse_enabled() const noexcept
+        {
+            return xcr0().all_bits_set<x86_xcr0::xcr0::sse>();
+        }
+
+        inline bool avx_enabled() const noexcept
+        {
+            // Check both SSE and AVX bits even though AVX must imply SSE
+            return xcr0().all_bits_set<x86_xcr0::xcr0::sse, x86_xcr0::xcr0::avx>();
+        }
+
+        inline bool avx512_enabled() const noexcept
+        {
+            // Check all SSE, AVX, opmask, and AVX512 bits even though AVX512 must
+            // imply AVX, SSE, and masked operations.
+            return xcr0().all_bits_set<x86_xcr0::xcr0::sse, x86_xcr0::xcr0::avx, x86_xcr0::xcr0::opmask, x86_xcr0::xcr0::zmm_hi256>();
+        }
+
+        /**
+         * The manufacturer ID string in a static array.
+         *
+         * This raw character array is case specific and may contain both leading
+         * and trailing whitespaces.
+         * It cannot be assumed to be null terminated.
+         */
+        inline auto manufacturer_id_raw() const noexcept
+        {
+            return leaf0().manufacturer_id_raw();
+        }
+
+#if XSIMD_CPP_VERSION >= 201703L
+        inline std::string_view manufacturer_id() const noexcept
+        {
+            return leaf0().manufacturer_id();
+        }
+#endif
+
+        /** The manufacturer ID string parsed into known common vendors. */
+        inline x86_manufacturer known_manufacturer() const noexcept
+        {
+            return x86_parse_manufacturer(manufacturer_id_raw());
+        }
+
+        /**
+         * Indicates whether the OS has enabled extended state management.
+         *
+         * When true, the OS has set bit 18 (OSXSAVE) in the CR4 control register,
+         * enabling the XGETBV/XSETBV instructions to access XCR0 and support
+         * processor extended state management using XSAVE/XRSTOR.
+         *
+         * This value is read from CPUID leaf 0x1, ECX bit 27, which reflects
+         * the state of CR4.OSXSAVE.
+         */
+        inline bool osxsave() const noexcept { return leaf1().all_bits_set<x86_cpuid_leaf1::ecx::osxsave>(); }
+
+        /**
+         * Effective processor family.
+         *
+         * Per wikipedia CPUID page:
+         * > The actual processor family is derived from the Family ID and Extended Family ID fields.
+         * > If the Family ID field is equal to 15, the family is equal to the sum of the Extended
+         * > Family ID and the Family ID fields. Otherwise, the family is equal to the value of the
+         * > Family ID field.
+         *
+         * @see https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits
+         */
+        inline detail::x86_reg32_t cpu_family() const noexcept
+        {
+            using eax = x86_cpuid_leaf1::eax;
+            auto family_id = leaf1().get_range<eax::family_id_start, eax::family_id_end>();
+            auto ext_family_id = leaf1().get_range<eax::ext_family_start, eax::ext_family_end>();
+            return family_id + (family_id == 15 ? ext_family_id : 0);
+        }
+
+        /**
+         * Effective processor model.
+         *
+         * Per wikipedia CPUID page:
+         * > The actual processor model is derived from the Model, Extended Model ID and Family ID
+         * > fields. If the Family ID field is either 6 or 15, the model is equal to the sum of the
+         * > Extended Model ID field shifted left by 4 bits and the Model field.
+         * > Otherwise, the model is equal to the value of the Model field.
+         *
+         * @see https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits
+         */
+        inline detail::x86_reg32_t cpu_model() const noexcept
+        {
+            using eax = x86_cpuid_leaf1::eax;
+            auto model = leaf1().get_range<eax::model_start, eax::model_end>();
+            auto ext_model = leaf1().get_range<eax::ext_model_start, eax::ext_model_end>();
+            auto family_id = leaf1().get_range<eax::family_id_start, eax::family_id_end>();
+            return (family_id == 15 || family_id == 6) ? ((ext_model << 4) + model) : model;
+        }
+
+        inline bool sse2() const noexcept { return sse_enabled() && leaf1().all_bits_set<x86_cpuid_leaf1::edx::sse2>(); }
+
+        inline bool sse3() const noexcept { return sse_enabled() && leaf1().all_bits_set<x86_cpuid_leaf1::ecx::sse3>(); }
+
+        inline bool ssse3() const noexcept { return sse_enabled() && leaf1().all_bits_set<x86_cpuid_leaf1::ecx::ssse3>(); }
+
+        inline bool sse4_1() const noexcept { return sse_enabled() && leaf1().all_bits_set<x86_cpuid_leaf1::ecx::sse4_1>(); }
+
+        inline bool sse4_2() const noexcept { return sse_enabled() && leaf1().all_bits_set<x86_cpuid_leaf1::ecx::sse4_2>(); }
+
+        inline bool popcnt() const noexcept { return leaf1().all_bits_set<x86_cpuid_leaf1::ecx::popcnt>(); }
+
+        // FMA3 only has VEX encodings, which fault unless the OS enabled the AVX state.
+        inline bool fma3() const noexcept { return avx_enabled() && leaf1().all_bits_set<x86_cpuid_leaf1::ecx::fma3>(); }
+
+        inline bool avx() const noexcept { return avx_enabled() && leaf1().all_bits_set<x86_cpuid_leaf1::ecx::avx>(); }
+
+        inline bool avx_128() const noexcept
+        {
+            // VEX-encoded instructions raise #UD unless XCR0[2:1] == 11b, even when they only
+            // operate on xmm registers, so the AVX state must be enabled (SSE alone is not enough).
+            return avx_enabled() && leaf1().all_bits_set<x86_cpuid_leaf1::ecx::avx>();
+        }
+
+        inline bool aes_ni() const noexcept { return sse_enabled() && leaf1().all_bits_set<x86_cpuid_leaf1::ecx::aes_ni>(); }
+
+        inline bool f16c() const noexcept { return avx_enabled() && leaf1().all_bits_set<x86_cpuid_leaf1::ecx::f16c>(); }
+
+        inline bool rdrnd() const noexcept { return leaf1().all_bits_set<x86_cpuid_leaf1::ecx::rdrnd>(); }
+
+        inline bool bmi1() const noexcept { return leaf7().all_bits_set<x86_cpuid_leaf7::ebx::bmi1>(); }
+
+        inline bool avx2() const noexcept { return avx_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ebx::avx2>(); }
+
+        inline bool avx2_128() const noexcept
+        {
+            // VEX-encoded instructions raise #UD unless XCR0[2:1] == 11b, even when they only
+            // operate on xmm registers, so the AVX state must be enabled (SSE alone is not enough).
+            return avx_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ebx::avx2>();
+        }
+
+        inline bool bmi2() const noexcept { return leaf7().all_bits_set<x86_cpuid_leaf7::ebx::bmi2>(); }
+
+        /**
+         * BMI2 support with efficient PEXT and PDEP instructions.
+         *
+         * > AMD processors before Zen 3 that implement PDEP and PEXT do so in microcode, with a
+         * > latency of 18 cycles rather than (Zen 3) 3 cycles. As a result it is often faster
+         * > to use other instructions on these processors.
+         *
+         * @see https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set
+         */
+        inline bool efficient_bmi2() const noexcept
+        {
+            if (known_manufacturer() == x86_manufacturer::amd)
+            {
+                // Zen 3 and Zen 4 report family 0x19; Zen 5 reports family 0x1A.
+                // Earlier AMD microarchitectures (Zen / Zen+ / Zen 2) report family 0x17.
+                return bmi2() && (cpu_family() >= 0x19);
+            }
+            return bmi2();
+        }
+
+        inline bool avx512f() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ebx::avx512f>(); }
+
+        inline bool avx512dq() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ebx::avx512dq>(); }
+
+        inline bool rdseed() const noexcept { return leaf7().all_bits_set<x86_cpuid_leaf7::ebx::rdseed>(); }
+
+        inline bool adx() const noexcept { return leaf7().all_bits_set<x86_cpuid_leaf7::ebx::adx>(); }
+
+        inline bool avx512ifma() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ebx::avx512ifma>(); }
+
+        inline bool avx512pf() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ebx::avx512pf>(); }
+
+        inline bool avx512er() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ebx::avx512er>(); }
+
+        inline bool avx512cd() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ebx::avx512cd>(); }
+
+        inline bool sha() const noexcept { return leaf7().all_bits_set<x86_cpuid_leaf7::ebx::sha>(); }
+
+        inline bool avx512bw() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ebx::avx512bw>(); }
+
+        inline bool avx512vl() const noexcept
+        {
+            return avx512_enabled()
+                && leaf7().all_bits_set<x86_cpuid_leaf7::ebx::avx512f, x86_cpuid_leaf7::ebx::avx512vl>();
+        }
+
+        inline bool avx512vl_128() const noexcept
+        {
+            // EVEX-encoded instructions need the full AVX512 state whatever the vector length,
+            // which avx512vl() checks. The SSE state check is implied and kept for explicitness.
+            return sse_enabled() && avx512vl();
+        }
+
+        inline bool avx512vl_256() const noexcept
+        {
+            // EVEX-encoded instructions need the full AVX512 state whatever the vector length,
+            // which avx512vl() checks. The AVX state check is implied and kept for explicitness.
+            return avx_enabled() && avx512vl();
+        }
+
+        inline bool avx512vbmi() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ecx::avx512vbmi>(); }
+
+        inline bool avx512vbmi2() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ecx::avx512vbmi2>(); }
+
+        inline bool gfni() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ecx::gfni>(); }
+
+        inline bool vaes() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ecx::vaes>(); }
+
+        inline bool vpclmulqdq() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ecx::vpclmulqdq>(); }
+
+        inline bool avx512vnni_bw() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ecx::avx512vnni_bw>(); }
+
+        inline bool avx512_bitalg() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ecx::avx512_bitalg>(); }
+
+        inline bool avx512_vpopcntdq() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::ecx::avx512_vpopcntdq>(); }
+
+        inline bool avx512_4vnniw() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::edx::avx512_4vnniw>(); }
+
+        inline bool avx512_4fmaps() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::edx::avx512_4fmaps>(); }
+
+        inline bool avx512_vp2intersect() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::edx::avx512_vp2intersect>(); }
+
+        inline bool avx512_fp16() const noexcept { return avx512_enabled() && leaf7().all_bits_set<x86_cpuid_leaf7::edx::avx512_fp16>(); }
+
+        inline bool avxvnni() const noexcept { return avx_enabled() && leaf7sub1().all_bits_set<x86_cpuid_leaf7sub1::eax::avxvnni>(); }
+
+        inline bool avx512_bf16() const noexcept { return avx512_enabled() && leaf7sub1().all_bits_set<x86_cpuid_leaf7sub1::eax::avx512_bf16>(); }
+
+        inline bool fma4() const noexcept { return avx_enabled() && leaf80000001().all_bits_set<x86_cpuid_leaf80000001::ecx::fma4>(); }
+    };
+
+    /********************
+     *  Implementation  *
+     ********************/
+
+    template <x86_cpu_features_backend_cpuid::status status_id, typename L>
+    inline auto const& x86_cpu_features_backend_cpuid::safe_read_leaf(L& leaf_cache) const
+    {
+        // Nothing to do when a previous call already settled this leaf: the status bit
+        // tells whether `leaf_cache` holds its final value.
+        if (!m_status.bit_is_set<status_id>())
+        {
+            // First leaf number of the extended range. Leaves below it are bounded by
+            // leaf 0, leaves from it onwards are bounded by leaf 0x80000000.
+            constexpr auto first_extended_leaf = x86_cpuid_leaf80000000::leaf;
+
+            // CPUID may only be queried for leaves within the advertised range.
+            // TODO(C++17): if constexpr
+            bool leaf_is_supported = false;
+            if (L::leaf < first_extended_leaf)
+            {
+                // Regular range: compare against the highest leaf reported by leaf 0
+                leaf_is_supported = (L::leaf <= leaf0().highest_leaf());
+            }
+            else
+            {
+                // Extended range: compare against the highest leaf reported by leaf 0x80000000
+                leaf_is_supported = (L::leaf <= leaf80000000().highest_leaf());
+            }
+
+            if (leaf_is_supported)
+            {
+                leaf_cache = L::read();
+            }
+
+            // The status bit is raised even when the read was skipped: the cache is then
+            // left untouched (zeros, i.e. all false, for a zero-initialized member).
+            m_status.set_bit<status_id>();
+        }
+
+        return leaf_cache;
+    }
+
+    inline x86_xcr0 const& x86_cpu_features_backend_cpuid::xcr0() const noexcept
+    {
+        if (!m_status.bit_is_set<status::xcr0_valid>()) // First call: nothing cached yet
+        {
+            m_xcr0 = osxsave() ? x86_xcr0::read() : x86_xcr0::safe_default(); // XCR0 is only read when OSXSAVE is reported
+            m_status.set_bit<status::xcr0_valid>(); // Later calls return the cached value
+        }
+        return m_xcr0;
+    }
+
+    inline x86_cpuid_leaf0 const& x86_cpu_features_backend_cpuid::leaf0() const
+    {
+        if (!m_status.bit_is_set<status::leaf0_valid>()) // First call: nothing cached yet
+        {
+            m_leaf0 = x86_cpuid_leaf0::read(); // Read unconditionally, no range check is performed here
+            m_status.set_bit<status::leaf0_valid>(); // Later calls return the cached value
+        }
+        return m_leaf0;
+    }
+
+    inline x86_cpuid_leaf80000000 const& x86_cpu_features_backend_cpuid::leaf80000000() const
+    {
+        if (!m_status.bit_is_set<status::leaf80000000_valid>()) // First call: nothing cached yet
+        {
+            m_leaf80000000 = x86_cpuid_leaf80000000::read(); // Read unconditionally, no range check is performed here
+            m_status.set_bit<status::leaf80000000_valid>(); // Later calls return the cached value
+        }
+        return m_leaf80000000;
+    }
+
+    inline x86_cpuid_leaf1 const& x86_cpu_features_backend_cpuid::leaf1() const
+    {
+        return safe_read_leaf<status::leaf1_valid>(m_leaf1); // Range-checked, lazily read and cached
+    }
+
+    inline x86_cpuid_leaf7 const& x86_cpu_features_backend_cpuid::leaf7() const
+    {
+        return safe_read_leaf<status::leaf7_valid>(m_leaf7); // Range-checked, lazily read and cached
+    }
+
+    inline x86_cpuid_leaf7sub1 const& x86_cpu_features_backend_cpuid::leaf7sub1() const
+    {
+        // Query the CPU only on the first call; later calls reuse the cached value.
+        if (!m_status.bit_is_set<status::leaf7sub1_valid>())
+        {
+            // Leaf 7 advertises its highest valid subleaf in a bit range of EAX: this
+            // subleaf may only be requested when it does not exceed that value.
+            constexpr auto first_bit = x86_cpuid_leaf7::eax::highest_subleaf_start;
+            constexpr auto last_bit = x86_cpuid_leaf7::eax::highest_subleaf_end;
+            const auto max_subleaf = leaf7().get_range<first_bit, last_bit>();
+            const bool subleaf_is_supported = (x86_cpuid_leaf7sub1::subleaf <= max_subleaf);
+            if (subleaf_is_supported)
+            {
+                m_leaf7sub1 = x86_cpuid_leaf7sub1::read();
+            }
+
+            // Flag the cache as settled whether or not the read happened: when skipped,
+            // the member keeps its zero-initialized value (all features false).
+            m_status.set_bit<status::leaf7sub1_valid>();
+        }
+        return m_leaf7sub1;
+    }
+
+    inline x86_cpuid_leaf80000001 const& x86_cpu_features_backend_cpuid::leaf80000001() const
+    {
+        return safe_read_leaf<status::leaf80000001_valid>(m_leaf80000001); // Range-checked, lazily read and cached
+    }
+
+    inline bool x86_cpu_features_backend_cpuid::osxsave() const noexcept
+    {
+        return leaf1().all_bits_set<x86_cpuid_leaf1::ecx::osxsave>(); // OSXSAVE flag in ECX of CPUID leaf 1
+    }
+
+    namespace detail
+    {
+#if XSIMD_TARGET_X86
+
+        inline cpuid_reg_t x86_cpuid(int leaf, int subleaf) noexcept
+        {
+            cpuid_reg_t reg = {}; // Stays all zeros when neither branch below is compiled in
+#if defined(_MSC_VER)
+            int buf[4]; // Filled by the intrinsic, then copied into `reg`
+            __cpuidex(buf, leaf, subleaf);
+            std::memcpy(reg.data(), buf, sizeof(buf));
+
+// Intel compiler has long had support for `__cpuid`, but only recently for `__cpuidex`.
+// Modern Clang and GCC also now support `__cpuidex`.
+// It was decided to keep the inline ASM version for maximum compatibility, as the difference
+// in ASM is negligible compared to the cost of CPUID.
+// https://github.com/xtensor-stack/xsimd/pull/1278
+#elif XSIMD_WITH_INLINE_ASM
+
+#if defined(__i386__) && defined(__PIC__)
+            // %ebx may be the PIC register: swap it with operand %1 around CPUID to restore it
+            __asm__("xchg{l}\t{%%}ebx, %1\n\t"
+                    "cpuid\n\t"
+                    "xchg{l}\t{%%}ebx, %1\n\t"
+                    : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
+                    : "0"(leaf), "2"(subleaf)); // leaf goes in EAX, subleaf in ECX
+
+#else
+            __asm__("cpuid\n\t"
+                    : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
+                    : "0"(leaf), "2"(subleaf)); // leaf goes in EAX, subleaf in ECX
+#endif // __i386__ && __PIC__
+#endif // _MSC_VER / XSIMD_WITH_INLINE_ASM
+            return reg; // Inline ASM path stores {EAX, EBX, ECX, EDX} in this order
+        }
+
+        inline x86_reg32_t x86_xcr0_low() noexcept
+        {
+#if defined(_MSC_VER)
+#if _MSC_VER >= 1400
+            return static_cast<x86_reg32_t>(_xgetbv(0));
+#else
+#error "_MSC_VER < 1400 is not supported"
+#endif
+#elif XSIMD_WITH_INLINE_ASM
+            x86_reg32_t xcr0 = {}; // Receives EAX, i.e. XCR0[31:0], after XGETBV with ECX = 0
+            __asm__("xorl %%ecx, %%ecx\n"
+                    "xgetbv\n"
+                    : "=a"(xcr0)
+                    : // No inputs: ECX is zeroed inline, EDX receives the discarded high half
+#if defined(__i386__)
+                    : "ecx", "edx"
+#else
+                    : "rcx", "rdx"
+#endif
+            );
+            return xcr0;
+#else // Mirror x86_cpuid: without a way to run XGETBV, report every bit as cleared
+            // instead of flowing off the end of a non-void function (undefined behavior).
+            return {};
+#endif
+        }
+
+#else // XSIMD_TARGET_X86
+
+        inline cpuid_reg_t x86_cpuid(int /* leaf */, int /* subleaf */) noexcept
+        {
+            return {}; // Non-x86 stub: value-initialized result, all bits to zero
+        }
+
+        inline x86_reg32_t x86_xcr0_low() noexcept
+        {
+            return {}; // Non-x86 stub: value-initialized result, all bits to zero
+        }
+
+#endif // XSIMD_TARGET_X86
+    }
+}
+#endif
diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp
index 1be4f018a..7466cd5f8 100644
--- a/include/xsimd/config/xsimd_cpuid.hpp
+++ b/include/xsimd/config/xsimd_cpuid.hpp
@@ -12,25 +12,9 @@
 #ifndef XSIMD_CPUID_HPP
 #define XSIMD_CPUID_HPP
 
-#include <algorithm>
-#include <cstring>
-
-#if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM) || defined(__riscv_vector))
-#include <asm/hwcap.h>
-#include <sys/auxv.h>
-
-#ifndef HWCAP2_I8MM
-#define HWCAP2_I8MM (1 << 13)
-#endif
-
-#endif
-
-#if defined(_MSC_VER)
-// Contains the definition of __cpuidex
-#include <intrin.h>
-#endif
-
 #include "../types/xsimd_all_registers.hpp"
+#include "./xsimd_cpu_features.hpp"
+#include "./xsimd_macros.hpp"
 
 namespace xsimd
 {
@@ -40,7 +24,7 @@ namespace xsimd
         {
 
 #define ARCH_FIELD_EX(arch, field_name) \
-    unsigned field_name;                \
+    unsigned field_name = 0;            \
     XSIMD_INLINE bool has(::xsimd::arch) const { return this->field_name; }
 
 #define ARCH_FIELD_EX_REUSE(arch, field_name) \
@@ -65,6 +49,7 @@ namespace xsimd
             ARCH_FIELD_EX(fma3<::xsimd::avx2>, fma3_avx2)
             ARCH_FIELD(avx512f)
             ARCH_FIELD(avx512cd)
+            ARCH_FIELD(avx512vl)
             ARCH_FIELD(avx512dq)
             ARCH_FIELD(avx512bw)
             ARCH_FIELD(avx512er)
@@ -77,183 +62,76 @@ namespace xsimd
             ARCH_FIELD(neon)
             ARCH_FIELD(neon64)
             ARCH_FIELD_EX(i8mm<::xsimd::neon64>, i8mm_neon64)
-            ARCH_FIELD_EX(detail::sve<512>, sve)
-            ARCH_FIELD_EX_REUSE(detail::sve<256>, sve)
-            ARCH_FIELD_EX_REUSE(detail::sve<128>, sve)
-            ARCH_FIELD_EX(detail::rvv<512>, rvv)
-            ARCH_FIELD_EX_REUSE(detail::rvv<256>, rvv)
-            ARCH_FIELD_EX_REUSE(detail::rvv<128>, rvv)
+            ARCH_FIELD_EX(detail::sve<512>, sve512)
+            ARCH_FIELD_EX(detail::sve<256>, sve256)
+            ARCH_FIELD_EX(detail::sve<128>, sve128)
+            ARCH_FIELD_EX(detail::rvv<512>, rvv512)
+            ARCH_FIELD_EX(detail::rvv<256>, rvv256)
+            ARCH_FIELD_EX(detail::rvv<128>, rvv128)
             ARCH_FIELD(wasm)
             ARCH_FIELD(vsx)
+            ARCH_FIELD(vxe)
 
 #undef ARCH_FIELD
 
             XSIMD_INLINE supported_arch() noexcept
             {
-                memset(this, 0, sizeof(supported_arch));
-
 #if XSIMD_WITH_WASM
                 wasm = 1;
 #endif
 
-#if XSIMD_WITH_VSX
-                vsx = 1;
-#endif
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-                neon = 1;
-                neon64 = 1;
-#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
-                i8mm_neon64 = bool(getauxval(AT_HWCAP2) & HWCAP2_I8MM);
-                sve = bool(getauxval(AT_HWCAP) & HWCAP_SVE);
-#endif
-
-#elif defined(__ARM_NEON) || defined(_M_ARM)
-
-#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
-                neon = bool(getauxval(AT_HWCAP) & HWCAP_NEON);
-#endif
-
-#elif defined(__riscv_vector) && defined(__riscv_v_fixed_vlen) && __riscv_v_fixed_vlen > 0
-
-#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
-#ifndef HWCAP_V
-#define HWCAP_V (1 << ('V' - 'A'))
-#endif
-                rvv = bool(getauxval(AT_HWCAP) & HWCAP_V);
-#endif
-
-#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)
-
-                auto get_xcr0_low = []() noexcept
-                {
-                    uint32_t xcr0;
-
-#if defined(_MSC_VER) && _MSC_VER >= 1400
-
-                    xcr0 = (uint32_t)_xgetbv(0);
-
-#elif defined(__GNUC__)
-
-                    __asm__(
-                        "xorl %%ecx, %%ecx\n"
-                        "xgetbv\n"
-                        : "=a"(xcr0)
-                        :
-#if defined(__i386__)
-                        : "ecx", "edx"
-#else
-                        : "rcx", "rdx"
-#endif
-                    );
-
-#else /* _MSC_VER < 1400 */
-#error "_MSC_VER < 1400 is not supported"
-#endif /* _MSC_VER && _MSC_VER >= 1400 */
-                    return xcr0;
-                };
-
-                auto get_cpuid = [](int reg[4], int level, int count = 0) noexcept
-                {
-
-#if defined(_MSC_VER)
-                    __cpuidex(reg, level, count);
-
-#elif defined(__INTEL_COMPILER)
-                    __cpuid(reg, level);
-
-#elif defined(__GNUC__) || defined(__clang__)
-
-#if defined(__i386__) && defined(__PIC__)
-                    // %ebx may be the PIC register
-                    __asm__("xchg{l}\t{%%}ebx, %1\n\t"
-                            "cpuid\n\t"
-                            "xchg{l}\t{%%}ebx, %1\n\t"
-                            : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
-                            : "0"(level), "2"(count));
-
-#else
-                    __asm__("cpuid\n\t"
-                            : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3])
-                            : "0"(level), "2"(count));
-#endif
-
-#else
-#error "Unsupported configuration"
-#endif
-                };
-
-                int regs1[4];
+                const auto cpu = xsimd::cpu_features();
 
-                get_cpuid(regs1, 0x1);
+                vxe = cpu.vxe();
 
-                // OS can explicitly disable the usage of SSE/AVX extensions
-                // by setting an appropriate flag in CR0 register
-                //
-                // https://docs.kernel.org/admin-guide/hw-vuln/gather_data_sampling.html
+                vsx = cpu.vsx();
 
-                unsigned sse_state_os_enabled = 1;
-                // AVX and AVX512 strictly require OSXSAVE to be enabled by the OS.
-                // If OSXSAVE is disabled (e.g., via bcdedit /set xsavedisable 1),
-                // AVX state won't be preserved across context switches, so AVX cannot be used.
-                unsigned avx_state_os_enabled = 0;
-                unsigned avx512_state_os_enabled = 0;
+                rvv128 = cpu.rvv() && (cpu.rvv_size_bytes() >= (128 / 8));
+                rvv256 = cpu.rvv() && (cpu.rvv_size_bytes() >= (256 / 8));
+                rvv512 = cpu.rvv() && (cpu.rvv_size_bytes() >= (512 / 8));
 
-                // OSXSAVE: A value of 1 indicates that the OS has set CR4.OSXSAVE[bit
-                // 18] to enable XSETBV/XGETBV instructions to access XCR0 and
-                // to support processor extended state management using
-                // XSAVE/XRSTOR.
-                bool osxsave = regs1[2] >> 27 & 1;
-                if (osxsave)
-                {
+                neon = cpu.neon();
+                neon64 = cpu.neon64();
+                i8mm_neon64 = cpu.neon64() && cpu.i8mm();
 
-                    uint32_t xcr0 = get_xcr0_low();
+                // Running SVE128 on a SVE256 machine is more tricky than the x86 equivalent
+                // of running SSE code on an AVX machine and requires to explicitly change the
+                // vector length using `prctl` (per thread setting).
+                // This is something we have not tested and not integrated in xsimd so the safe
+                // default is to assume only one valid SVE width at runtime.
+                sve128 = cpu.sve() && (cpu.sve_size_bytes() * 8 == 128);
+                sve256 = cpu.sve() && (cpu.sve_size_bytes() * 8 == 256);
+                sve512 = cpu.sve() && (cpu.sve_size_bytes() * 8 == 512);
 
-                    sse_state_os_enabled = xcr0 >> 1 & 1;
-                    avx_state_os_enabled = xcr0 >> 2 & sse_state_os_enabled;
-                    avx512_state_os_enabled = xcr0 >> 6 & avx_state_os_enabled;
-                }
+                sse2 = cpu.sse2();
+                sse3 = cpu.sse3();
+                ssse3 = cpu.ssse3();
+                sse4_1 = cpu.sse4_1();
+                sse4_2 = cpu.sse4_2();
+                fma3_sse42 = cpu.fma3();
 
-                sse2 = regs1[3] >> 26 & sse_state_os_enabled;
-                sse3 = regs1[2] >> 0 & sse_state_os_enabled;
-                ssse3 = regs1[2] >> 9 & sse_state_os_enabled;
-                sse4_1 = regs1[2] >> 19 & sse_state_os_enabled;
-                sse4_2 = regs1[2] >> 20 & sse_state_os_enabled;
-                fma3_sse42 = regs1[2] >> 12 & sse_state_os_enabled;
+                // sse4a not implemented in cpu_id yet
+                // xop not implemented in cpu_id yet
 
-                avx = regs1[2] >> 28 & avx_state_os_enabled;
+                avx = cpu.avx();
                 fma3_avx = avx && fma3_sse42;
-
-                int regs8[4];
-                get_cpuid(regs8, 0x80000001);
-                fma4 = regs8[2] >> 16 & avx_state_os_enabled;
-
-                // sse4a = regs[2] >> 6 & 1;
-
-                // xop = regs[2] >> 11 & 1;
-
-                int regs7[4];
-                get_cpuid(regs7, 0x7);
-                avx2 = regs7[1] >> 5 & avx_state_os_enabled;
-
-                int regs7a[4];
-                get_cpuid(regs7a, 0x7, 0x1);
-                avxvnni = regs7a[0] >> 4 & avx_state_os_enabled;
-
+                fma4 = cpu.fma4();
+                avx2 = cpu.avx2();
+                avxvnni = cpu.avxvnni();
                 fma3_avx2 = avx2 && fma3_sse42;
 
-                avx512f = regs7[1] >> 16 & avx512_state_os_enabled;
-                avx512cd = regs7[1] >> 28 & avx512_state_os_enabled;
-                avx512dq = regs7[1] >> 17 & avx512_state_os_enabled;
-                avx512bw = regs7[1] >> 30 & avx512_state_os_enabled;
-                avx512er = regs7[1] >> 27 & avx512_state_os_enabled;
-                avx512pf = regs7[1] >> 26 & avx512_state_os_enabled;
-                avx512ifma = regs7[1] >> 21 & avx512_state_os_enabled;
-                avx512vbmi = regs7[2] >> 1 & avx512_state_os_enabled;
-                avx512vbmi2 = regs7[2] >> 6 & avx512_state_os_enabled;
-                avx512vnni_bw = regs7[2] >> 11 & avx512_state_os_enabled;
+                avx512f = cpu.avx512f();
+                avx512cd = cpu.avx512cd();
+                avx512vl = cpu.avx512vl();
+                avx512dq = cpu.avx512dq();
+                avx512bw = cpu.avx512bw();
+                avx512er = cpu.avx512er();
+                avx512pf = cpu.avx512pf();
+                avx512ifma = cpu.avx512ifma();
+                avx512vbmi = cpu.avx512vbmi();
+                avx512vbmi2 = cpu.avx512vbmi2();
+                avx512vnni_bw = cpu.avx512vnni_bw();
                 avx512vnni_vbmi2 = avx512vbmi2 && avx512vnni_bw;
-#endif
             }
         };
     } // namespace detail
diff --git a/include/xsimd/config/xsimd_getauxval.hpp b/include/xsimd/config/xsimd_getauxval.hpp
new file mode 100644
index 000000000..0c9cff57e
--- /dev/null
+++ b/include/xsimd/config/xsimd_getauxval.hpp
@@ -0,0 +1,159 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ***************************************************************************/
+
+#ifndef XSIMD_GETAUXVAL_HPP
+#define XSIMD_GETAUXVAL_HPP
+
+#include "../utils/bits.hpp"
+#include "./xsimd_config.hpp"
+
+#if XSIMD_HAVE_LINUX_GETAUXVAL
+#include <sys/auxv.h> // getauxval
+#endif
+
+namespace xsimd
+{
+    namespace detail
+    {
+        using linux_getauxval_t = unsigned long;
+
+        inline linux_getauxval_t linux_getauxval(linux_getauxval_t type) noexcept;
+    }
+
+    /*
+     * Holds the value of a Linux auxiliary vector entry (e.g. AT_HWCAP).
+     *
+     * On Linux systems, the kernel exposes some CPU features through the
+     * auxiliary vector, which can be queried via `getauxval(AT_HWCAP)`.
+     * Well defined on all platforms, and always returns false on
+     * non-Linux platforms.
+     *
+     * Usage:
+     *   auto hwcap = linux_auxval::read(AT_HWCAP);
+     *   bool neon = hwcap.has_feature(HWCAP_NEON);
+     *
+     * @see https://www.kernel.org/doc/Documentation/arm64/elf_hwcaps.txt
+     */
+    class linux_auxval
+    {
+    private:
+        using getauxval_t = detail::linux_getauxval_t;
+
+    public:
+        constexpr linux_auxval() noexcept = default;
+
+        inline static linux_auxval read(getauxval_t type) noexcept
+        {
+            return linux_auxval(detail::linux_getauxval(type));
+        }
+
+        constexpr bool has_feature(getauxval_t feat) const noexcept
+        {
+            return (m_auxval & feat) == feat;
+        }
+
+    private:
+        getauxval_t m_auxval = {};
+
+        constexpr explicit linux_auxval(getauxval_t v) noexcept
+            : m_auxval(v)
+        {
+        }
+    };
+
+    class linux_hwcap_backend // caches the AT_HWCAP / AT_HWCAP2 auxiliary vector entries
+    {
+    public:
+        inline linux_auxval hwcap() const noexcept; // cached AT_HWCAP value
+        // Each accessor queries getauxval at most once (when available), then returns the cached value.
+        inline linux_auxval hwcap2() const noexcept; // cached AT_HWCAP2 value
+
+    private:
+        enum class status // flags tracked in m_status
+        {
+            hwcap_valid = 0, // set once hwcap() has run
+            hwcap2_valid = 1, // set once hwcap2() has run
+        };
+
+        using status_bitset = utils::uint_bitset<status, std::uint32_t>;
+        // mutable: the const accessors fill these caches lazily.
+        mutable status_bitset m_status {};
+        mutable xsimd::linux_auxval m_hwcap {};
+        mutable xsimd::linux_auxval m_hwcap2 {};
+    };
+
+    class linux_hwcap_backend_noop
+    {
+    public:
+        inline linux_auxval hwcap() const noexcept { return {}; }
+
+        inline linux_auxval hwcap2() const noexcept { return {}; }
+    };
+
+#if XSIMD_HAVE_LINUX_GETAUXVAL
+    using linux_hwcap_backend_default = linux_hwcap_backend;
+#else
+    // Contrary to CPUID that is only used on one architecture, HWCAP are
+    // available on multiple architectures with different meaning for the
+    // different bit fields.
+    // We use the Linux `HWCAP` constants directly to avoid repetition, so
+    // we could not use a default implementation without already being on
+    // Linux anyways.
+    struct linux_hwcap_backend_default
+    {
+    };
+#endif
+
+    /********************
+     *  Implementation  *
+     ********************/
+
+    namespace detail
+    {
+#if XSIMD_HAVE_LINUX_GETAUXVAL
+        inline linux_getauxval_t linux_getauxval(linux_getauxval_t type) noexcept
+        {
+            return getauxval(type);
+        }
+#else
+        inline linux_getauxval_t linux_getauxval(linux_getauxval_t) noexcept
+        {
+            return {}; // All bits set to 0
+        }
+#endif
+    }
+
+    inline linux_auxval linux_hwcap_backend::hwcap() const noexcept
+    {
+        // AT_HWCAP is looked up at most once; later calls return the cached copy.
+        if (m_status.bit_is_set<status::hwcap_valid>())
+            return m_hwcap;
+#if XSIMD_HAVE_LINUX_GETAUXVAL
+        m_hwcap = linux_auxval::read(AT_HWCAP);
+#endif
+        m_status.set_bit<status::hwcap_valid>();
+        return m_hwcap;
+    }
+
+    inline linux_auxval linux_hwcap_backend::hwcap2() const noexcept
+    {
+        // AT_HWCAP2 is looked up at most once; later calls return the cached copy.
+        if (m_status.bit_is_set<status::hwcap2_valid>())
+            return m_hwcap2;
+#if XSIMD_HAVE_LINUX_GETAUXVAL
+        m_hwcap2 = linux_auxval::read(AT_HWCAP2);
+#endif
+        m_status.set_bit<status::hwcap2_valid>();
+        return m_hwcap2;
+    }
+}
+
+#endif
diff --git a/include/xsimd/config/xsimd_macros.hpp b/include/xsimd/config/xsimd_macros.hpp
new file mode 100644
index 000000000..fb178db91
--- /dev/null
+++ b/include/xsimd/config/xsimd_macros.hpp
@@ -0,0 +1,78 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_MACROS_HPP
+#define XSIMD_MACROS_HPP
+
+#include "./xsimd_config.hpp"
+
+#if defined(__VEC__)
+#define XSIMD_INLINE inline
+#elif defined __has_attribute
+#if __has_attribute(always_inline)
+#define XSIMD_INLINE inline __attribute__((always_inline))
+#else
+#define XSIMD_INLINE inline
+#endif
+#elif defined(_MSC_VER)
+#define XSIMD_INLINE inline __forceinline
+#else
+#define XSIMD_INLINE inline
+#endif
+
+#define XSIMD_CONCAT_INNER(a, b) a##b
+#define XSIMD_CONCAT(a, b) XSIMD_CONCAT_INNER(a, b)
+
+#if defined(__FAST_MATH__)
+#define XSIMD_NO_DENORMALS
+#define XSIMD_NO_INFINITIES
+#define XSIMD_NO_NANS
+#endif
+
+#if defined(__has_cpp_attribute)
+// if this check passes, then the compiler supports feature test macros
+#if __has_cpp_attribute(nodiscard) >= 201603L
+// if this check passes, then the compiler supports [[nodiscard]] without a message
+#define XSIMD_NO_DISCARD [[nodiscard]]
+#endif
+#endif
+
+#if !defined(XSIMD_NO_DISCARD) && XSIMD_CPP_VERSION >= 201703L
+// this means that the previous tests failed, but we are using C++17 or higher
+#define XSIMD_NO_DISCARD [[nodiscard]]
+#endif
+
+#if !defined(XSIMD_NO_DISCARD) && (defined(__GNUC__) || defined(__clang__))
+// this means that the previous checks failed, but we are using GCC or Clang
+#define XSIMD_NO_DISCARD __attribute__((warn_unused_result))
+#endif
+
+#if !defined(XSIMD_NO_DISCARD)
+// this means that all the previous checks failed, so we fallback to doing nothing
+#define XSIMD_NO_DISCARD
+#endif
+
+#ifdef __cpp_if_constexpr
+// this means that the compiler supports the `if constexpr` construct
+#define XSIMD_IF_CONSTEXPR if constexpr
+#endif
+
+#if !defined(XSIMD_IF_CONSTEXPR) && XSIMD_CPP_VERSION >= 201703L
+// this means that the previous test failed, but we are using C++17 or higher
+#define XSIMD_IF_CONSTEXPR if constexpr
+#endif
+
+#if !defined(XSIMD_IF_CONSTEXPR)
+// this means that all the previous checks failed, so we fallback to a normal `if`
+#define XSIMD_IF_CONSTEXPR if
+#endif
+
+#endif
diff --git a/include/xsimd/memory/xsimd_aligned_allocator.hpp b/include/xsimd/memory/xsimd_aligned_allocator.hpp
index f6f446ac6..ee13e1dda 100644
--- a/include/xsimd/memory/xsimd_aligned_allocator.hpp
+++ b/include/xsimd/memory/xsimd_aligned_allocator.hpp
@@ -21,11 +21,11 @@
 #include <cstdlib>
 #endif
 
+#include "../config/xsimd_arch.hpp"
+
 #include <cassert>
 #include <memory>
 
-#include "../config/xsimd_arch.hpp"
-
 namespace xsimd
 {
 
diff --git a/include/xsimd/memory/xsimd_alignment.hpp b/include/xsimd/memory/xsimd_alignment.hpp
index 2d59ac1fc..30c2d16de 100644
--- a/include/xsimd/memory/xsimd_alignment.hpp
+++ b/include/xsimd/memory/xsimd_alignment.hpp
@@ -13,7 +13,7 @@
 #define XSIMD_ALIGNMENT_HPP
 
 #include "../types/xsimd_utils.hpp"
-#include "xsimd_aligned_allocator.hpp"
+#include "./xsimd_aligned_allocator.hpp"
 
 namespace xsimd
 {
@@ -33,6 +33,17 @@ namespace xsimd
     {
     };
 
+    /**
+     * @struct stream_mode
+     * @brief tag for load and store of aligned non-temporal memory.
+     *
+     * Streaming accesses expect aligned pointers. When no architecture-specific
+     * implementation is available, they fall back to aligned semantics.
+     */
+    struct stream_mode
+    {
+    };
+
     /***********************
      * Allocator alignment *
      ***********************/
diff --git a/include/xsimd/types/xsimd_all_registers.hpp b/include/xsimd/types/xsimd_all_registers.hpp
index 33f9b465d..eb058f9b7 100644
--- a/include/xsimd/types/xsimd_all_registers.hpp
+++ b/include/xsimd/types/xsimd_all_registers.hpp
@@ -9,47 +9,38 @@
  * The full license is in the file LICENSE, distributed with this software. *
  ****************************************************************************/
 
-#include "xsimd_fma3_sse_register.hpp"
-#include "xsimd_fma4_register.hpp"
-#include "xsimd_sse2_register.hpp"
-#include "xsimd_sse3_register.hpp"
-#include "xsimd_sse4_1_register.hpp"
-#include "xsimd_sse4_2_register.hpp"
-
-#include "xsimd_avx2_register.hpp"
-#include "xsimd_avx_register.hpp"
-#include "xsimd_avxvnni_register.hpp"
-#include "xsimd_fma3_avx2_register.hpp"
-#include "xsimd_fma3_avx_register.hpp"
-
-#include "xsimd_avx512vnni_avx512bw_register.hpp"
-#include "xsimd_avx512vnni_avx512vbmi2_register.hpp"
-
-#include "xsimd_avx512ifma_register.hpp"
-#include "xsimd_avx512vbmi2_register.hpp"
-#include "xsimd_avx512vbmi_register.hpp"
-
-#include "xsimd_avx512er_register.hpp"
-#include "xsimd_avx512pf_register.hpp"
-
-#include "xsimd_avx512bw_register.hpp"
-#include "xsimd_avx512cd_register.hpp"
-#include "xsimd_avx512dq_register.hpp"
-#include "xsimd_avx512f_register.hpp"
-
-#include "xsimd_i8mm_neon64_register.hpp"
-
-#include "xsimd_neon64_register.hpp"
-#include "xsimd_neon_register.hpp"
-
-#include "xsimd_sve_register.hpp"
-
-#include "xsimd_rvv_register.hpp"
-
-#include "xsimd_wasm_register.hpp"
-
-#include "xsimd_vsx_register.hpp"
+#include "./xsimd_avx2_register.hpp"
+#include "./xsimd_avx512bw_register.hpp"
+#include "./xsimd_avx512cd_register.hpp"
+#include "./xsimd_avx512dq_register.hpp"
+#include "./xsimd_avx512er_register.hpp"
+#include "./xsimd_avx512f_register.hpp"
+#include "./xsimd_avx512ifma_register.hpp"
+#include "./xsimd_avx512pf_register.hpp"
+#include "./xsimd_avx512vbmi2_register.hpp"
+#include "./xsimd_avx512vbmi_register.hpp"
+#include "./xsimd_avx512vl_register.hpp"
+#include "./xsimd_avx512vnni_avx512bw_register.hpp"
+#include "./xsimd_avx512vnni_avx512vbmi2_register.hpp"
+#include "./xsimd_avx_register.hpp"
+#include "./xsimd_avxvnni_register.hpp"
+#include "./xsimd_fma3_avx2_register.hpp"
+#include "./xsimd_fma3_avx_register.hpp"
+#include "./xsimd_fma3_sse_register.hpp"
+#include "./xsimd_fma4_register.hpp"
+#include "./xsimd_i8mm_neon64_register.hpp"
+#include "./xsimd_neon64_register.hpp"
+#include "./xsimd_neon_register.hpp"
+#include "./xsimd_rvv_register.hpp"
+#include "./xsimd_sse2_register.hpp"
+#include "./xsimd_sse3_register.hpp"
+#include "./xsimd_sse4_1_register.hpp"
+#include "./xsimd_sse4_2_register.hpp"
+#include "./xsimd_sve_register.hpp"
+#include "./xsimd_vsx_register.hpp"
+#include "./xsimd_vxe_register.hpp"
+#include "./xsimd_wasm_register.hpp"
 
 #if XSIMD_WITH_EMULATED
-#include "xsimd_emulated_register.hpp"
+#include "./xsimd_emulated_register.hpp"
 #endif
diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp
index aa64df4da..589c4cf50 100644
--- a/include/xsimd/types/xsimd_api.hpp
+++ b/include/xsimd/types/xsimd_api.hpp
@@ -12,14 +12,16 @@
 #ifndef XSIMD_API_HPP
 #define XSIMD_API_HPP
 
+#include "../arch/xsimd_isa.hpp"
+#include "../types/xsimd_batch.hpp"
+#include "../types/xsimd_traits.hpp"
+#include "../utils/xsimd_type_traits.hpp"
+
 #include <complex>
 #include <cstddef>
 #include <limits>
 #include <ostream>
-
-#include "../arch/xsimd_isa.hpp"
-#include "../types/xsimd_batch.hpp"
-#include "../types/xsimd_traits.hpp"
+#include <utility>
 
 namespace xsimd
 {
@@ -353,6 +355,43 @@ namespace xsimd
         return kernel::bitwise_cast<A>(x, batch<T_out, A> {}, A {});
     }
 
+    namespace detail
+    {
+        // Detection for kernel overloads accepting ``batch_constant`` in ``bitwise_lshift``
+        // directly (or in a parent register function).
+        // The ``batch_constant`` overload is a rare but useful optimization.
+        // Running the detection here is less error prone than adding a fallback to every
+        // architecture.
+
+        template <class Arch, class Batch, class BatchConstant, class = void>
+        struct has_bitwise_lshift_batch_const : std::false_type
+        {
+        };
+
+        template <class Arch, class Batch, class BatchConstant>
+        struct has_bitwise_lshift_batch_const<
+            Arch, Batch, BatchConstant,
+            void_t<decltype(kernel::bitwise_lshift<Arch>(
+                std::declval<Batch>(), std::declval<BatchConstant>(), Arch {}))>>
+            : std::true_type
+        {
+        };
+
+        template <class Arch, class T, T... Values>
+        XSIMD_INLINE batch<T, Arch> bitwise_lshift_batch_const(batch<T, Arch> const& x, batch_constant<T, Arch, Values...> shift, std::true_type) noexcept
+        {
+            // Optimized ``batch_constant`` implementation
+            return kernel::bitwise_lshift<Arch>(x, shift, Arch {});
+        }
+
+        template <class Arch, class T, T... Values>
+        XSIMD_INLINE batch<T, Arch> bitwise_lshift_batch_const(batch<T, Arch> const& x, batch_constant<T, Arch, Values...> shift, std::false_type) noexcept
+        {
+            // Fallback to regular run-time implementation
+            return kernel::bitwise_lshift<Arch>(x, shift.as_batch(), Arch {});
+        }
+    }
+
     /**
      * @ingroup batch_bitwise
      *
@@ -367,17 +406,24 @@ namespace xsimd
         detail::static_check_supported_config<T, A>();
         return kernel::bitwise_lshift<A>(x, shift, A {});
     }
+    template <size_t shift, class T, class A>
+    XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_lshift<shift, A>(x, A {});
+    }
     template <class T, class A>
     XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, batch<T, A> const& shift) noexcept
     {
         detail::static_check_supported_config<T, A>();
         return kernel::bitwise_lshift<A>(x, shift, A {});
     }
-    template <size_t shift, class T, class A>
-    XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x) noexcept
+    template <class T, class A, T... Values>
+    XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& x, batch_constant<T, A, Values...> shift) noexcept
     {
         detail::static_check_supported_config<T, A>();
-        return kernel::bitwise_lshift<shift, A>(x, A {});
+        using has_batch_const_impl = detail::has_bitwise_lshift_batch_const<A, decltype(x), decltype(shift)>;
+        return detail::bitwise_lshift_batch_const<A>(x, shift, has_batch_const_impl {});
     }
 
     /**
@@ -1065,6 +1111,37 @@ namespace xsimd
         return x > y;
     }
 
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Extract the scalar element at compile-time index \c I from batch \c b.
+     * @param b the batch to extract from.
+     * @return the scalar element at index \c I.
+     */
+    template <size_t I, class T, class A>
+    XSIMD_INLINE T get(batch<T, A> const& b) noexcept
+    {
+        static_assert(I < batch<T, A>::size, "index out of bounds");
+        detail::static_check_supported_config<T, A>();
+        return kernel::get(b, index<I> {}, A {});
+    }
+
+    template <size_t I, class T, class A>
+    XSIMD_INLINE bool get(batch_bool<T, A> const& b) noexcept
+    {
+        static_assert(I < batch_bool<T, A>::size, "index out of bounds");
+        detail::static_check_supported_config<T, A>();
+        return kernel::get(b, index<I> {}, A {});
+    }
+
+    template <size_t I, class T, class A>
+    XSIMD_INLINE typename batch<std::complex<T>, A>::value_type get(batch<std::complex<T>, A> const& b) noexcept
+    {
+        static_assert(I < batch<std::complex<T>, A>::size, "index out of bounds");
+        detail::static_check_supported_config<T, A>();
+        return kernel::get(b, index<I> {}, A {});
+    }
+
     /**
      * @ingroup batch_reducers
      *
@@ -1334,6 +1411,30 @@ namespace xsimd
         return kernel::load_complex_aligned<A>(ptr, kernel::convert<batch_value_type> {}, A {});
     }
 
+    template <class To, class A = default_arch, class From>
+    XSIMD_INLINE simd_return_type<From, To, A> load_as(From const* ptr, stream_mode) noexcept
+    {
+        using batch_value_type = typename simd_return_type<From, To, A>::value_type;
+        detail::static_check_supported_config<From, A>();
+        detail::static_check_supported_config<To, A>();
+        return kernel::load_stream<A>(ptr, kernel::convert<batch_value_type> {}, A {});
+    }
+
+    template <class To, class A = default_arch>
+    XSIMD_INLINE simd_return_type<bool, To, A> load_as(bool const* ptr, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        return simd_return_type<bool, To, A>::load_stream(ptr);
+    }
+
+    template <class To, class A = default_arch, class From>
+    XSIMD_INLINE simd_return_type<std::complex<From>, To, A> load_as(std::complex<From> const* ptr, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        using batch_value_type = typename simd_return_type<std::complex<From>, To, A>::value_type;
+        return kernel::load_complex_stream<A>(ptr, kernel::convert<batch_value_type> {}, A {});
+    }
+
 #ifdef XSIMD_ENABLE_XTL_COMPLEX
     template <class To, class A = default_arch, class From, bool i3ec>
     XSIMD_INLINE simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, aligned_mode) noexcept
@@ -1342,6 +1443,14 @@ namespace xsimd
         detail::static_check_supported_config<From, A>();
         return load_as<To>(reinterpret_cast<std::complex<From> const*>(ptr), aligned_mode());
     }
+
+    template <class To, class A = default_arch, class From, bool i3ec>
+    XSIMD_INLINE simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, stream_mode) noexcept
+    { // Streaming load of xtl::xcomplex data: forwards to the std::complex<From> overload through a pointer cast.
+        detail::static_check_supported_config<To, A>();
+        detail::static_check_supported_config<From, A>();
+        // Forward A explicitly: a bare load_as<To> would fall back to default_arch and ignore the caller's architecture.
+        return load_as<To, A>(reinterpret_cast<std::complex<From> const*>(ptr), stream_mode());
 #endif
 
     /**
@@ -1416,6 +1525,13 @@ namespace xsimd
         return load_as<From, A>(ptr, unaligned_mode {});
     }
 
+    template <class A = default_arch, class From>
+    XSIMD_INLINE batch<From, A> load(From const* ptr, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        return load_as<From, A>(ptr, stream_mode {});
+    }
+
     /**
      * @ingroup batch_data_transfer
      *
@@ -1629,6 +1745,54 @@ namespace xsimd
         return x * y;
     }
 
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the low N bits of the 2N-bit lane-wise product of \c x and \c y.
+     * Equivalent to ``mul(x, y)``; the low half is identical for signed and unsigned.
+     * @param x batch involved in the product.
+     * @param y batch involved in the product.
+     * @return the low N bits of the product, lane-wise.
+     */
+    template <class T, class A, class = std::enable_if_t<std::is_integral<T>::value>>
+    XSIMD_INLINE batch<T, A> mul_lo(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return x * y;
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the high N bits of the 2N-bit lane-wise product of \c x and \c y.
+     * The signedness of \c T selects the signed or unsigned high half.
+     * @param x batch involved in the product.
+     * @param y batch involved in the product.
+     * @return the high N bits of the product, lane-wise.
+     */
+    template <class T, class A, class = std::enable_if_t<std::is_integral<T>::value>>
+    XSIMD_INLINE batch<T, A> mul_hi(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::mul_hi<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the full 2N-bit lane-wise product of \c x and \c y as ``{hi, lo}``.
+     * @param x batch involved in the product.
+     * @param y batch involved in the product.
+     * @return pair of batches ``{hi, lo}``.
+     */
+    template <class T, class A, class = std::enable_if_t<std::is_integral<T>::value>>
+    XSIMD_INLINE std::pair<batch<T, A>, batch<T, A>>
+    mul_hilo(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::mul_hilo<A>(x, y, A {});
+    }
+
     /**
      * @ingroup batch_rounding
      *
@@ -2420,12 +2584,40 @@ namespace xsimd
         kernel::store_complex_aligned<A>(dst, src, A {});
     }
 
+    template <class To, class A = default_arch, class From>
+    XSIMD_INLINE void store_as(To* dst, batch<From, A> const& src, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        kernel::store_stream<A>(dst, src, A {});
+    }
+
+    template <class A = default_arch, class From>
+    XSIMD_INLINE void store_as(bool* dst, batch_bool<From, A> const& src, stream_mode) noexcept
+    { // stream_mode overload for boolean batches: checks the (From, A) configuration, then forwards to kernel::store_stream.
+        detail::static_check_supported_config<From, A>();
+        kernel::store_stream<A>(src, dst, A {}); // NOTE(review): (src, dst) order, unlike the (dst, src) batch overloads -- confirm a matching kernel overload exists
+    }
+
+    template <class To, class A = default_arch, class From>
+    XSIMD_INLINE void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<std::complex<From>, A>();
+        kernel::store_complex_stream<A>(dst, src, A {});
+    }
+
 #ifdef XSIMD_ENABLE_XTL_COMPLEX
     template <class To, class A = default_arch, class From, bool i3ec>
     XSIMD_INLINE void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
     {
         store_as(reinterpret_cast<std::complex<To>*>(dst), src, aligned_mode());
     }
+
+    template <class To, class A = default_arch, class From, bool i3ec>
+    XSIMD_INLINE void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, stream_mode) noexcept
+    {
+        detail::static_check_supported_config<std::complex<From>, A>();
+        store_as(reinterpret_cast<std::complex<To>*>(dst), src, stream_mode());
+    }
 #endif
 
     /**
@@ -2494,6 +2686,12 @@ namespace xsimd
         store_as<T, A>(mem, val, unaligned_mode {});
     }
 
+    template <class A, class T>
+    XSIMD_INLINE void store(T* mem, batch<T, A> const& val, stream_mode) noexcept
+    {
+        store_as<T, A>(mem, val, stream_mode {});
+    }
+
     /**
      * @ingroup batch_data_transfer
      *
@@ -2840,6 +3038,62 @@ namespace xsimd
         return !xsimd::any(x);
     }
 
+    /**
+     * @ingroup batch_bool_reducers
+     *
+     * Return the number of leading `false` values in the batch.
+     * @param x the batch to reduce.
+     * @return an integer scalar.
+     */
+    template <class T, class A>
+    XSIMD_INLINE size_t countl_zero(batch_bool<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::countl_zero<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_bool_reducers
+     *
+     * Return the number of leading `true` values in the batch.
+     * @param x the batch to reduce.
+     * @return an integer scalar.
+     */
+    template <class T, class A>
+    XSIMD_INLINE size_t countl_one(batch_bool<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::countl_one<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_bool_reducers
+     *
+     * Return the number of trailing `false` values in the batch.
+     * @param x the batch to reduce.
+     * @return an integer scalar.
+     */
+    template <class T, class A>
+    XSIMD_INLINE size_t countr_zero(batch_bool<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::countr_zero<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_bool_reducers
+     *
+     * Return the number of trailing `true` values in the batch.
+     * @param x the batch to reduce.
+     * @return an integer scalar.
+     */
+    template <class T, class A>
+    XSIMD_INLINE size_t countr_one(batch_bool<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::countr_one<A>(x, A {});
+    }
+
     /**
      * @ingroup batch_conversion
      *
diff --git a/include/xsimd/types/xsimd_avx2_register.hpp b/include/xsimd/types/xsimd_avx2_register.hpp
index a53132b94..5f393343d 100644
--- a/include/xsimd/types/xsimd_avx2_register.hpp
+++ b/include/xsimd/types/xsimd_avx2_register.hpp
@@ -28,6 +28,18 @@ namespace xsimd
         static constexpr char const* name() noexcept { return "avx2"; }
     };
 
+    /**
+     * @ingroup architectures
+     *
+     * AVX2 instructions extension for 128 bits registers
+     */
+    struct avx2_128 : avx_128
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX2; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr char const* name() noexcept { return "avx2/128"; }
+    };
+
 #if XSIMD_WITH_AVX2
 
 #if !XSIMD_WITH_AVX
@@ -37,6 +49,7 @@ namespace xsimd
     namespace types
     {
         XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx2, avx);
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx2_128, avx_128);
     }
 #endif
 }
diff --git a/include/xsimd/types/xsimd_avx512f_register.hpp b/include/xsimd/types/xsimd_avx512f_register.hpp
index 279ae4caa..c54161209 100644
--- a/include/xsimd/types/xsimd_avx512f_register.hpp
+++ b/include/xsimd/types/xsimd_avx512f_register.hpp
@@ -13,6 +13,7 @@
 #define XSIMD_AVX512F_REGISTER_HPP
 
 #include "./xsimd_common_arch.hpp"
+#include "./xsimd_fma3_avx2_register.hpp"
 
 namespace xsimd
 {
@@ -69,7 +70,6 @@ namespace xsimd
         XSIMD_DECLARE_SIMD_REGISTER(long long int, avx512f, __m512i);
         XSIMD_DECLARE_SIMD_REGISTER(float, avx512f, __m512);
         XSIMD_DECLARE_SIMD_REGISTER(double, avx512f, __m512d);
-
     }
 #endif
 }
diff --git a/include/xsimd/types/xsimd_avx512vl_register.hpp b/include/xsimd/types/xsimd_avx512vl_register.hpp
new file mode 100644
index 000000000..c73c2a963
--- /dev/null
+++ b/include/xsimd/types/xsimd_avx512vl_register.hpp
@@ -0,0 +1,88 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512VL_REGISTER_HPP
+#define XSIMD_AVX512VL_REGISTER_HPP
+
+#include "./xsimd_avx512cd_register.hpp"
+
+namespace xsimd
+{
+
+    /**
+     * @ingroup architectures
+     *
+     * AVX512VL instructions
+     */
+    struct avx512vl : avx512cd
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VL; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr char const* name() noexcept { return "avx512vl"; }
+    };
+
+    /**
+     * @ingroup architectures
+     *
+     * AVX512VL instructions extension for 128 bits registers
+     */
+    struct avx512vl_128 : avx2_128
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VL; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr char const* name() noexcept { return "avx512vl/128"; }
+    };
+
+    /**
+     * @ingroup architectures
+     *
+     * AVX512VL instructions extension for 256 bits registers
+     */
+    struct avx512vl_256 : fma3<avx2>
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VL; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr char const* name() noexcept { return "avx512vl/256"; }
+    };
+
+#if XSIMD_WITH_AVX512VL
+
+#if !XSIMD_WITH_AVX512CD
+#error "architecture inconsistency: avx512vl requires avx512cd"
+#endif
+
+    namespace types
+    {
+        template <class T>
+        struct get_bool_simd_register<T, avx512vl>
+        {
+            using type = simd_avx512_bool_register<T>;
+        };
+
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vl, avx512cd);
+
+        template <class T>
+        struct get_bool_simd_register<T, avx512vl_128>
+        {
+            using type = simd_avx512_bool_register<T>;
+        };
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vl_128, avx2_128);
+
+        template <class T>
+        struct get_bool_simd_register<T, avx512vl_256>
+        {
+            using type = simd_avx512_bool_register<T>;
+        };
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vl_256, avx2);
+    }
+#endif
+}
+#endif
diff --git a/include/xsimd/types/xsimd_avx_register.hpp b/include/xsimd/types/xsimd_avx_register.hpp
index 47997ee76..515b60901 100644
--- a/include/xsimd/types/xsimd_avx_register.hpp
+++ b/include/xsimd/types/xsimd_avx_register.hpp
@@ -13,6 +13,7 @@
 #define XSIMD_AVX_REGISTER_HPP
 
 #include "./xsimd_common_arch.hpp"
+#include "./xsimd_sse4_2_register.hpp"
 
 namespace xsimd
 {
@@ -30,6 +31,18 @@ namespace xsimd
         static constexpr bool requires_alignment() noexcept { return true; }
         static constexpr char const* name() noexcept { return "avx"; }
     };
+
+    /**
+     * @ingroup architectures
+     *
+     * AVX instructions extension for 128 bits registers
+     */
+    struct avx_128 : sse4_2
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr char const* name() noexcept { return "avx/128"; }
+    };
 }
 
 #if XSIMD_WITH_AVX
@@ -58,6 +71,8 @@ namespace xsimd
         XSIMD_DECLARE_SIMD_REGISTER(long long int, avx, __m256i);
         XSIMD_DECLARE_SIMD_REGISTER(float, avx, __m256);
         XSIMD_DECLARE_SIMD_REGISTER(double, avx, __m256d);
+
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx_128, sse4_2);
     }
 }
 #endif
diff --git a/include/xsimd/types/xsimd_batch.hpp b/include/xsimd/types/xsimd_batch.hpp
index 5ff525a11..970483150 100644
--- a/include/xsimd/types/xsimd_batch.hpp
+++ b/include/xsimd/types/xsimd_batch.hpp
@@ -12,20 +12,18 @@
 #ifndef XSIMD_BATCH_HPP
 #define XSIMD_BATCH_HPP
 
-#include <cassert>
-#include <complex>
-
 #include "../config/xsimd_arch.hpp"
+#include "../config/xsimd_config.hpp"
+#include "../config/xsimd_macros.hpp"
 #include "../memory/xsimd_alignment.hpp"
+#include "./xsimd_batch_fwd.hpp"
 #include "./xsimd_utils.hpp"
 
+#include <cassert>
+#include <complex>
+
 namespace xsimd
 {
-    template <typename T, class A, bool... Values>
-    struct batch_bool_constant;
-    template <class T, class A = default_arch>
-    class batch;
-
     namespace types
     {
         template <class T, class A>
@@ -132,6 +130,27 @@ namespace xsimd
         XSIMD_INLINE explicit batch(batch_bool_type const& b) noexcept;
         XSIMD_INLINE batch(register_type reg) noexcept;
 
+        /* Re-expose the conversion to register_type at the most-derived
+         * level. Some compilers fail to invoke the conversion inherited from
+         * types::simd_register when a batch is fed to an intrinsic defined as
+         * a macro (e.g. certain GCC shift/mul_lo immediate intrinsics), as the
+         * textual C-style cast inside the macro does not traverse the alias
+         * inheritance chain.
+         *
+         * NOTE: this has to be a redefined member, not a using-declaration of
+         * `simd_register<T, A>::operator register_type`. A using-declaration is
+         * evaluated at class-template instantiation, but `simd_register<T, A>`
+         * is only specialised (and therefore only carries `operator
+         * register_type`) for *supported* (T, A) pairs — for unsupported
+         * pairs the generic `simd_register` is empty and a using-declaration
+         * would fail to compile. A redefined member is only instantiated when
+         * actually called, which keeps unsupported batches well-formed up to
+         * the point a user tries to use them. */
+        XSIMD_INLINE operator register_type() const noexcept
+        {
+            return this->data; // presumably the register member inherited from the simd_register base
+        }
+
         template <class U>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch broadcast(U val) noexcept;
 
@@ -144,6 +163,8 @@ namespace xsimd
         XSIMD_INLINE void store(U* mem, aligned_mode) const noexcept;
         template <class U>
         XSIMD_INLINE void store(U* mem, unaligned_mode) const noexcept;
+        template <class U>
+        XSIMD_INLINE void store(U* mem, stream_mode) const noexcept;
 
         // Compile-time mask overloads
         template <class U, bool... Values, class Mode = aligned_mode>
@@ -160,6 +181,8 @@ namespace xsimd
         // Compile-time mask overloads
         template <class U, bool... Values, class Mode = aligned_mode>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, batch_bool_constant<T, A, Values...> mask, Mode = {}) noexcept;
+        template <class U>
+        XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept;
 
         template <class U, class V>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch gather(U const* src, batch<V, arch_type> const& index) noexcept;
@@ -282,7 +305,7 @@ namespace xsimd
         XSIMD_INLINE batch logical_or(batch const& other) const noexcept;
     };
 
-#if __cplusplus < 201703L
+#if XSIMD_CPP_VERSION < 201703L
     template <class T, class A>
     constexpr std::size_t batch<T, A>::size;
 #endif
@@ -296,7 +319,7 @@ namespace xsimd
      * @tparam T the type of the predicated values.
      * @tparam A the architecture this batch is tied too.
      **/
-    template <class T, class A = default_arch>
+    template <class T, class A>
     class batch_bool : public types::get_bool_simd_register_t<T, A>
     {
         using base_type = types::get_bool_simd_register_t<T, A>;
@@ -323,8 +346,10 @@ namespace xsimd
         // memory operators
         XSIMD_INLINE void store_aligned(bool* mem) const noexcept;
         XSIMD_INLINE void store_unaligned(bool* mem) const noexcept;
+        XSIMD_INLINE void store_stream(bool* mem) const noexcept;
         XSIMD_NO_DISCARD static XSIMD_INLINE batch_bool load_aligned(bool const* mem) noexcept;
         XSIMD_NO_DISCARD static XSIMD_INLINE batch_bool load_unaligned(bool const* mem) noexcept;
+        XSIMD_NO_DISCARD static XSIMD_INLINE batch_bool load_stream(bool const* mem) noexcept;
 
         XSIMD_INLINE bool get(std::size_t i) const noexcept;
 
@@ -360,7 +385,7 @@ namespace xsimd
         static XSIMD_INLINE register_type make_register(std::index_sequence<>, V... v) noexcept;
     };
 
-#if __cplusplus < 201703L
+#if XSIMD_CPP_VERSION < 201703L
     template <class T, class A>
     constexpr std::size_t batch_bool<T, A>::size;
 #endif
@@ -417,12 +442,16 @@ namespace xsimd
         template <class U, bool... Values, class Mode = aligned_mode>
         XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, batch_bool_constant<value_type, A, Values...> mask, Mode = {}) noexcept;
         template <class U>
+        XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept;
+        template <class U>
         XSIMD_INLINE void store(U* mem, aligned_mode) const noexcept;
         template <class U>
         XSIMD_INLINE void store(U* mem, unaligned_mode) const noexcept;
         // Compile-time mask overloads
         template <class U, bool... Values, class Mode = aligned_mode>
         XSIMD_INLINE void store(U* mem, batch_bool_constant<value_type, A, Values...> mask, Mode = {}) const noexcept;
+        template <class U>
+        XSIMD_INLINE void store(U* mem, stream_mode) const noexcept;
 
         XSIMD_INLINE real_batch real() const noexcept;
         XSIMD_INLINE real_batch imag() const noexcept;
@@ -502,7 +531,7 @@ namespace xsimd
         real_batch m_imag;
     };
 
-#if __cplusplus < 201703L
+#if XSIMD_CPP_VERSION < 201703L
     template <class T, class A>
     constexpr std::size_t batch<std::complex<T>, A>::size;
 #endif
@@ -634,6 +663,16 @@ namespace xsimd
 
     // masked store free functions are provided in xsimd_api.hpp
 
+    template <class T, class A>
+    template <class U>
+    XSIMD_INLINE void batch<T, A>::store(U* mem, stream_mode) const noexcept
+    { // stream_mode store of this batch to mem; mem must be a multiple of A::alignment()
+        detail::static_check_supported_config<T, A>(); // presumably a compile-time check of the (T, A) pair
+        assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0)
+               && "store location is not properly aligned"); // assert: active only when NDEBUG is not defined
+        kernel::store_stream<A>(mem, *this, A {});
+    }
+
     /**
      * Loading from aligned memory. May involve a conversion if \c U is different
      * from \c T.
@@ -728,6 +767,16 @@ namespace xsimd
         }
     }
 
+    template <class T, class A>
+    template <class U>
+    XSIMD_INLINE batch<T, A> batch<T, A>::load(U const* mem, stream_mode) noexcept
+    { // stream_mode load from mem; mem must be a multiple of A::alignment()
+        detail::static_check_supported_config<T, A>(); // presumably a compile-time check of the (T, A) pair
+        assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0)
+               && "loaded pointer is not properly aligned"); // assert: active only when NDEBUG is not defined
+        return kernel::load_stream<A>(mem, kernel::convert<T> {}, A {}); // convert<T> names T as the requested element type
+    }
+
     /**
      * Create a new batch gathering elements starting at address \c src and
      * offset by each element in \c index.
@@ -1051,6 +1100,12 @@ namespace xsimd
         store_aligned(mem);
     }
 
+    template <class T, class A>
+    XSIMD_INLINE void batch_bool<T, A>::store_stream(bool* mem) const noexcept
+    { // no alignment assert here, unlike batch<T, A>::store(U*, stream_mode)
+        kernel::store_stream<A>(*this, mem, A {}); // batch first, then destination: reversed w.r.t. the batch<T, A> call
+    }
+
     template <class T, class A>
     XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::load_aligned(bool const* mem) noexcept
     {
@@ -1063,6 +1118,12 @@ namespace xsimd
         return kernel::load_unaligned<A>(mem, batch_bool<T, A>(), A {});
     }
 
+    template <class T, class A>
+    XSIMD_INLINE batch_bool<T, A> batch_bool<T, A>::load_stream(bool const* mem) noexcept
+    { // no alignment assert here, unlike batch<T, A>::load(U const*, stream_mode)
+        return kernel::load_stream<A>(mem, batch_bool<T, A>(), A {}); // the batch_bool argument presumably only selects the overload
+    }
+
     /**
      * Extract a scalar mask representation from this @c batch_bool.
      *
@@ -1327,6 +1388,16 @@ namespace xsimd
         return kernel::load_masked<A>(mem, mask, kernel::convert<value_type> {}, mode, A {});
     }
 
+    template <class T, class A>
+    template <class U>
+    XSIMD_INLINE batch<std::complex<T>, A> batch<std::complex<T>, A>::load(U const* mem, stream_mode) noexcept
+    { // NOTE(review): no detail::static_check_supported_config here, unlike batch<T, A>::load(U const*, stream_mode) — confirm intended
+        assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0)
+               && "loaded pointer is not properly aligned"); // assert: active only when NDEBUG is not defined
+        auto* ptr = reinterpret_cast<value_type const*>(mem); // reinterpret the U buffer as value_type elements
+        return kernel::load_complex_stream<A>(ptr, kernel::convert<value_type> {}, A {});
+    }
+
     template <class T, class A>
     template <class U>
     XSIMD_INLINE void batch<std::complex<T>, A>::store(U* mem, aligned_mode) const noexcept
@@ -1341,6 +1412,16 @@ namespace xsimd
         return store_unaligned(mem);
     }
 
+    template <class T, class A>
+    template <class U>
+    XSIMD_INLINE void batch<std::complex<T>, A>::store(U* mem, stream_mode) const noexcept
+    { // stream_mode store of a complex batch; mem must be a multiple of A::alignment()
+        assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0)
+               && "store location is not properly aligned"); // assert: active only when NDEBUG is not defined
+        auto* ptr = reinterpret_cast<value_type*>(mem); // reinterpret the U buffer as value_type elements
+        return kernel::store_complex_stream(ptr, *this, A {}); // NOTE(review): no explicit <A>, unlike load_complex_stream<A> — confirm intended
+    }
+
     template <class T, class A>
     XSIMD_INLINE auto batch<std::complex<T>, A>::real() const noexcept -> real_batch
     {
diff --git a/include/xsimd/types/xsimd_batch_constant.hpp b/include/xsimd/types/xsimd_batch_constant.hpp
index 4da2e3da6..edecee3c3 100644
--- a/include/xsimd/types/xsimd_batch_constant.hpp
+++ b/include/xsimd/types/xsimd_batch_constant.hpp
@@ -12,13 +12,14 @@
 #ifndef XSIMD_BATCH_CONSTANT_HPP
 #define XSIMD_BATCH_CONSTANT_HPP
 
+#include "../config/xsimd_config.hpp"
+#include "./xsimd_batch.hpp"
+#include "./xsimd_utils.hpp"
+
 #include <cstddef>
 #include <functional>
 #include <utility>
 
-#include "./xsimd_batch.hpp"
-#include "./xsimd_utils.hpp"
-
 namespace xsimd
 {
     /**
@@ -408,6 +409,15 @@ namespace xsimd
             return {};
         }
 
+#if XSIMD_CPP_VERSION >= 202002L // a std::array non-type template parameter requires C++20
+        template <std::array Arr, class A, std::size_t... Is>
+        XSIMD_INLINE constexpr batch_constant<typename decltype(Arr)::value_type, A, Arr[Is]...>
+        make_batch_constant(std::index_sequence<Is...>) noexcept
+        {
+            return {}; // the values travel in the return type (Arr[Is]...), not in the object
+        }
+#endif // XSIMD_CPP_VERSION >= 202002L
+
         template <typename T, class G, class A, std::size_t... Is>
         XSIMD_INLINE constexpr batch_bool_constant<T, A, G::get(Is, sizeof...(Is))...>
         make_batch_bool_constant(std::index_sequence<Is...>) noexcept
@@ -422,6 +432,15 @@ namespace xsimd
             return {};
         }
 
+#if XSIMD_CPP_VERSION >= 202002L // a std::array non-type template parameter requires C++20
+        template <typename T, std::array Arr, class A, std::size_t... Is>
+        XSIMD_INLINE constexpr batch_bool_constant<T, A, Arr[Is]...>
+        make_batch_bool_constant(std::index_sequence<Is...>) noexcept
+        {
+            return {}; // the values travel in the return type (Arr[Is]...), not in the object
+        }
+#endif // XSIMD_CPP_VERSION >= 202002L
+
     } // namespace detail
 
     /**
@@ -479,6 +498,21 @@ namespace xsimd
         return {};
     }
 
+#if XSIMD_CPP_VERSION >= 202002L
+    /**
+     * @brief Build a @c batch_constant from a std::array (C++20)
+     *
+     * @tparam Arr The std::array containing the values (non-type template parameter).
+     * @tparam A Architecture that will be used when converting to a regular batch.
+     */
+    template <std::array Arr, class A = default_arch>
+        requires(Arr.size() == batch<typename decltype(Arr)::value_type, A>::size) // array length must equal the batch size
+    XSIMD_INLINE constexpr auto make_batch_constant() noexcept
+    {
+        return detail::make_batch_constant<Arr, A>(std::make_index_sequence<Arr.size()>());
+    }
+#endif // XSIMD_CPP_VERSION >= 202002L
+
     /*
      * @brief Build a @c batch_bool_constant with a single repeated value.
      *
@@ -491,6 +525,23 @@ namespace xsimd
         return {};
     }
 
+#if XSIMD_CPP_VERSION >= 202002L
+    /**
+     * @brief Build a @c batch_bool_constant from a std::array of boolean (C++20)
+     * @tparam T Element type; the array length must match batch_bool<T, A>::size.
+     * @tparam Arr The std::array containing the boolean values (non-type template parameter).
+     * @tparam A Architecture that will be used when converting to a regular batch_bool.
+     */
+    template <typename T, std::array Arr, class A = default_arch>
+        requires(
+            (Arr.size() == batch_bool<T, A>::size)
+            && std::is_same_v<typename decltype(Arr)::value_type, bool>) // only arrays of bool are accepted
+    XSIMD_INLINE constexpr auto make_batch_bool_constant() noexcept
+    {
+        return detail::make_batch_bool_constant<T, Arr, A>(std::make_index_sequence<Arr.size()>());
+    }
+#endif // XSIMD_CPP_VERSION >= 202002L
+
 #endif
 
     namespace generator
diff --git a/include/xsimd/types/xsimd_batch_fwd.hpp b/include/xsimd/types/xsimd_batch_fwd.hpp
new file mode 100644
index 000000000..62e3cbba7
--- /dev/null
+++ b/include/xsimd/types/xsimd_batch_fwd.hpp
@@ -0,0 +1,41 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_BATCH_FWD_HPP
+#define XSIMD_BATCH_FWD_HPP
+
+#include "../config/xsimd_config.hpp"
+
+// TODO: this is somewhat redundant with XSIMD_DEFAULT_ARCH, but a default
+// architecture only exists when at least one architecture is supported.
+#if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
+#define XSIMD_BATCH_DEFAULT_ARCH_IMPL void
+#else
+#include "../config/xsimd_arch.hpp"
+#define XSIMD_BATCH_DEFAULT_ARCH_IMPL default_arch
+#endif // XSIMD_NO_SUPPORTED_ARCHITECTURE
+
+namespace xsimd
+{
+    template <class T, class A = XSIMD_BATCH_DEFAULT_ARCH_IMPL> // default for A is set here; redeclarations must not repeat it
+    class batch_bool;
+
+    template <typename T, class A, bool... Values> // no default architecture for the constant types
+    struct batch_bool_constant;
+
+    template <class T, class A = XSIMD_BATCH_DEFAULT_ARCH_IMPL> // default for A is set here; redeclarations must not repeat it
+    class batch;
+
+    template <typename T, class A, T... Values> // no default architecture for the constant types
+    struct batch_constant;
+}
+
+#endif
diff --git a/include/xsimd/types/xsimd_common_arch.hpp b/include/xsimd/types/xsimd_common_arch.hpp
index 28491aeda..8f92ad4dc 100644
--- a/include/xsimd/types/xsimd_common_arch.hpp
+++ b/include/xsimd/types/xsimd_common_arch.hpp
@@ -14,6 +14,8 @@
 
 #include "../config/xsimd_config.hpp"
 
+#include <cstddef>
+
 /**
  * @defgroup architectures Architecture description
  * */
diff --git a/include/xsimd/types/xsimd_emulated_register.hpp b/include/xsimd/types/xsimd_emulated_register.hpp
index 306ca0cf4..6bfc04e94 100644
--- a/include/xsimd/types/xsimd_emulated_register.hpp
+++ b/include/xsimd/types/xsimd_emulated_register.hpp
@@ -15,6 +15,14 @@
 #include "./xsimd_common_arch.hpp"
 #include "./xsimd_register.hpp"
 
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+#include <xtl/xcomplex.hpp>
+#endif
+
+#include <array>
+#include <complex>
+#include <cstddef>
+
 namespace xsimd
 {
     /**
@@ -70,7 +78,7 @@ namespace xsimd
         };
 #ifdef XSIMD_ENABLE_XTL_COMPLEX
         template <typename T, bool i3ec, size_t N>
-        struct has_simd_register<xtl::complex<T, T, i3ec>, emulated<N>> : std::true_type
+        struct has_simd_register<xtl::xcomplex<T, T, i3ec>, emulated<N>> : std::true_type
         {
         };
 #endif
diff --git a/include/xsimd/types/xsimd_neon64_register.hpp b/include/xsimd/types/xsimd_neon64_register.hpp
index 7fa0b2ce5..1df2d4abb 100644
--- a/include/xsimd/types/xsimd_neon64_register.hpp
+++ b/include/xsimd/types/xsimd_neon64_register.hpp
@@ -12,7 +12,7 @@
 #ifndef XSIMD_NEON64_REGISTER_HPP
 #define XSIMD_NEON64_REGISTER_HPP
 
-#include "xsimd_neon_register.hpp"
+#include "./xsimd_neon_register.hpp"
 
 namespace xsimd
 {
diff --git a/include/xsimd/types/xsimd_neon_register.hpp b/include/xsimd/types/xsimd_neon_register.hpp
index ef9973828..82e27a986 100644
--- a/include/xsimd/types/xsimd_neon_register.hpp
+++ b/include/xsimd/types/xsimd_neon_register.hpp
@@ -12,12 +12,18 @@
 #ifndef XSIMD_NEON_REGISTER_HPP
 #define XSIMD_NEON_REGISTER_HPP
 
-#include "xsimd_common_arch.hpp"
-#include "xsimd_register.hpp"
+#include "../config/xsimd_config.hpp"
+#include "../utils/xsimd_type_traits.hpp"
+#include "./xsimd_common_arch.hpp"
+#include "./xsimd_register.hpp"
 
 #if XSIMD_WITH_NEON
+#if defined(_MSC_VER) && !defined(__clang__) && XSIMD_WITH_NEON64
+#include <arm64_neon.h>
+#else
 #include <arm_neon.h>
 #endif
+#endif
 
 namespace xsimd
 {
@@ -103,40 +109,10 @@ namespace xsimd
 
         namespace detail
         {
-            template <size_t S>
-            struct get_unsigned_type;
-
-            template <>
-            struct get_unsigned_type<1>
-            {
-                using type = uint8_t;
-            };
-
-            template <>
-            struct get_unsigned_type<2>
-            {
-                using type = uint16_t;
-            };
-
-            template <>
-            struct get_unsigned_type<4>
-            {
-                using type = uint32_t;
-            };
-
-            template <>
-            struct get_unsigned_type<8>
-            {
-                using type = uint64_t;
-            };
-
-            template <size_t S>
-            using get_unsigned_type_t = typename get_unsigned_type<S>::type;
-
             template <class T, class A>
             struct neon_bool_simd_register
             {
-                using type = simd_register<get_unsigned_type_t<sizeof(T)>, A>;
+                using type = simd_register<xsimd::sized_uint_t<sizeof(T)>, A>;
             };
         }
 
diff --git a/include/xsimd/types/xsimd_register.hpp b/include/xsimd/types/xsimd_register.hpp
index 018418af6..e4f9b1ddc 100644
--- a/include/xsimd/types/xsimd_register.hpp
+++ b/include/xsimd/types/xsimd_register.hpp
@@ -12,6 +12,8 @@
 #ifndef XSIMD_REGISTER_HPP
 #define XSIMD_REGISTER_HPP
 
+#include "../config/xsimd_macros.hpp"
+
 #include <type_traits>
 
 namespace xsimd
diff --git a/include/xsimd/types/xsimd_rvv_register.hpp b/include/xsimd/types/xsimd_rvv_register.hpp
index 83f8bfcfd..85f786b91 100644
--- a/include/xsimd/types/xsimd_rvv_register.hpp
+++ b/include/xsimd/types/xsimd_rvv_register.hpp
@@ -13,8 +13,9 @@
 #ifndef XSIMD_RVV_REGISTER_HPP
 #define XSIMD_RVV_REGISTER_HPP
 
-#include "xsimd_common_arch.hpp"
-#include "xsimd_register.hpp"
+#include "../utils/xsimd_type_traits.hpp"
+#include "./xsimd_common_arch.hpp"
+#include "./xsimd_register.hpp"
 
 #if XSIMD_WITH_RVV
 #include <riscv_vector.h>
@@ -287,15 +288,6 @@ namespace xsimd
             {
             };
 
-            // It's difficult dealing with both char and whichever *int8_t type
-            // is compatible with char, so just avoid it altogether.
-            //
-            using rvv_char_t = std::conditional_t<std::is_signed<char>::value, int8_t, uint8_t>;
-            template <class T>
-            using rvv_fix_char_t = std::conditional_t<
-                std::is_same<char, std::decay_t<T>>::value,
-                rvv_char_t, T>;
-
             // An explicit constructor isn't really explicit enough to allow
             // implicit bit-casting operations between incompatible types, so
             // we add this vacuous flag argument when we're serious:
@@ -334,7 +326,7 @@ namespace xsimd
                 operator register_type() const noexcept { return value.get(); }
             };
             template <class T, size_t Width = XSIMD_RVV_BITS>
-            using rvv_reg_t = std::conditional_t<!std::is_void<T>::value, rvv_reg<rvv_fix_char_t<T>, Width>, void>;
+            using rvv_reg_t = std::conditional_t<!std::is_void<T>::value, rvv_reg<map_to_sized_type_t<T>, Width>, void>;
 
             // And some more of the same stuff for bool types, which have
             // similar problems and similar workarounds.
@@ -396,7 +388,7 @@ namespace xsimd
 
             template <class T, size_t Width = XSIMD_RVV_BITS>
             using rvv_bool_t = std::enable_if_t < !std::is_void<T>::value,
-                  rvv_bool<rvv_fix_char_t<T>, Width<rvv_width_m1 ? rvv_width_m1 : Width>>;
+                  rvv_bool<map_to_sized_type_t<T>, Width<rvv_width_m1 ? rvv_width_m1 : Width>>;
 
             template <size_t S>
             struct rvv_vector_type_impl;
diff --git a/include/xsimd/types/xsimd_sve_register.hpp b/include/xsimd/types/xsimd_sve_register.hpp
index 7ac748f8d..802588691 100644
--- a/include/xsimd/types/xsimd_sve_register.hpp
+++ b/include/xsimd/types/xsimd_sve_register.hpp
@@ -13,8 +13,8 @@
 #ifndef XSIMD_SVE_REGISTER_HPP
 #define XSIMD_SVE_REGISTER_HPP
 
-#include "xsimd_common_arch.hpp"
-#include "xsimd_register.hpp"
+#include "./xsimd_common_arch.hpp"
+#include "./xsimd_register.hpp"
 
 #if XSIMD_WITH_SVE
 #include <arm_sve.h>
@@ -67,55 +67,115 @@ namespace xsimd
             struct sve_vector_type_impl;
 
             template <>
-            struct sve_vector_type_impl<8>
+            struct sve_vector_type_impl<1>
             {
                 using signed_type = sve_int8_t;
                 using unsigned_type = sve_uint8_t;
                 using floating_point_type = void;
+                using sizeless_unsigned_type = svuint8_t;
+                using sizeless_signed_type = svint8_t;
+                using sizeless_floating_point_type = void;
             };
 
             template <>
-            struct sve_vector_type_impl<16>
+            struct sve_vector_type_impl<2>
             {
                 using signed_type = sve_int16_t;
                 using unsigned_type = sve_uint16_t;
                 using floating_point_type = void;
+                using sizeless_unsigned_type = svuint16_t;
+                using sizeless_signed_type = svint16_t;
+                using sizeless_floating_point_type = void;
             };
 
             template <>
-            struct sve_vector_type_impl<32>
+            struct sve_vector_type_impl<4>
             {
                 using signed_type = sve_int32_t;
                 using unsigned_type = sve_uint32_t;
                 using floating_point_type = sve_float32_t;
+                using sizeless_unsigned_type = svuint32_t;
+                using sizeless_signed_type = svint32_t;
+                using sizeless_floating_point_type = svfloat32_t;
             };
 
             template <>
-            struct sve_vector_type_impl<64>
+            struct sve_vector_type_impl<8>
             {
                 using signed_type = sve_int64_t;
                 using unsigned_type = sve_uint64_t;
                 using floating_point_type = sve_float64_t;
+                using sizeless_unsigned_type = svuint64_t;
+                using sizeless_signed_type = svint64_t;
+                using sizeless_floating_point_type = svfloat64_t;
             };
 
             template <class T>
-            using signed_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::signed_type;
+            using signed_int_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::signed_type;
+
+            template <class T>
+            using unsigned_int_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::unsigned_type;
 
             template <class T>
-            using unsigned_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::unsigned_type;
+            using floating_point_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::floating_point_type;
 
             template <class T>
-            using floating_point_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::floating_point_type;
+            using sizeless_signed_int_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::sizeless_signed_type;
 
             template <class T>
-            using signed_int_or_floating_point_sve_vector_type = std::conditional_t<std::is_floating_point<T>::value,
-                                                                                    floating_point_sve_vector_type<T>,
-                                                                                    signed_int_sve_vector_type<T>>;
+            using sizeless_unsigned_int_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::sizeless_unsigned_type;
 
             template <class T>
-            using sve_vector_type = std::conditional_t<std::is_signed<T>::value,
-                                                       signed_int_or_floating_point_sve_vector_type<T>,
-                                                       unsigned_int_sve_vector_type<T>>;
+            using sizeless_floating_point_sve_vector_type = typename sve_vector_type_impl<sizeof(T)>::sizeless_floating_point_type;
+
+            template <typename T, typename = void>
+            struct sve_vector_impl; // primary template is declared only; ::type comes from the enable_if specializations
+
+            template <typename T>
+            struct sve_vector_impl<T, std::enable_if_t<std::is_floating_point<T>::value>>
+            {
+                using type = floating_point_sve_vector_type<T>;
+            };
+
+            template <typename T> // std::is_signed is also true for floating point types, hence the explicit exclusion
+            struct sve_vector_impl<T, std::enable_if_t<!std::is_floating_point<T>::value && std::is_signed<T>::value>>
+            {
+                using type = signed_int_sve_vector_type<T>;
+            };
+
+            template <typename T>
+            struct sve_vector_impl<T, std::enable_if_t<!std::is_floating_point<T>::value && std::is_unsigned<T>::value>>
+            {
+                using type = unsigned_int_sve_vector_type<T>;
+            };
+
+            template <typename T, typename = void>
+            struct sizeless_sve_vector_impl; // same three-way dispatch as sve_vector_impl, yielding the sizeless_* types
+
+            template <typename T>
+            struct sizeless_sve_vector_impl<T, std::enable_if_t<std::is_floating_point<T>::value>>
+            {
+                using type = sizeless_floating_point_sve_vector_type<T>;
+            };
+
+            template <typename T> // std::is_signed is also true for floating point types, hence the explicit exclusion
+            struct sizeless_sve_vector_impl<T, std::enable_if_t<!std::is_floating_point<T>::value && std::is_signed<T>::value>>
+            {
+                using type = sizeless_signed_int_sve_vector_type<T>;
+            };
+
+            template <typename T>
+            struct sizeless_sve_vector_impl<T, std::enable_if_t<!std::is_floating_point<T>::value && std::is_unsigned<T>::value>>
+            {
+                using type = sizeless_unsigned_int_sve_vector_type<T>;
+            };
+
+            template <class T> // vector type chosen from sizeof(T) and the float/signed/unsigned category of T
+            using sve_vector_type = typename detail::sve_vector_impl<T>::type;
+
+            template <class T> // sizeless counterpart of sve_vector_type
+            using sizeless_sve_vector_type = typename detail::sizeless_sve_vector_impl<T>::type;
+
         } // namespace detail
 
         XSIMD_DECLARE_SIMD_REGISTER(signed char, sve, detail::sve_vector_type<signed char>);
diff --git a/include/xsimd/types/xsimd_traits.hpp b/include/xsimd/types/xsimd_traits.hpp
index ace06dcfc..0f7485ae4 100644
--- a/include/xsimd/types/xsimd_traits.hpp
+++ b/include/xsimd/types/xsimd_traits.hpp
@@ -12,9 +12,17 @@
 #ifndef XSIMD_TRAITS_HPP
 #define XSIMD_TRAITS_HPP
 
+#include <complex>
+#include <cstdint>
 #include <type_traits>
 
-#include "xsimd_batch.hpp"
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+#include <xtl/xcomplex.hpp>
+#endif
+
+#include "../config/xsimd_config.hpp"
+#include "./xsimd_batch_fwd.hpp"
+#include "./xsimd_utils.hpp"
 
 /**
  * high level type traits
@@ -53,7 +61,7 @@ namespace xsimd
             static constexpr size_t size = 1;
         };
 
-#if __cplusplus < 201703L
+#if XSIMD_CPP_VERSION < 201703L
         template <class T>
         constexpr size_t simd_traits_impl<T, false>::size;
 #endif
@@ -66,7 +74,7 @@ namespace xsimd
             static constexpr size_t size = type::size;
         };
 
-#if __cplusplus < 201703L
+#if XSIMD_CPP_VERSION < 201703L
         template <class T>
         constexpr size_t simd_traits_impl<T, true>::size;
 #endif
@@ -132,7 +140,7 @@ namespace xsimd
         static constexpr size_t size = simd_traits<type>::size;
     };
 
-#if __cplusplus < 201703L
+#if XSIMD_CPP_VERSION < 201703L
     template <class T>
     constexpr size_t revert_simd_traits<T>::size;
 #endif
@@ -144,7 +152,7 @@ namespace xsimd
         static constexpr size_t size = batch<T>::size;
     };
 
-#if __cplusplus < 201703L
+#if XSIMD_CPP_VERSION < 201703L
     template <class T>
     constexpr size_t revert_simd_traits<batch<T>>::size;
 #endif
@@ -250,7 +258,7 @@ namespace xsimd
         static constexpr bool is_complex = detail::is_complex<T>::value; ///< True if T is complex or a batch of complex values.
     };
 
-#if __cplusplus < 201703L
+#if XSIMD_CPP_VERSION < 201703L
     template <class T>
     constexpr bool batch_traits<T>::is_batch;
     template <class T>
@@ -273,7 +281,7 @@ namespace xsimd
         static constexpr bool is_complex = detail::is_complex<T>::value;
     };
 
-#if __cplusplus < 201703L
+#if XSIMD_CPP_VERSION < 201703L
     template <class T, class A>
     constexpr bool batch_traits<batch<T, A>>::is_batch;
     template <class T, class A>
@@ -296,7 +304,7 @@ namespace xsimd
         static constexpr bool is_complex = false;
     };
 
-#if __cplusplus < 201703L
+#if XSIMD_CPP_VERSION < 201703L
     template <class T, class A>
     constexpr bool batch_traits<batch_bool<T, A>>::is_batch;
     template <class T, class A>
@@ -396,38 +404,6 @@ namespace xsimd
 
     template <class T>
     using mask_type_t = typename mask_type<T>::type;
-
-    namespace detail
-    {
-        template <typename T>
-        struct widen
-        {
-            using type = std::make_signed_t<typename widen<std::make_unsigned_t<T>>::type>;
-        };
-
-        template <>
-        struct widen<uint32_t>
-        {
-            using type = uint64_t;
-        };
-        template <>
-        struct widen<uint16_t>
-        {
-            using type = uint32_t;
-        };
-        template <>
-        struct widen<uint8_t>
-        {
-            using type = uint16_t;
-        };
-        template <>
-        struct widen<float>
-        {
-            using type = double;
-        };
-    }
-    template <typename T>
-    using widen_t = typename detail::widen<T>::type;
 }
 
 #endif
diff --git a/include/xsimd/types/xsimd_utils.hpp b/include/xsimd/types/xsimd_utils.hpp
index aa6b90607..5dbab8551 100644
--- a/include/xsimd/types/xsimd_utils.hpp
+++ b/include/xsimd/types/xsimd_utils.hpp
@@ -12,26 +12,21 @@
 #ifndef XSIMD_UTILS_HPP
 #define XSIMD_UTILS_HPP
 
+#include <array>
 #include <complex>
 #include <cstdint>
 #include <cstring>
-#include <tuple>
 #include <type_traits>
 #include <utility>
 
 #ifdef XSIMD_ENABLE_XTL_COMPLEX
-#include "xtl/xcomplex.hpp"
+#include <xtl/xcomplex.hpp>
 #endif
 
+#include "./xsimd_batch_fwd.hpp"
+
 namespace xsimd
 {
-
-    template <class T, class A>
-    class batch;
-
-    template <class T, class A>
-    class batch_bool;
-
     /**************
      * index      *
      **************/
@@ -211,9 +206,25 @@ namespace xsimd
              * enabling / disabling metafunctions *
              **************************************/
 
+            template <class T>
+            using enable_arithmetic_t = std::enable_if_t<std::is_arithmetic<T>::value, int>;
+
+            /// Enabled for signed integral and floating point types (std::is_signed)
+            template <class T>
+            using enable_signed_numeral_t = std::enable_if_t<std::is_signed<T>::value, int>;
+
+            template <class T>
+            using enable_floating_point_t = std::enable_if_t<std::is_floating_point<T>::value, int>;
+
             template <class T>
             using enable_integral_t = std::enable_if_t<std::is_integral<T>::value, int>;
 
+            template <class T>
+            using enable_signed_integral_t = std::enable_if_t<std::is_integral<T>::value && std::is_signed<T>::value, int>;
+
+            template <class T>
+            using enable_unsigned_integral_t = std::enable_if_t<std::is_integral<T>::value && std::is_unsigned<T>::value, int>;
+
             template <class T, size_t S>
             using enable_sized_signed_t = std::enable_if_t<std::is_integral<T>::value && std::is_signed<T>::value && sizeof(T) == S, int>;
 
diff --git a/include/xsimd/types/xsimd_vsx_register.hpp b/include/xsimd/types/xsimd_vsx_register.hpp
index cfd450317..36b933902 100644
--- a/include/xsimd/types/xsimd_vsx_register.hpp
+++ b/include/xsimd/types/xsimd_vsx_register.hpp
@@ -39,7 +39,7 @@ namespace xsimd
     namespace types
     {
 
-#define XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(T, Tb)                  \
+#define XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(T, Tv, Tb)              \
     template <>                                                      \
     struct get_bool_simd_register<T, vsx>                            \
     {                                                                \
@@ -55,19 +55,26 @@ namespace xsimd
             operator register_type() const noexcept { return data; } \
         };                                                           \
     };                                                               \
-    XSIMD_DECLARE_SIMD_REGISTER(T, vsx, __vector T)
+    XSIMD_DECLARE_SIMD_REGISTER(T, vsx, __vector Tv)
 
-        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(signed char, char);
-        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(unsigned char, char);
-        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(char, char);
-        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(unsigned short, short);
-        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(short, short);
-        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(unsigned int, int);
-        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(int, int);
-        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(unsigned long, long);
-        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(long, long);
-        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(float, int);
-        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(double, long);
+        // The VSX vector intrinsics do not support long, unsigned long,
+        // and char data types.  Batches of these types are vectors of
+        // equivalent types.
+        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(signed char, signed char, char);
+        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(unsigned char, unsigned char, char);
+#ifdef __CHAR_UNSIGNED__
+        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(char, unsigned char, char);
+#else
+        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(char, signed char, char);
+#endif
+        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(unsigned short, unsigned short, short);
+        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(short, short, short);
+        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(unsigned int, unsigned int, int);
+        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(int, int, int);
+        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(unsigned long, unsigned long long, long long);
+        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(long, long long, long long);
+        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(float, float, int);
+        XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(double, double, long long);
 
 #undef XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER
     }
diff --git a/include/xsimd/types/xsimd_vxe_register.hpp b/include/xsimd/types/xsimd_vxe_register.hpp
new file mode 100644
index 000000000..ba051baa5
--- /dev/null
+++ b/include/xsimd/types/xsimd_vxe_register.hpp
@@ -0,0 +1,86 @@
+/***************************************************************************
+ * Copyright (c) Andreas Krebbel                                            *
+ * Based on xsimd_vsx_register.hpp                                          *
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_VXE_REGISTER_HPP
+#define XSIMD_VXE_REGISTER_HPP
+
+#include "./xsimd_common_arch.hpp"
+#include "./xsimd_register.hpp"
+
+#if XSIMD_WITH_VXE
+#include <vecintrin.h>
+#endif
+
+namespace xsimd
+{
+    /**
+     * @ingroup architectures
+     *
+     * VXE instructions (<vecintrin.h> is included when XSIMD_WITH_VXE is set)
+     */
+    struct vxe : common
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_VXE; } // compile-time switch
+        static constexpr bool available() noexcept { return true; } // unconditional: no runtime probe here
+        static constexpr bool requires_alignment() noexcept { return true; }
+        static constexpr std::size_t alignment() noexcept { return 16; }
+        static constexpr char const* name() noexcept { return "vxe"; }
+    };
+
+#if XSIMD_WITH_VXE
+    namespace types
+    {
+
+#define XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(T, Tv, Tb)              \
+    template <>                                                      \
+    struct get_bool_simd_register<T, vxe>                            \
+    {                                                                \
+        struct type                                                  \
+        {                                                            \
+            using register_type = __vector __bool Tb;                \
+            register_type data;                                      \
+            type() = default;                                        \
+            type(register_type r)                                    \
+                : data(r)                                            \
+            {                                                        \
+            }                                                        \
+            operator register_type() const noexcept { return data; } \
+        };                                                           \
+    };                                                               \
+    XSIMD_DECLARE_SIMD_REGISTER(T, vxe, __vector Tv)
+
+        // The VXE vector intrinsics do not support long, unsigned long,
+        // and char data types.  Batches of these types are vectors of
+        // equivalent types.
+        XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(signed char, signed char, char);
+        XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(unsigned char, unsigned char, char);
+#ifdef __CHAR_UNSIGNED__
+        XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(char, unsigned char, char);
+#else
+        XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(char, signed char, char);
+#endif
+        XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(unsigned short, unsigned short, short);
+        XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(short, short, short);
+        XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(unsigned int, unsigned int, int);
+        XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(int, int, int);
+        XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(unsigned long, unsigned long long, long long);
+        XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(long, long long, long long);
+        XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(float, float, int);
+        XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(double, double, long long);
+
+#undef XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER
+    }
+#endif
+}
+
+#endif
diff --git a/include/xsimd/types/xsimd_wasm_register.hpp b/include/xsimd/types/xsimd_wasm_register.hpp
index 70b6ff9ad..5091b7636 100644
--- a/include/xsimd/types/xsimd_wasm_register.hpp
+++ b/include/xsimd/types/xsimd_wasm_register.hpp
@@ -13,8 +13,8 @@
 #ifndef XSIMD_WASM_REGISTER_HPP
 #define XSIMD_WASM_REGISTER_HPP
 
-#include "xsimd_common_arch.hpp"
-#include "xsimd_register.hpp"
+#include "./xsimd_common_arch.hpp"
+#include "./xsimd_register.hpp"
 
 #if XSIMD_WITH_WASM
 #include <wasm_simd128.h>
diff --git a/include/xsimd/utils/bits.hpp b/include/xsimd/utils/bits.hpp
new file mode 100644
index 000000000..a21e4273b
--- /dev/null
+++ b/include/xsimd/utils/bits.hpp
@@ -0,0 +1,122 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ***************************************************************************/
+
+#ifndef XSIMD_UTILS_BITS_HPP
+#define XSIMD_UTILS_BITS_HPP
+
+#include <cassert>
+#include <type_traits>
+
+namespace xsimd
+{
+    namespace utils
+    {
+        template <typename I>
+        constexpr I make_bit_mask(I bit)
+        {
+            static_assert(std::is_unsigned<I>::value, "Bit operations must be done on unsigned integers");
+            assert(bit < static_cast<I>(8 * sizeof(I)));
+            return static_cast<I>(I { 1 } << bit);
+        }
+
+        template <typename I, typename... Args>
+        constexpr I make_bit_mask(I bit, Args... bits)
+        {
+            // TODO(C++17): Use fold expression
+            static_assert(std::is_unsigned<I>::value, "Bit operations must be done on unsigned integers");
+            return make_bit_mask<I>(bit) | make_bit_mask<I>(static_cast<I>(bits)...);
+        }
+
+        template <int... Bits, typename I>
+        constexpr bool all_bits_set(I value)
+        {
+            static_assert(std::is_unsigned<I>::value, "Bit operations must be done on unsigned integers");
+            constexpr I mask = make_bit_mask<I>(static_cast<I>(Bits)...);
+            return (value & mask) == mask;
+        }
+
+        template <int Bit, typename I>
+        constexpr I set_bit(I value)
+        {
+            static_assert(std::is_unsigned<I>::value, "Bit operations must be done on unsigned integers");
+            constexpr I mask = make_bit_mask<I>(static_cast<I>(Bit));
+            return value | mask;
+        }
+
+        /**
+         * Return a mask with the `width` lowest bits set (at most the bit size of `I`).
+         */
+        template <typename I>
+        constexpr I make_low_mask(I width) noexcept
+        {
+            static_assert(std::is_unsigned<I>::value, "Bit operations must be done on unsigned integers");
+            constexpr I bit_count = static_cast<I>(8 * sizeof(I));
+            assert(width <= bit_count);
+            // Shifting by the full bit count may be undefined, so that case yields all ones directly.
+            // The casts undo integral promotion when `I` is narrower than int.
+            const I all_ones = static_cast<I>(~I { 0 });
+            return (width == bit_count) ? all_ones : static_cast<I>((I { 1 } << width) - I { 1 });
+        }
+
+        /* A bitset stored in one unsigned integer `U`; bits are addressed by values of key type `K`. */
+        template <typename K, typename U>
+        struct uint_bitset
+        {
+            /* The underlying unsigned integer type storing the bits. */
+            using storage_type = U;
+            /* The key type (typically an enum); a key cast to storage_type is used as a bit position. */
+            using key_type = K;
+
+            /* Construct from a raw bit pattern; the default argument clears every bit. */
+            constexpr explicit uint_bitset(storage_type bitset = {}) noexcept
+                : m_bitset(bitset)
+            {
+            }
+
+            /* Return true if every bit named by the template arguments is set. */
+            template <key_type... bits>
+            constexpr bool all_bits_set() const noexcept
+            {
+                return utils::all_bits_set<static_cast<storage_type>(bits)...>(m_bitset);
+            }
+
+            /* Return true if the single bit named by `bit` is set. */
+            template <key_type bit>
+            constexpr bool bit_is_set() const noexcept
+            {
+                return all_bits_set<bit>();
+            }
+
+            /* Set the bit named by `bit`; all other bits keep their value. */
+            template <key_type bit>
+            constexpr void set_bit() noexcept
+            {
+                m_bitset = utils::set_bit<static_cast<storage_type>(bit)>(m_bitset);
+            }
+
+            /* Extract the bits in [start, end) (`end` excluded), shifted down so `start` lands on bit 0. */
+            template <key_type start, key_type end>
+            constexpr storage_type get_range() const noexcept
+            {
+                constexpr storage_type start_bit = static_cast<storage_type>(start);
+                constexpr storage_type end_bit = static_cast<storage_type>(end);
+                constexpr storage_type width = end_bit - start_bit;
+                constexpr storage_type mask = make_low_mask<storage_type>(width);
+                return (m_bitset >> start_bit) & mask;
+            }
+
+        private:
+            storage_type m_bitset = { 0 }; // raw bit pattern
+        };
+    }
+}
+
+#endif
diff --git a/include/xsimd/utils/xsimd_type_traits.hpp b/include/xsimd/utils/xsimd_type_traits.hpp
new file mode 100644
index 000000000..f6fea5d55
--- /dev/null
+++ b/include/xsimd/utils/xsimd_type_traits.hpp
@@ -0,0 +1,145 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_TYPE_TRAITS_HPP
+#define XSIMD_TYPE_TRAITS_HPP
+
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+namespace xsimd
+{
+    namespace detail
+    {
+        template <std::size_t S>
+        struct sized_num_types;
+
+        template <>
+        struct sized_num_types<1>
+        {
+            using signed_type = std::int8_t;
+            using unsigned_type = std::uint8_t;
+            using floating_point_type = void;
+        };
+
+        template <>
+        struct sized_num_types<2>
+        {
+            using signed_type = std::int16_t;
+            using unsigned_type = std::uint16_t;
+            using floating_point_type = void;
+        };
+
+        template <>
+        struct sized_num_types<4>
+        {
+            using signed_type = std::int32_t;
+            using unsigned_type = std::uint32_t;
+            using floating_point_type = float;
+        };
+
+        template <>
+        struct sized_num_types<8>
+        {
+            using signed_type = std::int64_t;
+            using unsigned_type = std::uint64_t;
+            using floating_point_type = double;
+        };
+    }
+
+    /**
+     * @ingroup type_traits
+     *
+     * Signed integer type with exactly @c S bytes (1, 2, 4, or 8).
+     *
+     * @tparam S size in bytes.
+     */
+    template <std::size_t S>
+    using sized_int_t = typename detail::sized_num_types<S>::signed_type;
+
+    /**
+     * @ingroup type_traits
+     *
+     * Unsigned integer type with exactly @c S bytes (1, 2, 4, or 8).
+     *
+     * @tparam S size in bytes.
+     */
+    template <std::size_t S>
+    using sized_uint_t = typename detail::sized_num_types<S>::unsigned_type;
+
+    /**
+     * @ingroup type_traits
+     *
+     * Floating-point type with exactly @c S bytes (4 for @c float, 8 for @c double).
+     * Yields @c void for sizes without a standard floating-point type (1, 2).
+     *
+     * @tparam S size in bytes.
+     */
+    template <std::size_t S>
+    using sized_fp_t = typename detail::sized_num_types<S>::floating_point_type;
+
+    namespace detail
+    {
+        template <typename T, std::size_t factor, typename = void>
+        struct remap_num
+        {
+            using type = T;
+        };
+
+        template <typename T, std::size_t factor>
+        struct remap_num<T, factor, std::enable_if_t<std::is_floating_point<T>::value>>
+        {
+            using type = xsimd::sized_fp_t<sizeof(T) * factor>;
+        };
+
+        template <typename T, std::size_t factor>
+        struct remap_num<T, factor, std::enable_if_t<!std::is_floating_point<T>::value && std::is_signed<T>::value>>
+        {
+            using type = xsimd::sized_int_t<sizeof(T) * factor>;
+        };
+
+        template <typename T, std::size_t factor>
+        struct remap_num<T, factor, std::enable_if_t<!std::is_floating_point<T>::value && std::is_unsigned<T>::value>>
+        {
+            using type = xsimd::sized_uint_t<sizeof(T) * factor>;
+        };
+    }
+
+    /**
+     * @ingroup type_traits
+     *
+     * Remap numeral types to their fixed-size variant (``[u]int{8,16,32,64}_t``
+     * and pass through other types).
+     * Certain platforms have different types (*i.e.* not aliases) between
+     * ``char`` and ``int8_t``, or ``long long`` and ``int{32,64}_t``, with SIMD
+     * intrinsics only defined for some of them.
+     * Handling them requires casting to a known predictable type.
+     *
+     * @tparam T arithmetic type to project from.
+     */
+    template <typename T>
+    using map_to_sized_type_t = typename detail::remap_num<T, /* factor= */ 1>::type;
+
+    /**
+     * @ingroup type_traits
+     *
+     * The next-wider arithmetic type for @c T: doubles the size while preserving
+     * signedness for integers and yielding @c double for @c float.
+     * Supported input types: @c [u]int{8,16,32}_t and @c float.
+     *
+     * @tparam T arithmetic type to widen.
+     */
+    template <typename T>
+    using widen_t = typename detail::remap_num<T, /* factor= */ 2>::type;
+}
+
+#endif
diff --git a/include/xsimd/xsimd.hpp b/include/xsimd/xsimd.hpp
index e50dc3bd1..705dce71b 100644
--- a/include/xsimd/xsimd.hpp
+++ b/include/xsimd/xsimd.hpp
@@ -12,73 +12,31 @@
 #ifndef XSIMD_HPP
 #define XSIMD_HPP
 
-#if defined(__FAST_MATH__)
-#define XSIMD_NO_DENORMALS
-#define XSIMD_NO_INFINITIES
-#define XSIMD_NO_NANS
-#endif
-
-#if defined(__has_cpp_attribute)
-// if this check passes, then the compiler supports feature test macros
-#if __has_cpp_attribute(nodiscard) >= 201603L
-// if this check passes, then the compiler supports [[nodiscard]] without a message
-#define XSIMD_NO_DISCARD [[nodiscard]]
-#endif
-#endif
-
-#if !defined(XSIMD_NO_DISCARD) && __cplusplus >= 201703L
-// this means that the previous tests failed, but we are using C++17 or higher
-#define XSIMD_NO_DISCARD [[nodiscard]]
-#endif
-
-#if !defined(XSIMD_NO_DISCARD) && (defined(__GNUC__) || defined(__clang__))
-// this means that the previous checks failed, but we are using GCC or Clang
-#define XSIMD_NO_DISCARD __attribute__((warn_unused_result))
-#endif
-
-#if !defined(XSIMD_NO_DISCARD)
-// this means that all the previous checks failed, so we fallback to doing nothing
-#define XSIMD_NO_DISCARD
-#endif
-
-#ifdef __cpp_if_constexpr
-// this means that the compiler supports the `if constexpr` construct
-#define XSIMD_IF_CONSTEXPR if constexpr
-#endif
-
-#if !defined(XSIMD_IF_CONSTEXPR) && __cplusplus >= 201703L
-// this means that the previous test failed, but we are using C++17 or higher
-#define XSIMD_IF_CONSTEXPR if constexpr
-#endif
-
-#if !defined(XSIMD_IF_CONSTEXPR)
-// this means that all the previous checks failed, so we fallback to a normal `if`
-#define XSIMD_IF_CONSTEXPR if
-#endif
-
-#include "config/xsimd_config.hpp"
-#include "config/xsimd_inline.hpp"
-
-#include "arch/xsimd_scalar.hpp"
-#include "memory/xsimd_aligned_allocator.hpp"
+#include "./arch/xsimd_scalar.hpp"
+#include "./config/xsimd_config.hpp"
+#include "./config/xsimd_macros.hpp"
+#include "./memory/xsimd_aligned_allocator.hpp"
+#include "./types/xsimd_batch_fwd.hpp"
 
 #if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
-// no type definition or anything apart from scalar definition and aligned allocator
 namespace xsimd
 {
-    template <class T, class A = void>
+    // no type definition or anything apart from scalar definition and aligned allocator
+    template <class T, class A>
     class batch
     {
         static constexpr bool supported_architecture = sizeof(A*) == 0; // type-dependant but always false
         static_assert(supported_architecture, "No SIMD architecture detected, cannot instantiate a batch");
     };
 }
+
 #else
-#include "types/xsimd_batch.hpp"
-#include "types/xsimd_batch_constant.hpp"
-#include "types/xsimd_traits.hpp"
+#include "./types/xsimd_batch.hpp"
+#include "./types/xsimd_batch_constant.hpp"
+#include "./types/xsimd_traits.hpp"
 
 // This include must come last
-#include "types/xsimd_api.hpp"
-#endif
+#include "./types/xsimd_api.hpp"
+#endif // XSIMD_NO_SUPPORTED_ARCHITECTURE
+
 #endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index ee32f3f5e..662dcdc3f 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -9,7 +9,7 @@
 # The full license is in the file LICENSE, distributed with this software. #
 ############################################################################
 
-cmake_minimum_required(VERSION 3.10)
+cmake_minimum_required(VERSION 3.13)
 
 if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
     project(xsimd-test)
@@ -17,7 +17,6 @@ if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
     enable_testing()
 
     find_package(xsimd REQUIRED CONFIG)
-    set(XSIMD_INCLUDE_DIR ${xsimd_INCLUDE_DIRS})
 endif ()
 
 if(NOT CMAKE_BUILD_TYPE)
@@ -44,6 +43,13 @@ OPTION(CROSS_COMPILE_ARM "cross compile for ARM targets" OFF)
 # Note: to compile on ARM (or cross compile), you may need to add the following:
 # -DTARGET_ARCH="armv8-a -mfpu=neon -mfloat-abi=softfp -target arm-linux-gnueabi"
 set(TARGET_ARCH "native" CACHE STRING "Target architecture arguments")
+string(REGEX MATCH "emulated\\<[0-9]+\\>" TARGET_EMULATED "${TARGET_ARCH}")
+# TARGET_EMULATED is non-empty only when TARGET_ARCH contains "emulated<N>" (N = digits).
+if (TARGET_EMULATED)
+    message(STATUS "Using emulated target: ${TARGET_EMULATED}")
+    set(EMULATED_COMPILE_FLAGS -DXSIMD_DEFAULT_ARCH=${TARGET_ARCH};-DXSIMD_WITH_EMULATED=1)
+    unset(TARGET_ARCH CACHE) # NOTE(review): presumably so TARGET_ARCH is not reused as -march; confirm
+endif()
 
 if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel")
     if (NOT WIN32 AND NOT ANDROID)
@@ -109,8 +115,10 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU"
         # Nothing specific
     elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc")
         # Nothing specific
+    elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector")
     elseif(NOT WIN32 AND NOT EMSCRIPTEN)
-        if(NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES)
+        if(TARGET_ARCH AND NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES)
             set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${TARGET_ARCH}")
         endif()
     endif()
@@ -138,6 +146,7 @@ set(XSIMD_TESTS
     test_batch_complex.cpp
     test_batch_float.cpp
     test_batch_int.cpp
+    test_bit.cpp
     test_bitwise_cast.cpp
     test_batch_constant.cpp
     test_batch_manip.cpp
@@ -146,6 +155,7 @@ set(XSIMD_TESTS
     test_complex_power.cpp
     test_complex_trigonometric.cpp
     test_conversion.cpp
+    test_cpu_features.cpp
     test_custom_default_arch.cpp
     test_error_gamma.cpp
     test_explicit_batch_instantiation.cpp
@@ -163,6 +173,7 @@ set(XSIMD_TESTS
     test_sum.cpp
     test_traits.cpp
     test_trigonometric.cpp
+    test_utils_bits.cpp
     test_xsimd_api.cpp
     test_utils.hpp
 )
@@ -171,8 +182,8 @@ if(NOT MSVC)
     list(APPEND XSIMD_TESTS test_gnu_source.cpp)
 endif()
 
-add_executable(test_xsimd ${XSIMD_TESTS} ${XSIMD_HEADERS})
-target_include_directories(test_xsimd PRIVATE ${XSIMD_INCLUDE_DIR})
+add_executable(test_xsimd ${XSIMD_TESTS})
+target_link_libraries(test_xsimd PRIVATE xsimd)
 
 option(DOWNLOAD_DOCTEST OFF)
 find_package(doctest QUIET)
@@ -223,6 +234,11 @@ endif()
 add_subdirectory(doc)
 add_subdirectory(architectures)
 
+if(EMULATED_COMPILE_FLAGS) # only set when TARGET_ARCH matched the emulated<N> pattern
+    message(STATUS "Emulated compile flags: ${EMULATED_COMPILE_FLAGS}")
+    target_compile_options(test_xsimd PRIVATE ${EMULATED_COMPILE_FLAGS})
+endif()
+
 if(EMSCRIPTEN)
     set_target_properties(test_xsimd PROPERTIES LINK_FLAGS "-s MODULARIZE=1 -s EXPORT_NAME=test_xsimd_wasm -s WASM=1 -s ALLOW_MEMORY_GROWTH=1 -lembind")
     target_compile_options(test_xsimd
diff --git a/test/architectures/CMakeLists.txt b/test/architectures/CMakeLists.txt
index 597803904..a374973b3 100644
--- a/test/architectures/CMakeLists.txt
+++ b/test/architectures/CMakeLists.txt
@@ -3,6 +3,10 @@ set(INTEL_PROCESSORS
     icelake-server cascadelake cooperlake tigerlake sapphirerapids alderlake
     rocketlake graniterapids graniterapids-d znver4)
 
+if(NOT TARGET xsimd)
+    find_package(xsimd REQUIRED CONFIG)
+endif()
+
 foreach(INTEL_PROCESSOR ${INTEL_PROCESSORS})
     # Adding the werror here to choke if the -march is incompatible with the
     # native one.
@@ -11,7 +15,7 @@ foreach(INTEL_PROCESSOR ${INTEL_PROCESSORS})
         message(STATUS ${INTEL_PROCESSOR})
         add_library(test_${INTEL_PROCESSOR} OBJECT dummy.cpp)
         target_compile_options(test_${INTEL_PROCESSOR} PRIVATE -march=${INTEL_PROCESSOR})
-        target_include_directories(test_${INTEL_PROCESSOR} PRIVATE ${XSIMD_INCLUDE_DIR})
+        target_link_libraries(test_${INTEL_PROCESSOR} PRIVATE xsimd)
         add_dependencies(xtest test_${INTEL_PROCESSOR})
         if(ENABLE_XTL_COMPLEX)
             target_compile_features(test_${INTEL_PROCESSOR} PRIVATE cxx_std_14)
diff --git a/test/check_inline_specifier.sh b/test/check_inline_specifier.sh
index 1ccdda130..2337b3707 100755
--- a/test/check_inline_specifier.sh
+++ b/test/check_inline_specifier.sh
@@ -3,7 +3,7 @@
 # Usage: $0 top_srcdir
 #
 # This script walks all headers in $top_srcdir/include and makes sure that all
-# functions declared tehre are marked as inline or constexpr (which implies
+# functions declared there are marked as inline or constexpr (which implies
 # inline). This makes sure the xsimd headers does not define symbol with global
 # linkage, and somehow convey our itnent to have all functions in xsimd being
 # inlined by the compiler.
diff --git a/test/doc/CMakeLists.txt b/test/doc/CMakeLists.txt
index be3385df8..fe8efd974 100644
--- a/test/doc/CMakeLists.txt
+++ b/test/doc/CMakeLists.txt
@@ -3,6 +3,10 @@
 
 if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64" AND NOT CMAKE_OSX_ARCHITECTURES)
 
+if(NOT TARGET xsimd)
+    find_package(xsimd REQUIRED CONFIG)
+endif()
+
 if(ENABLE_XTL_COMPLEX)
   add_compile_definitions(XSIMD_ENABLE_XTL_COMPLEX=1)
   include_directories(${xtl_INCLUDE_DIRS})
@@ -16,19 +20,19 @@ add_library(test_doc_any_arch OBJECT
             manipulating_abstract_batches.cpp
             manipulating_parametric_batches.cpp
             writing_vectorized_code.cpp)
-target_include_directories(test_doc_any_arch PRIVATE ${XSIMD_INCLUDE_DIR})
+target_link_libraries(test_doc_any_arch PRIVATE xsimd)
 target_compile_options(test_doc_any_arch PRIVATE -mavx)
 
 add_library(test_doc_avx2 OBJECT
             explicit_use_of_an_instruction_set.cpp
             sum_avx2.cpp)
+target_link_libraries(test_doc_avx2 PRIVATE xsimd)
 target_compile_options(test_doc_avx2 PRIVATE -mavx2)
-target_include_directories(test_doc_avx2 PRIVATE ${XSIMD_INCLUDE_DIR})
 
 add_library(test_doc_sse2 OBJECT
             sum_sse2.cpp)
+target_link_libraries(test_doc_sse2 PRIVATE xsimd)
 target_compile_options(test_doc_sse2 PRIVATE -msse2)
-target_include_directories(test_doc_sse2 PRIVATE ${XSIMD_INCLUDE_DIR})
 
 add_dependencies(xtest test_doc_any_arch test_doc_avx2 test_doc_sse2)
 
diff --git a/test/doc/explicit_use_of_an_instruction_set.cpp b/test/doc/explicit_use_of_an_instruction_set.cpp
index ab3d0e7e7..4d83cf743 100644
--- a/test/doc/explicit_use_of_an_instruction_set.cpp
+++ b/test/doc/explicit_use_of_an_instruction_set.cpp
@@ -1,4 +1,5 @@
 #include "xsimd/xsimd.hpp"
+
 #include <iostream>
 
 namespace xs = xsimd;
diff --git a/test/doc/explicit_use_of_an_instruction_set_mean.cpp b/test/doc/explicit_use_of_an_instruction_set_mean.cpp
index 6e68bac71..4c812b6f7 100644
--- a/test/doc/explicit_use_of_an_instruction_set_mean.cpp
+++ b/test/doc/explicit_use_of_an_instruction_set_mean.cpp
@@ -1,4 +1,5 @@
 #include "xsimd/xsimd.hpp"
+
 #include <cstddef>
 #include <vector>
 
diff --git a/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp b/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp
index 12a07bb2b..d81b055e2 100644
--- a/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp
+++ b/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp
@@ -1,4 +1,5 @@
 #include "xsimd/xsimd.hpp"
+
 #include <cstddef>
 #include <vector>
 
diff --git a/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp b/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp
index 6a1104625..76eab6d7e 100644
--- a/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp
+++ b/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp
@@ -1,6 +1,6 @@
 #include "xsimd/xsimd.hpp"
+
 #include <cstddef>
-#include <vector>
 
 struct mean
 {
diff --git a/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp b/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp
index 7e6f6cb30..af40b8394 100644
--- a/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp
+++ b/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp
@@ -1,6 +1,6 @@
 #include "xsimd/xsimd.hpp"
+
 #include <cstddef>
-#include <vector>
 
 template <class C, class Tag>
 void mean(const C& a, const C& b, C& res, Tag)
diff --git a/test/main.cpp b/test/main.cpp
index ef6681811..93b1ae371 100644
--- a/test/main.cpp
+++ b/test/main.cpp
@@ -10,11 +10,11 @@
  ****************************************************************************/
 #ifndef EMSCRIPTEN
 #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
-#include "doctest/doctest.h"
+#include <doctest/doctest.h>
 #else
 
 #define DOCTEST_CONFIG_IMPLEMENT
-#include "doctest/doctest.h"
+#include <doctest/doctest.h>
 #include <emscripten/bind.h>
 
 int run_tests()
@@ -28,4 +28,4 @@ EMSCRIPTEN_BINDINGS(my_module)
     emscripten::function("run_tests", &run_tests);
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/test/test_api.cpp b/test/test_api.cpp
index 73f74619a..20c887a36 100644
--- a/test/test_api.cpp
+++ b/test/test_api.cpp
@@ -12,12 +12,12 @@
 #include "xsimd/xsimd.hpp"
 #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE
 
+#include "test_utils.hpp"
+
 #include <functional>
 #include <numeric>
 #include <random>
 
-#include "test_utils.hpp"
-
 template <class B>
 struct xsimd_api_test
 {
@@ -138,7 +138,7 @@ struct xsimd_api_test
         batch_type b = batch_type::load(v.data(), xsimd::aligned_mode());
         V res(size);
 
-        bool* b_data = new bool[size];
+        alignas(arch_type::alignment()) bool b_data[size];
 
         xsimd::store_as(res.data(), b, xsimd::unaligned_mode());
         INFO(name, " unaligned");
@@ -159,8 +159,6 @@ struct xsimd_api_test
         xsimd::store_as(b_data, bb, xsimd::aligned_mode());
         INFO(name, " batch_bool aligned");
         CHECK_UNARY(std::accumulate(b_data, b_data + size, true, std::logical_and<bool>()));
-
-        delete[] b_data;
     }
 
     template <class T>
diff --git a/test/test_arch.cpp b/test/test_arch.cpp
index 621679ff4..d2d0df249 100644
--- a/test/test_arch.cpp
+++ b/test/test_arch.cpp
@@ -12,12 +12,12 @@
 #include "xsimd/xsimd.hpp"
 #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE
 
-#include <numeric>
-#include <type_traits>
-
 #include "test_sum.hpp"
 #include "test_utils.hpp"
 
+#include <numeric>
+#include <type_traits>
+
 #ifndef XSIMD_DEFAULT_ARCH
 static_assert(xsimd::default_arch::supported(), "default arch must be supported");
 static_assert(std::is_same<xsimd::default_arch, xsimd::best_arch>::value, "default arch is the best available");
diff --git a/test/test_batch.cpp b/test/test_batch.cpp
index 5cf47f3d7..176bdc16f 100644
--- a/test/test_batch.cpp
+++ b/test/test_batch.cpp
@@ -12,15 +12,58 @@
 #include "xsimd/xsimd.hpp"
 #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE
 
+#include "test_utils.hpp"
+
 #include <cmath>
 #include <functional>
 #include <numeric>
 #include <sstream>
 
-#include "test_utils.hpp"
-
 using namespace std::placeholders;
 
+namespace detail_test_mulhilo
+{
+    template <class T>
+    std::enable_if_t<std::is_integral<T>::value && (sizeof(T) <= 4), T>
+    mulhi_reference(T x, T y) noexcept
+    {
+        using W = std::conditional_t<std::is_signed<T>::value, int64_t, uint64_t>;
+        return static_cast<T>((static_cast<W>(x) * static_cast<W>(y)) >> (8 * sizeof(T)));
+    }
+
+#if defined(__SIZEOF_INT128__)
+    template <class T>
+    std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 8), T>
+    mulhi_reference(T x, T y) noexcept
+    {
+        using W = std::conditional_t<std::is_signed<T>::value, __int128, unsigned __int128>;
+        return static_cast<T>((static_cast<W>(x) * static_cast<W>(y)) >> 64);
+    }
+#else
+    template <class T>
+    std::enable_if_t<std::is_integral<T>::value && (sizeof(T) == 8), T>
+    mulhi_reference(T x, T y) noexcept
+    {
+        uint64_t ux = static_cast<uint64_t>(x);
+        uint64_t uy = static_cast<uint64_t>(y);
+        uint64_t xl = ux & 0xffffffffULL, xh = ux >> 32;
+        uint64_t yl = uy & 0xffffffffULL, yh = uy >> 32;
+        uint64_t ll = xl * yl, lh = xl * yh, hl = xh * yl, hh = xh * yh;
+        uint64_t mid = (ll >> 32) + (lh & 0xffffffffULL) + (hl & 0xffffffffULL);
+        uint64_t hi = hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
+        if (std::is_signed<T>::value)
+        {
+            if (x < 0)
+                hi -= uy;
+            if (y < 0)
+                hi -= ux;
+        }
+        return static_cast<T>(hi);
+    }
+#endif
+}
+using detail_test_mulhilo::mulhi_reference;
+
 template <class B>
 struct batch_test
 {
@@ -158,6 +201,21 @@ struct batch_test
         CHECK_EQ(res.first(), lhs[0]);
     }
 
+    template <size_t... Is>
+    void test_get_impl(batch_type const& res, std::index_sequence<Is...>) const
+    {
+        array_type extracted = { xsimd::get<Is>(res)... };
+        CHECK_EQ(extracted, lhs);
+        CHECK_BATCH_EQ(batch_type::load_unaligned(extracted.data()), res);
+    }
+
+    void test_get() const
+    {
+        batch_type res = batch_lhs();
+        CHECK_EQ(xsimd::get<0>(res), res.first());
+        test_get_impl(res, std::make_index_sequence<size> {});
+    }
+
     void test_arithmetic() const
     {
         // +batch
@@ -298,6 +356,148 @@ struct batch_test
         }
     }
 
+    template <class U = value_type>
+    void test_mulhilo_impl(std::true_type /*integral*/) const
+    {
+        using UT = std::make_unsigned_t<value_type>;
+
+        auto run_case = [](array_type const& a, array_type const& b, const char* tag)
+        {
+            batch_type ba = batch_type::load_unaligned(a.data());
+            batch_type bb = batch_type::load_unaligned(b.data());
+
+            array_type lo_expected;
+            array_type hi_expected;
+            for (std::size_t i = 0; i < size; ++i)
+            {
+                // Cast to unsigned before multiplying: signed overflow is UB,
+                // but unsigned wraparound is well-defined and the low N bits
+                // of the product are bit-identical for both interpretations.
+                lo_expected[i] = static_cast<value_type>(static_cast<UT>(a[i]) * static_cast<UT>(b[i]));
+                hi_expected[i] = mulhi_reference(a[i], b[i]);
+            }
+
+            batch_type lo_res = xsimd::mul_lo(ba, bb);
+            INFO("mul_lo(batch, batch) [" << tag << "]");
+            CHECK_BATCH_EQ(lo_res, lo_expected);
+
+            batch_type hi_res = xsimd::mul_hi(ba, bb);
+            INFO("mul_hi(batch, batch) [" << tag << "]");
+            CHECK_BATCH_EQ(hi_res, hi_expected);
+
+            auto p = xsimd::mul_hilo(ba, bb);
+            INFO("mul_hilo.first == mul_hi [" << tag << "]");
+            CHECK_BATCH_EQ(p.first, hi_res);
+            INFO("mul_hilo.second == mul_lo [" << tag << "]");
+            CHECK_BATCH_EQ(p.second, lo_res);
+        };
+
+        // baseline: small operands from init_operands
+        run_case(lhs, rhs, "small");
+
+        // edge operands that actually exercise the high half
+        constexpr value_type vmin = std::numeric_limits<value_type>::min();
+        constexpr value_type vmax = std::numeric_limits<value_type>::max();
+        constexpr bool is_signed = std::is_signed<value_type>::value;
+
+        // Pattern A: extremes paired with extremes (covers vmax*vmax, vmin*vmin,
+        // vmin*vmax, vmin*-1 — the classic signed-overflow corners).
+        {
+            array_type a, b;
+            for (std::size_t i = 0; i < size; ++i)
+            {
+                switch (i % 8)
+                {
+                case 0:
+                    a[i] = vmax;
+                    b[i] = vmax;
+                    break;
+                case 1:
+                    a[i] = vmin;
+                    b[i] = vmin;
+                    break;
+                case 2:
+                    a[i] = vmin;
+                    b[i] = vmax;
+                    break;
+                case 3:
+                    a[i] = vmax;
+                    b[i] = static_cast<value_type>(is_signed ? -1 : vmax);
+                    break;
+                case 4:
+                    a[i] = static_cast<value_type>(is_signed ? -1 : vmax);
+                    b[i] = static_cast<value_type>(is_signed ? -1 : vmax);
+                    break;
+                case 5:
+                    a[i] = vmin;
+                    b[i] = static_cast<value_type>(is_signed ? -1 : 1);
+                    break;
+                case 6:
+                    a[i] = static_cast<value_type>(vmax / 2 + 1);
+                    b[i] = static_cast<value_type>(vmax / 2 + 1);
+                    break;
+                case 7:
+                    a[i] = static_cast<value_type>(1);
+                    b[i] = vmax;
+                    break;
+                }
+            }
+            run_case(a, b, "extremes");
+        }
+
+        // Pattern B: high-half-non-zero, mixed signs (each lane unique so we
+        // catch lane-wise bugs in 32/64-bit emulated mul_hi paths).
+        {
+            array_type a, b;
+            constexpr std::size_t bits = 8 * sizeof(value_type);
+            constexpr std::size_t half = bits / 2;
+            const UT half_mask = (static_cast<UT>(1) << half) - 1;
+            for (std::size_t i = 0; i < size; ++i)
+            {
+                // Spread bits across both halves so the product overflows the
+                // low half. Use deterministic but lane-varying patterns.
+                UT ua = static_cast<UT>((static_cast<UT>(0xA53C97E1ULL) ^ (static_cast<UT>(i) * 0x9E37ULL))
+                                        | (static_cast<UT>(i + 1) << half));
+                UT ub = static_cast<UT>((static_cast<UT>(0x6BD1F4A7ULL) ^ (static_cast<UT>(i) * 0xC2B5ULL))
+                                        | (static_cast<UT>((i * 3) + 1) << half));
+                // Make sure the low half is non-zero so the product spans bits.
+                if ((ua & half_mask) == 0)
+                    ua |= static_cast<UT>(1);
+                if ((ub & half_mask) == 0)
+                    ub |= static_cast<UT>(1);
+                a[i] = static_cast<value_type>(ua);
+                b[i] = static_cast<value_type>(ub);
+            }
+            run_case(a, b, "wide-bits");
+        }
+
+        // Pattern C: signed correction terms — only meaningful for signed
+        // types but harmless for unsigned (we still check correctness).
+        {
+            array_type a, b;
+            for (std::size_t i = 0; i < size; ++i)
+            {
+                // Alternate positive * negative and positive * positive for
+                // signed; for unsigned this just samples large magnitudes.
+                value_type x = static_cast<value_type>(vmax - static_cast<value_type>(i));
+                value_type y = static_cast<value_type>(is_signed
+                                                           ? (i % 2 == 0 ? -static_cast<value_type>(i + 1)
+                                                                         : static_cast<value_type>(i + 1))
+                                                           : static_cast<value_type>(vmax - (i * 7)));
+                a[i] = x;
+                b[i] = y;
+            }
+            run_case(a, b, "signed-correction");
+        }
+    }
+
+    void test_mulhilo_impl(std::false_type /*not integral*/) const { }
+
+    void test_mulhilo() const
+    {
+        test_mulhilo_impl(typename std::is_integral<value_type>::type {});
+    }
+
     void test_saturated_arithmetic() const
     {
         // batch + batch
@@ -986,6 +1186,11 @@ TEST_CASE_TEMPLATE("[batch]", B, BATCH_TYPES)
         Test.test_first_element();
     }
 
+    SUBCASE("get")
+    {
+        Test.test_get();
+    }
+
     SUBCASE("arithmetic")
     {
         Test.test_arithmetic();
@@ -996,6 +1201,11 @@ TEST_CASE_TEMPLATE("[batch]", B, BATCH_TYPES)
         Test.test_incr_decr();
     }
 
+    SUBCASE("mul_hilo")
+    {
+        Test.test_mulhilo();
+    }
+
     SUBCASE("saturated_arithmetic")
     {
         Test.test_saturated_arithmetic();
diff --git a/test/test_batch_bool.cpp b/test/test_batch_bool.cpp
index db2fad599..9b658b2d1 100644
--- a/test/test_batch_bool.cpp
+++ b/test/test_batch_bool.cpp
@@ -12,13 +12,13 @@
 #include "xsimd/xsimd.hpp"
 #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE
 
+#include "test_utils.hpp"
+
 #include <array>
 #include <functional>
 #include <type_traits>
 #include <vector>
 
-#include "test_utils.hpp"
-
 namespace xsimd
 {
 
@@ -99,17 +99,6 @@ namespace xsimd
         };
     }
 
-    int popcount(int v)
-    {
-        // from https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetKernighan
-        int c; // c accumulates the total bits set in v
-        for (c = 0; v; c++)
-        {
-            v &= v - 1; // clear the least significant bit set
-        }
-        return c;
-    }
-
     template <class T, std::size_t N>
     struct get_bool_base
     {
@@ -758,6 +747,123 @@ struct batch_bool_test
         CHECK_EQ(count(bool_g.half), batch_bool_type::size / 2);
     }
 
+    void test_count_lr() const
+    {
+        auto bool_g = xsimd::get_bool<batch_bool_type> {};
+
+        {
+            INFO("countl_zero");
+            CHECK_EQ(countl_zero(bool_g.all_false), batch_bool_type::size);
+            CHECK_EQ(countl_zero(bool_g.all_true), 0);
+            CHECK_EQ(countl_zero(bool_g.half), 0);
+            CHECK_EQ(countl_zero(bool_g.ihalf), batch_bool_type::size / 2);
+        }
+
+        {
+            INFO("countl_one");
+            CHECK_EQ(countl_one(bool_g.all_false), 0);
+            CHECK_EQ(countl_one(bool_g.all_true), batch_bool_type::size);
+            CHECK_EQ(countl_one(bool_g.half), batch_bool_type::size / 2);
+            CHECK_EQ(countl_one(bool_g.ihalf), 0);
+        }
+
+        {
+            INFO("countr_zero");
+            CHECK_EQ(countr_zero(bool_g.all_false), batch_bool_type::size);
+            CHECK_EQ(countr_zero(bool_g.all_true), 0);
+            CHECK_EQ(countr_zero(bool_g.half), batch_bool_type::size / 2);
+            CHECK_EQ(countr_zero(bool_g.ihalf), 0);
+        }
+
+        {
+            INFO("countr_one");
+            CHECK_EQ(countr_one(bool_g.all_false), 0);
+            CHECK_EQ(countr_one(bool_g.all_true), batch_bool_type::size);
+            CHECK_EQ(countr_one(bool_g.half), 0);
+            CHECK_EQ(countr_one(bool_g.ihalf), batch_bool_type::size / 2);
+        }
+
+        {
+            size_t i = 0;
+            for (const auto& vec : bool_g.almost_all_false())
+            {
+                batch_bool_type b = batch_bool_type::load_unaligned(vec.data());
+
+                if (i == 0)
+                {
+                    CHECK_EQ(countr_zero(b), 0);
+                    CHECK_EQ(countr_one(b), 1);
+                }
+                else
+                {
+                    CHECK_EQ(countr_zero(b), i);
+                    CHECK_EQ(countr_one(b), 0);
+                }
+
+                if (i == batch_bool_type::size - 1)
+                {
+                    CHECK_EQ(countl_zero(b), 0);
+                    CHECK_EQ(countl_one(b), 1);
+                }
+                else
+                {
+                    CHECK_EQ(countl_zero(b), batch_bool_type::size - 1 - i);
+                    CHECK_EQ(countl_one(b), 0);
+                }
+
+                i++;
+            }
+        }
+
+        {
+            size_t i = 0;
+            for (const auto& vec : bool_g.almost_all_true())
+            {
+                batch_bool_type b = batch_bool_type::load_unaligned(vec.data());
+
+                if (i == 0)
+                {
+                    CHECK_EQ(countr_zero(b), 1);
+                    CHECK_EQ(countr_one(b), 0);
+                }
+                else
+                {
+                    CHECK_EQ(countr_zero(b), 0);
+                    CHECK_EQ(countr_one(b), i);
+                }
+
+                if (i == batch_bool_type::size - 1)
+                {
+                    CHECK_EQ(countl_zero(b), 1);
+                    CHECK_EQ(countl_one(b), 0);
+                }
+                else
+                {
+                    CHECK_EQ(countl_zero(b), 0);
+                    CHECK_EQ(countl_one(b), batch_bool_type::size - 1 - i);
+                }
+
+                i++;
+            }
+        }
+
+        {
+            INFO("interspersed pattern");
+            CHECK_EQ(countr_zero(bool_g.interspersed), 1);
+            CHECK_EQ(countr_one(bool_g.interspersed), 0);
+            if (batch_bool_type::size % 2 == 0)
+            {
+                CHECK_EQ(countl_zero(bool_g.interspersed), 0);
+                CHECK_EQ(countl_one(bool_g.interspersed), 1);
+            }
+            else
+            {
+                CHECK_EQ(countl_zero(bool_g.interspersed), 1);
+                CHECK_EQ(countl_one(bool_g.interspersed), 0);
+            }
+        }
+    }
+
     void test_comparison() const
     {
         auto bool_g = xsimd::get_bool<batch_bool_type> {};
@@ -810,8 +916,55 @@ TEST_CASE_TEMPLATE("[xsimd batch bool]", B, BATCH_TYPES)
 
     SUBCASE("count") { Test.test_count(); }
 
+    SUBCASE("count{l,r}_{zero,one}") { Test.test_count_lr(); }
+
     SUBCASE("eq neq") { Test.test_comparison(); }
 
     SUBCASE("mask utils (compile-time)") { Test.test_mask_compile_time(); }
 }
+
+TEST_CASE_TEMPLATE("batch_bool mask hygiene", B, BATCH_TYPES)
+{
+    using batch_type = B;
+    using arch_type = typename B::arch_type;
+    using value_type = typename B::value_type;
+
+    SUBCASE("any(a != a) is false")
+    {
+        batch_type a(value_type(1));
+        CHECK_FALSE(xsimd::any(a != a));
+    }
+
+    SUBCASE("all(~(a != a)) is true")
+    {
+        batch_type a(value_type(1));
+        CHECK_UNARY(xsimd::all(~(a != a)));
+    }
+
+    SUBCASE("any(~(a == a)) is false")
+    {
+        batch_type a(value_type(1));
+        CHECK_FALSE(xsimd::any(~(a == a)));
+    }
+
+    SUBCASE("eq(mask, _mask) is all-true")
+    {
+        auto m0 = (batch_type(value_type(1)) != batch_type(value_type(1)));
+        CHECK_UNARY(xsimd::all(m0 == m0));
+    }
+
+    SUBCASE("batch_bool stored to bool[] is canonical 0/1")
+    {
+        batch_type a(value_type(1)), b(value_type(2));
+        auto m = (a == b); // all false
+        alignas(arch_type::alignment()) bool buf[B::size + 1] = { true, true }; // sentinel
+        m.store_aligned(buf);
+        for (std::size_t i = 0; i < B::size; ++i)
+        {
+            // bit-level check: must be exactly 0, not just falsy
+            CHECK_EQ(*reinterpret_cast<uint8_t const*>(&buf[i]), uint8_t(0));
+        }
+    }
+}
+
 #endif
diff --git a/test/test_batch_complex.cpp b/test/test_batch_complex.cpp
index e06ad83fb..9db85cef6 100644
--- a/test/test_batch_complex.cpp
+++ b/test/test_batch_complex.cpp
@@ -12,12 +12,12 @@
 #include "xsimd/xsimd.hpp"
 #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE
 
+#include "test_utils.hpp"
+
 #include <cmath>
 #include <functional>
 #include <numeric>
 
-#include "test_utils.hpp"
-
 using namespace std::placeholders;
 
 template <class B>
@@ -182,6 +182,21 @@ struct batch_complex_test
         CHECK_EQ(res.first(), lhs[0]);
     }
 
+    template <size_t... Is>
+    void test_get_impl(batch_type const& res, std::index_sequence<Is...>) const
+    {
+        array_type extracted = { xsimd::get<Is>(res)... };
+        CHECK_EQ(extracted, lhs);
+        CHECK_BATCH_EQ(batch_type::load_unaligned(extracted.data()), res);
+    }
+
+    void test_get() const
+    {
+        batch_type res = batch_lhs();
+        CHECK_EQ(xsimd::get<0>(res), res.first());
+        test_get_impl(res, std::make_index_sequence<size> {});
+    }
+
     void test_arithmetic() const
     {
         // +batch
@@ -689,6 +704,8 @@ TEST_CASE_TEMPLATE("[xsimd complex batches]", B, BATCH_COMPLEX_TYPES)
 
     SUBCASE("first element") { Test.test_first_element(); }
 
+    SUBCASE("get") { Test.test_get(); }
+
     SUBCASE("arithmetic") { Test.test_arithmetic(); }
 
     SUBCASE("computed_assignment") { Test.test_computed_assignment(); }
diff --git a/test/test_batch_constant.cpp b/test/test_batch_constant.cpp
index 2b99fc05b..9877cd17d 100644
--- a/test/test_batch_constant.cpp
+++ b/test/test_batch_constant.cpp
@@ -10,6 +10,8 @@
  ****************************************************************************/
 
 #include "xsimd/xsimd.hpp"
+
+#include <numeric>
 #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE
 
 #include "test_utils.hpp"
@@ -43,6 +45,22 @@ struct constant_batch_test
         CHECK_BATCH_EQ((batch_type)b, expected);
     }
 
+    void test_init_from_array() const
+    {
+#if XSIMD_CPP_VERSION >= 202002L
+        constexpr array_type expected = []()
+        {
+            array_type out = {};
+            std::iota(out.begin(), out.end(), 0);
+            return out;
+        }();
+
+        constexpr auto b = xsimd::make_batch_constant<expected, arch_type>();
+        INFO("batch(value_type)");
+        CHECK_BATCH_EQ((batch_type)b, expected);
+#endif
+    }
+
     void test_init_from_generator() const
     {
         array_type expected;
@@ -217,6 +235,8 @@ TEST_CASE_TEMPLATE("[constant batch]", B, BATCH_INT_TYPES)
     constant_batch_test<B> Test;
     SUBCASE("init_from_constant") { Test.test_init_from_constant(); }
 
+    SUBCASE("test_init_from_array") { Test.test_init_from_array(); }
+
     SUBCASE("init_from_generator") { Test.test_init_from_generator(); }
 
     SUBCASE("as_batch") { Test.test_cast(); }
@@ -263,6 +283,25 @@ struct constant_bool_batch_test
         CHECK_BATCH_EQ((batch_bool_type)b, expected);
     }
 
+    void test_init_from_array() const
+    {
+#if XSIMD_CPP_VERSION >= 202002L
+        constexpr bool_array_type expected = []()
+        {
+            bool_array_type out = {};
+            for (std::size_t k = 0; k < out.size(); ++k)
+            {
+                out[k] = k % 2 == 0;
+            }
+            return out;
+        }();
+
+        constexpr auto b = xsimd::make_batch_bool_constant<value_type, expected, arch_type>();
+        INFO("batch_bool_constant(value_type)");
+        CHECK_BATCH_EQ((batch_bool_type)b, expected);
+#endif
+    }
+
     void test_init_from_generator() const
     {
         bool_array_type expected;
@@ -357,6 +396,8 @@ TEST_CASE_TEMPLATE("[constant bool batch]", B, BATCH_INT_TYPES)
     constant_bool_batch_test<B> Test;
     SUBCASE("init_from_constant") { Test.test_init_from_constant(); }
 
+    SUBCASE("test_init_from_array") { Test.test_init_from_array(); }
+
     SUBCASE("init_from_generator") { Test.test_init_from_generator(); }
 
     SUBCASE("as_batch") { Test.test_cast(); }
diff --git a/test/test_bit.cpp b/test/test_bit.cpp
new file mode 100644
index 000000000..de3b60c27
--- /dev/null
+++ b/test/test_bit.cpp
@@ -0,0 +1,229 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#include "xsimd/xsimd.hpp"
+#ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE
+
+#include "test_utils.hpp"
+
+template <class T>
+struct bit_test
+{
+    using value_type = T;
+    using bits = std::integral_constant<int, sizeof(T) * CHAR_BIT>;
+
+    void test_popcount()
+    {
+        // Zero
+        CHECK_EQ(xsimd::detail::popcount(T(0)), 0);
+
+        // All bits set
+        CHECK_EQ(xsimd::detail::popcount(T(~T(0))), bits::value);
+
+        // Single bit patterns - all should have popcount of 1
+        for (int i = 0; i < bits::value; ++i)
+        {
+            T value = T(T(1) << i);
+            INFO("popcount(1 << " << i << ")");
+            CHECK_EQ(xsimd::detail::popcount(value), 1);
+        }
+
+        // Powers of 2 minus 1 - known popcounts
+        for (int i = 1; i < bits::value; ++i)
+        {
+            T value = T((T(1) << i) - 1);
+            INFO("popcount((1 << " << i << ") - 1)");
+            CHECK_EQ(xsimd::detail::popcount(value), i);
+        }
+
+        // Alternating patterns
+        if (bits::value >= 8)
+        {
+            T pattern_aa = T(0);
+            T pattern_55 = T(0);
+            for (int i = 0; i < bits::value / 8; ++i)
+            {
+                pattern_aa |= T(0xAA) << (i * 8);
+                pattern_55 |= T(0x55) << (i * 8);
+            }
+            INFO("popcount(0xAA...)");
+            CHECK_EQ(xsimd::detail::popcount(pattern_aa), bits::value / 2);
+            INFO("popcount(0x55...)");
+            CHECK_EQ(xsimd::detail::popcount(pattern_55), bits::value / 2);
+        }
+
+        // Specific test cases
+        CHECK_EQ(xsimd::detail::popcount(T(1)), 1);
+        CHECK_EQ(xsimd::detail::popcount(T(3)), 2);
+        CHECK_EQ(xsimd::detail::popcount(T(7)), 3);
+        CHECK_EQ(xsimd::detail::popcount(T(15)), 4);
+    }
+
+    void test_countl_zero()
+    {
+        // Zero should have all leading zeros
+        CHECK_EQ(xsimd::detail::countl_zero(T(0)), bits::value);
+
+        // All bits set should have 0 leading zeros
+        CHECK_EQ(xsimd::detail::countl_zero(T(~T(0))), 0);
+
+        // MSB set should have 0 leading zeros
+        T msb = T(1) << (bits::value - 1);
+        CHECK_EQ(xsimd::detail::countl_zero(msb), 0);
+
+        // Powers of 2
+        for (int i = 0; i < bits::value; ++i)
+        {
+            T value = T(T(1) << i);
+            int expected = bits::value - i - 1;
+            INFO("countl_zero(1 << " << i << ")");
+            CHECK_EQ(xsimd::detail::countl_zero(value), expected);
+        }
+
+        // Sequential patterns (1, 3, 7, 15, ...)
+        for (int i = 1; i < bits::value; ++i)
+        {
+            T value = T((T(1) << i) - 1);
+            int expected = bits::value - i;
+            INFO("countl_zero((1 << " << i << ") - 1)");
+            CHECK_EQ(xsimd::detail::countl_zero(value), expected);
+        }
+
+        // Specific values
+        CHECK_EQ(xsimd::detail::countl_zero(T(1)), bits::value - 1);
+        CHECK_EQ(xsimd::detail::countl_zero(T(2)), bits::value - 2);
+        CHECK_EQ(xsimd::detail::countl_zero(T(4)), bits::value - 3);
+    }
+
+    void test_countl_one()
+    {
+        // Zero should have 0 leading ones
+        CHECK_EQ(xsimd::detail::countl_one(T(0)), 0);
+
+        // All bits set should have all leading ones
+        CHECK_EQ(xsimd::detail::countl_one(T(~T(0))), bits::value);
+
+        // MSB clear, rest set should have 0 leading ones
+        T pattern = T(~(T(1) << (bits::value - 1)));
+        CHECK_EQ(xsimd::detail::countl_one(pattern), 0);
+
+        // Inverted powers of 2
+        for (int i = 0; i < bits::value; ++i)
+        {
+            T value = T(~(T(1) << i));
+            int expected = (i == bits::value - 1) ? 0 : bits::value - i - 1;
+            INFO("countl_one(~(1 << " << i << "))");
+            CHECK_EQ(xsimd::detail::countl_one(value), expected);
+        }
+
+        // Patterns with known leading ones
+        for (int i = 1; i <= bits::value; ++i)
+        {
+            T value = T(T(~T(0)) << (bits::value - i));
+            INFO("countl_one(~0 << " << (bits::value - i) << ")");
+            CHECK_EQ(xsimd::detail::countl_one(value), i);
+        }
+
+        // Specific values
+        CHECK_EQ(xsimd::detail::countl_one(T(~T(1))), bits::value - 1);
+        CHECK_EQ(xsimd::detail::countl_one(T(~T(3))), bits::value - 2);
+    }
+
+    void test_countr_zero()
+    {
+        // Zero should have all trailing zeros
+        CHECK_EQ(xsimd::detail::countr_zero(T(0)), bits::value);
+
+        // All bits set should have 0 trailing zeros
+        CHECK_EQ(xsimd::detail::countr_zero(T(~T(0))), 0);
+
+        // Odd numbers should have 0 trailing zeros
+        CHECK_EQ(xsimd::detail::countr_zero(T(1)), 0);
+        CHECK_EQ(xsimd::detail::countr_zero(T(3)), 0);
+        CHECK_EQ(xsimd::detail::countr_zero(T(5)), 0);
+        CHECK_EQ(xsimd::detail::countr_zero(T(7)), 0);
+
+        // Powers of 2
+        for (int i = 0; i < bits::value; ++i)
+        {
+            T value = T(1) << i;
+            INFO("countr_zero(1 << " << i << ")");
+            CHECK_EQ(xsimd::detail::countr_zero(value), i);
+        }
+
+        // Even numbers with known factors
+        CHECK_EQ(xsimd::detail::countr_zero(T(2)), 1);
+        CHECK_EQ(xsimd::detail::countr_zero(T(4)), 2);
+        CHECK_EQ(xsimd::detail::countr_zero(T(6)), 1);
+        CHECK_EQ(xsimd::detail::countr_zero(T(8)), 3);
+        CHECK_EQ(xsimd::detail::countr_zero(T(12)), 2);
+        CHECK_EQ(xsimd::detail::countr_zero(T(16)), 4);
+
+        // Specific patterns
+        for (int i = 1; i < bits::value; ++i)
+        {
+            T value = T(~T(0)) << i;
+            INFO("countr_zero(~0 << " << i << ")");
+            CHECK_EQ(xsimd::detail::countr_zero(value), i);
+        }
+    }
+
+    void test_countr_one()
+    {
+        // Zero should have 0 trailing ones
+        CHECK_EQ(xsimd::detail::countr_one(T(0)), 0);
+
+        // All bits set should have all trailing ones
+        CHECK_EQ(xsimd::detail::countr_one(T(~T(0))), bits::value);
+
+        // Even numbers should have 0 trailing ones
+        CHECK_EQ(xsimd::detail::countr_one(T(2)), 0);
+        CHECK_EQ(xsimd::detail::countr_one(T(4)), 0);
+        CHECK_EQ(xsimd::detail::countr_one(T(6)), 0);
+
+        // Powers of 2 minus 1
+        for (int i = 1; i < bits::value; ++i)
+        {
+            T value = T((T(1) << i) - 1);
+            INFO("countr_one((1 << " << i << ") - 1)");
+            CHECK_EQ(xsimd::detail::countr_one(value), i);
+        }
+
+        // Specific values
+        CHECK_EQ(xsimd::detail::countr_one(T(1)), 1);
+        CHECK_EQ(xsimd::detail::countr_one(T(3)), 2);
+        CHECK_EQ(xsimd::detail::countr_one(T(7)), 3);
+        CHECK_EQ(xsimd::detail::countr_one(T(15)), 4);
+        CHECK_EQ(xsimd::detail::countr_one(T(31)), 5);
+
+        // Inverted powers of 2 minus 1
+        for (int i = 1; i < bits::value; ++i)
+        {
+            T value = T(~((T(1) << i) - 1));
+            INFO("countr_one(~((1 << " << i << ") - 1))");
+            CHECK_EQ(xsimd::detail::countr_one(value), 0);
+        }
+    }
+};
+
+TEST_CASE_TEMPLATE("[bit operations]", T,
+                   uint8_t, uint16_t, uint32_t, uint64_t)
+{
+    bit_test<T> Test;
+
+    SUBCASE("popcount") { Test.test_popcount(); }
+    SUBCASE("countl_zero") { Test.test_countl_zero(); }
+    SUBCASE("countl_one") { Test.test_countl_one(); }
+    SUBCASE("countr_zero") { Test.test_countr_zero(); }
+    SUBCASE("countr_one") { Test.test_countr_one(); }
+}
+
+#endif
diff --git a/test/test_complex_trigonometric.cpp b/test/test_complex_trigonometric.cpp
index 8a0163be5..16e577ce1 100644
--- a/test/test_complex_trigonometric.cpp
+++ b/test/test_complex_trigonometric.cpp
@@ -43,7 +43,11 @@ struct complex_trigonometric_test
                                   real_value_type(0.1) + i * real_value_type(56.) / nb_input);
             ainput[i] = value_type(real_value_type(-1.) + real_value_type(2.) * i / nb_input,
                                    real_value_type(-1.1) + real_value_type(2.1) * i / nb_input);
-            atan_input[i] = value_type(real_value_type(-10.) + i * real_value_type(20.) / nb_input,
+            // Avoid sampling the branch cut of complex atan at (Re=0, |Im|>=1):
+            // the sign of Re(catan(+/-0 + i*y)) for |y|>1 is implementation-defined
+            // (C99 7.3.4.1), and glibc and musl disagree there. Starting Re at -9.5
+            // makes Re=0 occur at i=19/40 of the range, where |Im|<1 (analytic).
+            atan_input[i] = value_type(real_value_type(-9.5) + i * real_value_type(20.) / nb_input,
                                        real_value_type(-9.) + i * real_value_type(21.) / nb_input);
         }
         expected.resize(nb_input);
diff --git a/test/test_cpu_features.cpp b/test/test_cpu_features.cpp
new file mode 100644
index 000000000..05958a204
--- /dev/null
+++ b/test/test_cpu_features.cpp
@@ -0,0 +1,187 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#include "xsimd/xsimd.hpp"
+
+#include <doctest/doctest.h>
+
+#include <algorithm>
+#include <array>
+#include <cstdlib>
+#include <string>
+
+#define CHECK_IMPLICATION(a, b) CHECK_UNARY(!(a) || (b))
+
+namespace detail
+{
+    // If `env_var` is set, check that `actual` agrees with it (a leading '1' means "expected present").
+    void check_env_flag(const char* env_var, const char* feature_name, bool actual)
+    {
+        const char* const val = std::getenv(env_var);
+        if (val == nullptr)
+            return;
+        // Doctest struggles with string literals and const char * (TODO(c++20): use std::format)
+        const std::string msg = std::string(env_var) + " = " + val + ", " + feature_name + " = " + (actual ? "true" : "false");
+        INFO(msg);
+        CHECK_EQ(actual, val[0] == '1');
+    }
+
+    // TODO(c++23): use str.contains
+    bool contains(const std::string& haystack, const char* needle)
+    {
+        return std::string::npos != haystack.find(needle);
+    }
+}
+
+#define CHECK_ENV_FEATURE(env_var, feature) detail::check_env_flag(env_var, #feature, feature)
+
+/**
+ * Tests that x86_cpu_features respects the architectural implication chains.
+ *
+ * These are "always true" assertions: if a higher feature is reported, all
+ * features it architecturally implies must also be reported. The test reads
+ * the current CPU's features at runtime and verifies every implication.
+ */
+TEST_CASE("[cpu_features] x86 implication chains")
+{
+    xsimd::cpu_features cpu;
+
+    // SSE implication chain
+    CHECK_IMPLICATION(cpu.sse4_2(), cpu.sse4_1());
+    CHECK_IMPLICATION(cpu.sse4_1(), cpu.ssse3());
+    CHECK_IMPLICATION(cpu.ssse3(), cpu.sse3());
+    CHECK_IMPLICATION(cpu.sse3(), cpu.sse2());
+
+    // AVX implication chain
+    CHECK_IMPLICATION(cpu.avx(), cpu.sse4_2());
+    CHECK_IMPLICATION(cpu.avx2(), cpu.avx());
+    CHECK_IMPLICATION(cpu.fma4(), cpu.avx());
+    CHECK_IMPLICATION(cpu.fma3(), cpu.avx());
+
+    // AVX-512 implication chain
+    CHECK_IMPLICATION(cpu.avx512f(), cpu.avx2());
+    CHECK_IMPLICATION(cpu.avx512vl(), cpu.avx512cd());
+    CHECK_IMPLICATION(cpu.avx512dq(), cpu.avx512f());
+    CHECK_IMPLICATION(cpu.avx512ifma(), cpu.avx512f());
+    CHECK_IMPLICATION(cpu.avx512pf(), cpu.avx512f());
+    CHECK_IMPLICATION(cpu.avx512er(), cpu.avx512f());
+    CHECK_IMPLICATION(cpu.avx512cd(), cpu.avx512f());
+    CHECK_IMPLICATION(cpu.avx512bw(), cpu.avx512f());
+    CHECK_IMPLICATION(cpu.avx512vbmi(), cpu.avx512f());
+    CHECK_IMPLICATION(cpu.avx512vbmi2(), cpu.avx512f());
+    CHECK_IMPLICATION(cpu.avx512vnni_bw(), cpu.avx512bw());
+    CHECK_IMPLICATION(cpu.avxvnni(), cpu.avx2());
+}
+
+TEST_CASE("[cpu_features] x86 manufacturer from environment")
+{
+    xsimd::x86_cpu_features cpu;
+
+    const char* val = std::getenv("XSIMD_TEST_CPU_ASSUME_MANUFACTURER");
+    if (val)
+    {
+        struct entry
+        {
+            const char* name;
+            xsimd::x86_manufacturer value;
+        };
+        std::array<entry, 9> manufacturers = { {
+            { "intel", xsimd::x86_manufacturer::intel },
+            { "amd", xsimd::x86_manufacturer::amd },
+            { "via", xsimd::x86_manufacturer::via },
+            { "zhaoxin", xsimd::x86_manufacturer::zhaoxin },
+            { "hygon", xsimd::x86_manufacturer::hygon },
+            { "transmeta", xsimd::x86_manufacturer::transmeta },
+            { "elbrus", xsimd::x86_manufacturer::elbrus },
+            { "microsoft_vpc", xsimd::x86_manufacturer::microsoft_vpc },
+            { "unknown", xsimd::x86_manufacturer::unknown },
+        } };
+
+        auto manufacturer = cpu.known_manufacturer();
+        const std::string allowed(val);
+        bool match = std::any_of(manufacturers.begin(), manufacturers.end(), [&](const entry& e)
+                                 { return e.value == manufacturer && detail::contains(allowed, e.name); });
+
+        auto const msg = std::string("XSIMD_TEST_CPU_ASSUME_MANUFACTURER = ") + val
+            + ", actual = " + xsimd::x86_manufacturer_name(manufacturer);
+        INFO(msg);
+        CHECK_UNARY(match);
+    }
+}
+
+TEST_CASE("[cpu_features] x86 features from environment")
+{
+    xsimd::cpu_features cpu;
+
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_SSE2", cpu.sse2());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_SSE3", cpu.sse3());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_SSSE3", cpu.ssse3());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_SSE4_1", cpu.sse4_1());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_SSE4_2", cpu.sse4_2());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_FMA3", cpu.fma3());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_FMA4", cpu.fma4());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_AVX", cpu.avx());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_AVX2", cpu.avx2());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_AVX512F", cpu.avx512f());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_AVX512BW", cpu.avx512bw());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_AVX512CD", cpu.avx512cd());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_AVX512VL", cpu.avx512vl());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_AVX512DQ", cpu.avx512dq());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_AVXVNNI", cpu.avxvnni());
+}
+
+TEST_CASE("[cpu_features] arm implication chains")
+{
+    xsimd::arm_cpu_features cpu;
+
+    CHECK_IMPLICATION(cpu.neon64(), cpu.neon());
+    CHECK_IMPLICATION(cpu.sve(), cpu.neon64());
+    CHECK_IMPLICATION(cpu.sve(), cpu.sve_size_bytes() >= (128 / 8));
+    CHECK_IMPLICATION(cpu.i8mm(), cpu.neon64());
+}
+
+TEST_CASE("[cpu_features] arm features from environment")
+{
+    xsimd::cpu_features cpu;
+
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_NEON", cpu.neon());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_NEON64", cpu.neon64());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_SVE", cpu.sve());
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_I8MM", cpu.i8mm());
+}
+
+TEST_CASE("[cpu_features] risc-v implication chains")
+{
+    xsimd::cpu_features cpu;
+
+    CHECK_IMPLICATION(cpu.rvv(), cpu.rvv_size_bytes() >= (128 / 8));
+}
+
+TEST_CASE("[cpu_features] risc-v features from environment")
+{
+    xsimd::cpu_features cpu;
+
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_RVV", cpu.rvv());
+}
+
+TEST_CASE("[cpu_features] ppc features from environment")
+{
+    xsimd::cpu_features cpu;
+
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_VSX", cpu.vsx());
+}
+
+TEST_CASE("[cpu_features] IBM Z (s390x) features from environment")
+{
+    xsimd::cpu_features cpu;
+
+    CHECK_ENV_FEATURE("XSIMD_TEST_CPU_ASSUME_VXE", cpu.vxe());
+}
diff --git a/test/test_custom_default_arch.cpp b/test/test_custom_default_arch.cpp
index 106e26762..3ae1f2aea 100644
--- a/test/test_custom_default_arch.cpp
+++ b/test/test_custom_default_arch.cpp
@@ -11,10 +11,11 @@
 #ifdef __STSE2__
 
 #define XSIMD_DEFAULT_ARCH xsimd::sse2
-#include "xsimd/xsimd.hpp"
 
 #include "test_utils.hpp"
 
+#include "xsimd/xsimd.hpp"
+
 // Could be different than sse2 if we compile for other architecture avx
 static_assert(std::is_same<xsimd::default_arch, xsimd::sse2>::value, "default arch correctly hooked");
 
diff --git a/test/test_gnu_source.cpp b/test/test_gnu_source.cpp
index 40303c921..e13d7671b 100644
--- a/test/test_gnu_source.cpp
+++ b/test/test_gnu_source.cpp
@@ -17,7 +17,7 @@
 #endif
 #include "xsimd/xsimd.hpp"
 
-#include "doctest/doctest.h"
+#include <doctest/doctest.h>
 
 TEST_CASE("[GNU_SOURCE support]")
 {
diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp
index 9fa7dbff8..c353976e5 100644
--- a/test/test_load_store.cpp
+++ b/test/test_load_store.cpp
@@ -12,20 +12,18 @@
 #include "xsimd/xsimd.hpp"
 #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE
 
+#include "test_utils.hpp"
+
 #include <algorithm>
-#include <functional>
 #include <random>
 #include <type_traits>
 
-#include "test_utils.hpp"
-
 template <class B>
 struct load_store_test
 {
     using batch_type = B;
     using value_type = typename B::value_type;
     using index_type = typename xsimd::as_integer_t<batch_type>;
-    using batch_bool_type = typename batch_type::batch_bool_type;
     template <class T>
     using allocator = xsimd::default_allocator<T, typename B::arch_type>;
     static constexpr size_t size = B::size;
@@ -303,6 +301,33 @@ struct load_store_test
     };
 #endif
 
+    template <class Ptr>
+    void stream_load_if_same(Ptr const* ptr, batch_type& b, array_type const& expected_values, const std::string& name,
+                             std::true_type) const
+    {
+        b = xsimd::load(ptr, xsimd::stream_mode());
+        INFO(name, " stream (load)");
+        CHECK_BATCH_EQ(b, expected_values);
+    }
+
+    template <class Ptr>
+    void stream_load_if_same(Ptr const*, batch_type&, array_type const&, const std::string&, std::false_type) const
+    {
+    }
+
+    template <class Vec>
+    void stream_store_if_same(Vec& res, batch_type const& b, Vec const& reference, const std::string& name, std::true_type) const
+    {
+        xsimd::store(res.data(), b, xsimd::stream_mode());
+        INFO(name, " stream (store)");
+        CHECK_VECTOR_EQ(res, reference);
+    }
+
+    template <class Vec>
+    void stream_store_if_same(Vec&, batch_type const&, Vec const&, const std::string&, std::false_type) const
+    {
+    }
+
     template <class V>
     void test_load_impl(const V& v, const std::string& name)
     {
@@ -316,6 +341,10 @@ struct load_store_test
         INFO(name, " aligned");
         CHECK_BATCH_EQ(b, expected);
 
+        b = batch_type::load(v.data(), xsimd::stream_mode());
+        INFO(name, " stream (batch::load)");
+        CHECK_BATCH_EQ(b, expected);
+
         b = xsimd::load_as<value_type>(v.data(), xsimd::unaligned_mode());
         INFO(name, " unaligned (load_as)");
         CHECK_BATCH_EQ(b, expected);
@@ -324,6 +353,13 @@ struct load_store_test
         INFO(name, " aligned (load_as)");
         CHECK_BATCH_EQ(b, expected);
 
+        b = xsimd::load_as<value_type>(v.data(), xsimd::stream_mode());
+        INFO(name, " stream (load_as)");
+        CHECK_BATCH_EQ(b, expected);
+
+        stream_load_if_same(v.data(), b, expected, name,
+                            std::integral_constant<bool, std::is_same<typename V::value_type, value_type>::value> {});
+
         run_mask_tests(v, name, b, expected, std::is_same<typename V::value_type, value_type> {});
     }
 
@@ -474,6 +510,17 @@ struct load_store_test
         INFO(name, " aligned (store_as)");
         CHECK_VECTOR_EQ(res, v);
 
+        b.store(res.data(), xsimd::stream_mode());
+        INFO(name, " stream (batch::store)");
+        CHECK_VECTOR_EQ(res, v);
+
+        xsimd::store_as(res.data(), b, xsimd::stream_mode());
+        INFO(name, " stream (store_as)");
+        CHECK_VECTOR_EQ(res, v);
+
+        stream_store_if_same(res, b, v, name,
+                             std::integral_constant<bool, std::is_same<typename V::value_type, value_type>::value> {});
+
         V expected_masked(size);
 
         run_store_mask_section(v, name, b, res, expected_masked, std::is_same<typename V::value_type, value_type> {});
@@ -556,4 +603,35 @@ TEST_CASE_TEMPLATE("[load store]", B, BATCH_TYPES)
 
     SUBCASE("masked") { Test.test_masked(); }
 }
+
+TEST_CASE_TEMPLATE("store_masked respects Mode", B, BATCH_TYPES)
+{
+    using T = typename B::value_type;
+    using A = typename B::arch_type;
+    constexpr std::size_t N = B::size;
+
+    // Unaligned-mode + unaligned pointer: must not fault.
+    alignas(A::alignment()) T big[2 * N + 1] = {};
+    T* unaligned_ptr = big + 1; // sizeof(T)-aligned only
+
+    struct AllTrue
+    {
+        static constexpr bool get(std::size_t, std::size_t) { return true; }
+    };
+    auto cst_mask = xsimd::make_batch_bool_constant<T, AllTrue, A>();
+
+    B v(T(7));
+    v.store(unaligned_ptr, cst_mask, xsimd::unaligned_mode {});
+    for (std::size_t i = 0; i < N; ++i)
+        CHECK_EQ(unaligned_ptr[i], T(7));
+
+    // Overload resolution: store_masked with same-typed mask must compile.
+    // If C3 regresses, this becomes a compile error.
+    auto signed_cst_mask = xsimd::make_batch_bool_constant<T, AllTrue, A>();
+    alignas(A::alignment()) T aligned_buf[N] = {};
+    v.store(aligned_buf, signed_cst_mask, xsimd::aligned_mode {});
+    for (std::size_t i = 0; i < N; ++i)
+        CHECK_EQ(aligned_buf[i], T(7));
+}
+
 #endif
diff --git a/test/test_memory.cpp b/test/test_memory.cpp
index f582d59bd..1baebb533 100644
--- a/test/test_memory.cpp
+++ b/test/test_memory.cpp
@@ -12,14 +12,14 @@
 #include "xsimd/xsimd.hpp"
 #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE
 
-#include <type_traits>
-#include <vector>
-
-#include "doctest/doctest.h"
-
 #include "xsimd/memory/xsimd_aligned_allocator.hpp"
 #include "xsimd/memory/xsimd_alignment.hpp"
 
+#include <doctest/doctest.h>
+
+#include <type_traits>
+#include <vector>
+
 struct mock_container
 {
 };
diff --git a/test/test_power.cpp b/test/test_power.cpp
index 6fa2ef396..cdf8c146a 100644
--- a/test/test_power.cpp
+++ b/test/test_power.cpp
@@ -13,7 +13,6 @@
 #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE
 
 #include "test_utils.hpp"
-#include <iostream>
 
 template <class B>
 struct power_test
diff --git a/test/test_rounding.cpp b/test/test_rounding.cpp
index d800edd67..681aaaef7 100644
--- a/test/test_rounding.cpp
+++ b/test/test_rounding.cpp
@@ -56,12 +56,12 @@ struct rounding_test
             std::transform(input.cbegin(), input.cend(), expected.begin(),
                            [](const value_type& v)
                            { return std::ceil(v); });
-            for (size_t i = 0; i < nb_batches; i += size)
+            for (size_t i = 0; i < nb_batches; i++)
             {
                 batch_type in, out, ref;
-                detail::load_batch(in, input, i);
+                detail::load_batch(in, input, i * size);
                 out = ceil(in);
-                detail::load_batch(ref, expected, i);
+                detail::load_batch(ref, expected, i * size);
                 INFO("ceil");
                 CHECK_BATCH_EQ(ref, out);
             }
@@ -71,12 +71,12 @@ struct rounding_test
             std::transform(input.cbegin(), input.cend(), expected.begin(),
                            [](const value_type& v)
                            { return std::floor(v); });
-            for (size_t i = 0; i < nb_batches; i += size)
+            for (size_t i = 0; i < nb_batches; i++)
             {
                 batch_type in, out, ref;
-                detail::load_batch(in, input, i);
+                detail::load_batch(in, input, i * size);
                 out = floor(in);
-                detail::load_batch(ref, expected, i);
+                detail::load_batch(ref, expected, i * size);
                 INFO("floor");
                 CHECK_BATCH_EQ(ref, out);
             }
@@ -86,12 +86,12 @@ struct rounding_test
             std::transform(input.cbegin(), input.cend(), expected.begin(),
                            [](const value_type& v)
                            { return std::trunc(v); });
-            for (size_t i = 0; i < nb_batches; i += size)
+            for (size_t i = 0; i < nb_batches; i++)
             {
                 batch_type in, out, ref;
-                detail::load_batch(in, input, i);
+                detail::load_batch(in, input, i * size);
                 out = trunc(in);
-                detail::load_batch(ref, expected, i);
+                detail::load_batch(ref, expected, i * size);
                 INFO("trunc");
                 CHECK_BATCH_EQ(ref, out);
             }
@@ -101,12 +101,12 @@ struct rounding_test
             std::transform(input.cbegin(), input.cend(), expected.begin(),
                            [](const value_type& v)
                            { return std::round(v); });
-            for (size_t i = 0; i < nb_batches; i += size)
+            for (size_t i = 0; i < nb_batches; i++)
             {
                 batch_type in, out, ref;
-                detail::load_batch(in, input, i);
+                detail::load_batch(in, input, i * size);
                 out = round(in);
-                detail::load_batch(ref, expected, i);
+                detail::load_batch(ref, expected, i * size);
                 INFO("round");
                 CHECK_BATCH_EQ(ref, out);
             }
@@ -116,12 +116,12 @@ struct rounding_test
             std::transform(input.cbegin(), input.cend(), expected.begin(),
                            [](const value_type& v)
                            { return std::nearbyint(v); });
-            for (size_t i = 0; i < nb_batches; i += size)
+            for (size_t i = 0; i < nb_batches; i++)
             {
                 batch_type in, out, ref;
-                detail::load_batch(in, input, i);
+                detail::load_batch(in, input, i * size);
                 out = nearbyint(in);
-                detail::load_batch(ref, expected, i);
+                detail::load_batch(ref, expected, i * size);
                 INFO("nearbyint");
                 CHECK_BATCH_EQ(ref, out);
             }
@@ -132,13 +132,13 @@ struct rounding_test
             std::transform(input.cbegin(), input.cend(), expected.begin(),
                            [](const value_type& v)
                            { return xsimd::nearbyint_as_int(v); });
-            for (size_t i = 0; i < nb_batches; i += size)
+            for (size_t i = 0; i < nb_batches; i++)
             {
                 batch_type in;
                 int_batch_type out, ref;
-                detail::load_batch(in, input, i);
+                detail::load_batch(in, input, i * size);
                 out = nearbyint_as_int(in);
-                detail::load_batch(ref, expected, i);
+                detail::load_batch(ref, expected, i * size);
                 INFO("nearbyint_as_int");
                 CHECK_BATCH_EQ(ref, out);
             }
@@ -148,12 +148,12 @@ struct rounding_test
             std::transform(input.cbegin(), input.cend(), expected.begin(),
                            [](const value_type& v)
                            { return std::rint(v); });
-            for (size_t i = 0; i < nb_batches; i += size)
+            for (size_t i = 0; i < nb_batches; i++)
             {
                 batch_type in, out, ref;
-                detail::load_batch(in, input, i);
+                detail::load_batch(in, input, i * size);
                 out = rint(in);
-                detail::load_batch(ref, expected, i);
+                detail::load_batch(ref, expected, i * size);
                 INFO("rint");
                 CHECK_BATCH_EQ(ref, out);
             }
diff --git a/test/test_shuffle.cpp b/test/test_shuffle.cpp
index 1719d329e..b02742dfd 100644
--- a/test/test_shuffle.cpp
+++ b/test/test_shuffle.cpp
@@ -10,12 +10,14 @@
  ****************************************************************************/
 
 #include "xsimd/xsimd.hpp"
+
+#include <vector>
 #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE
 
 #include "test_utils.hpp"
 
 #ifdef __linux__
-#include "endian.h"
+#include <endian.h>
 #if BYTE_ORDER == BIG_ENDIAN
 #define XSIMD_NO_SLIDE
 #endif
@@ -620,13 +622,14 @@ struct shuffle_test
     void transpose()
     {
         B b_lhs = B::load_unaligned(lhs.data());
-        std::array<B, size> b_matrix;
+        // Due to issue with Qemu AVX512, we rely on vector for this otherwise static array
+        std::vector<B, xsimd::aligned_allocator<B>> b_matrix = {};
         for (size_t i = 0; i < size; ++i)
-            b_matrix[i] = b_lhs;
-        std::array<value_type, size * size> ref_matrix;
+            b_matrix.emplace_back(b_lhs);
+        std::vector<value_type, xsimd::aligned_allocator<value_type>> ref_matrix = {};
         for (size_t i = 0; i < size; ++i)
             for (size_t j = 0; j < size; ++j)
-                ref_matrix[i * size + j] = lhs[i];
+                ref_matrix.emplace_back(lhs[i]);
 
         INFO("transpose");
         xsimd::transpose(b_matrix.data(), b_matrix.data() + b_matrix.size());
diff --git a/test/test_utils.hpp b/test/test_utils.hpp
index 5f25f3e77..caa8b8f8e 100644
--- a/test/test_utils.hpp
+++ b/test/test_utils.hpp
@@ -11,16 +11,18 @@
 
 #include "xsimd/xsimd.hpp"
 
+#include <doctest/doctest.h>
+
 #include <array>
-#include <climits>
 #include <cmath>
 #include <complex>
+#include <iomanip>
 #include <limits>
+#include <sstream>
+#include <string>
 #include <type_traits>
 #include <vector>
 
-#include "doctest/doctest.h"
-
 #ifndef XSIMD_TEST_UTILS_HPP
 #define XSIMD_TEST_UTILS_HPP
 
@@ -35,7 +37,7 @@
 // This also seems to happen in M1.
 struct precision_t
 {
-#if defined(__apple_build_version__) && (XSIMD_WITH_SSE4_1 || XSIMD_WITH_NEON64)
+#if defined(__APPLE__) && (XSIMD_WITH_SSE4_1 || XSIMD_WITH_NEON64)
     static constexpr size_t max = 8192;
 #else
     static constexpr size_t max = 2048;
@@ -386,28 +388,84 @@ namespace detail
         b.store_unaligned(dst.data() + i);
     }
 
+    // Non-template context scope to avoid per-instantiation vtable issues with MinGW GCC.
+    // INFO() creates a ContextScope<Lambda> with a unique vtable per template instantiation.
+    // This concrete class has a single vtable definition shared across all instantiations.
+    struct StringContextScope : doctest::detail::ContextScopeBase
+    {
+        std::string msg_;
+        explicit StringContextScope(std::string msg) : msg_(std::move(msg)) {}
+        StringContextScope(StringContextScope&&) = default; // needed to return by value once a destructor is declared
+        // Pop this scope from doctest's active-context stack, as doctest's own ContextScope does.
+        ~StringContextScope() override { if (need_to_destroy) destroy(); }
+        void stringify(std::ostream* os) const override { *os << msg_; }
+    };
+
+    template <typename T, typename std::enable_if<std::is_floating_point<T>::value, int>::type = 0>
+    std::string to_string_full_precision(T value)
+    {
+        // TODO(C++17): use std::to_chars
+        std::ostringstream ss;
+        ss << std::setprecision(std::numeric_limits<T>::max_digits10) << value;
+        return ss.str();
+    }
+
+    template <typename T, typename std::enable_if<!std::is_floating_point<T>::value, int>::type = 0>
+    std::string to_string_full_precision(T value)
+    {
+        return doctest::toString(value).c_str();
+    }
+
+    template <class T>
+    StringContextScope make_context_info(const char* name, const T& val)
+    {
+        return StringContextScope(std::string(name) + ":" + to_string_full_precision(val));
+    }
 }
 
-#define CHECK_BATCH_EQ(b1, b2)                            \
-    do                                                    \
-    {                                                     \
-        INFO(#b1 ":", b1);                                \
-        INFO(#b2 ":", b2);                                \
-        CHECK_UNARY(::detail::expect_batch_near(b1, b2)); \
+// We use make_context_info instead of INFO() to avoid MinGW GCC vtable issues
+// (see StringContextScope above). Unlike INFO(), make_context_info is eager:
+// it stringifies its operands at construction. To keep the happy path cheap
+// (these macros are called in tight loops and QEMU-emulated CI targets like
+// ppc64le blow past wall-clock otherwise), we first evaluate the predicate
+// and only build the context when it fails. On failure the operands are
+// expanded a second time to build that context, and CHECK_UNARY reports the
+// cached result rather than re-running the predicate; this requires the
+// operands to be side-effect-free, which holds for all call sites here.
+#define CHECK_BATCH_EQ(b1, b2)                                             \
+    do                                                                     \
+    {                                                                      \
+        const bool batches_are_near = ::detail::expect_batch_near(b1, b2); \
+        if (!batches_are_near)                                             \
+        {                                                                  \
+            auto _ctx1 = ::detail::make_context_info(#b1, b1);             \
+            auto _ctx2 = ::detail::make_context_info(#b2, b2);             \
+            CHECK_UNARY(batches_are_near);                                 \
+        }                                                                  \
     } while (0)
-#define CHECK_SCALAR_EQ(s1, s2)                            \
-    do                                                     \
-    {                                                      \
-        INFO(#s1 ":", s1);                                 \
-        INFO(#s2 ":", s2);                                 \
-        CHECK_UNARY(::detail::expect_scalar_near(s1, s2)); \
+
+#define CHECK_SCALAR_EQ(s1, s2)                                             \
+    do                                                                      \
+    {                                                                       \
+        const bool scalars_are_near = ::detail::expect_scalar_near(s1, s2); \
+        if (!scalars_are_near)                                              \
+        {                                                                   \
+            auto _ctx1 = ::detail::make_context_info(#s1, s1);              \
+            auto _ctx2 = ::detail::make_context_info(#s2, s2);              \
+            CHECK_UNARY(scalars_are_near);                                  \
+        }                                                                   \
     } while (0)
-#define CHECK_VECTOR_EQ(v1, v2)                            \
-    do                                                     \
-    {                                                      \
-        INFO(#v1 ":", v1);                                 \
-        INFO(#v2 ":", v2);                                 \
-        CHECK_UNARY(::detail::expect_vector_near(v1, v2)); \
+
+#define CHECK_VECTOR_EQ(v1, v2)                                             \
+    do                                                                      \
+    {                                                                       \
+        const bool vectors_are_near = ::detail::expect_vector_near(v1, v2); \
+        if (!vectors_are_near)                                              \
+        {                                                                   \
+            auto _ctx1 = ::detail::make_context_info(#v1, v1);              \
+            auto _ctx2 = ::detail::make_context_info(#v2, v2);              \
+            CHECK_UNARY(vectors_are_near);                                  \
+        }                                                                   \
     } while (0)
 
 /***********************
diff --git a/test/test_utils_bits.cpp b/test/test_utils_bits.cpp
new file mode 100644
index 000000000..c1ac5af1b
--- /dev/null
+++ b/test/test_utils_bits.cpp
@@ -0,0 +1,120 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ***************************************************************************/
+
+#include "xsimd/utils/bits.hpp"
+
+#include <doctest/doctest.h>
+
+#include <cstdint>
+
+TEST_CASE("[utils::make_bit_mask] single bit")
+{
+    CHECK_EQ(xsimd::utils::make_bit_mask<std::uint8_t>(0), 0x01);
+    CHECK_EQ(xsimd::utils::make_bit_mask<std::uint8_t>(7), 0x80);
+    CHECK_EQ(xsimd::utils::make_bit_mask<std::uint32_t>(0), 0x01u);
+    CHECK_EQ(xsimd::utils::make_bit_mask<std::uint32_t>(31), 0x80000000u);
+}
+
+TEST_CASE("[utils::make_bit_mask] multiple bits")
+{
+    CHECK_EQ(xsimd::utils::make_bit_mask(0u, 1u), 0b11);
+    CHECK_EQ(xsimd::utils::make_bit_mask<std::uint8_t>(0, 2, 4), 0b00010101);
+}
+
+TEST_CASE("[utils::all_bits_set] basic")
+{
+    CHECK(xsimd::utils::all_bits_set<0>(0x01u));
+    CHECK(xsimd::utils::all_bits_set<7>(0x80u));
+    CHECK_FALSE(xsimd::utils::all_bits_set<0>(0x00u));
+    CHECK_FALSE(xsimd::utils::all_bits_set<1>(0x01u));
+}
+
+TEST_CASE("[utils::all_bits_set] multiple bits")
+{
+    CHECK((xsimd::utils::all_bits_set<0, 1>(0x03u)));
+    CHECK((xsimd::utils::all_bits_set<0, 1>(0xFFu)));
+    CHECK_FALSE((xsimd::utils::all_bits_set<0, 1>(0x01u)));
+}
+
+TEST_CASE("[utils::set_bit]")
+{
+    CHECK_EQ(xsimd::utils::set_bit<0>(0u), 0x01u);
+    CHECK_EQ(xsimd::utils::set_bit<3>(0u), 0x08u);
+    // Idempotent: setting an already-set bit
+    CHECK_EQ(xsimd::utils::set_bit<0>(0x01u), 0x01u);
+    // Does not clear other bits
+    CHECK_EQ(xsimd::utils::set_bit<1>(0b1101u), 0b1111u);
+}
+
+TEST_CASE("[utils::make_low_mask]")
+{
+    CHECK_EQ(xsimd::utils::make_low_mask<std::uint8_t>(0), 0x00);
+    CHECK_EQ(xsimd::utils::make_low_mask<std::uint8_t>(1), 0x01);
+    CHECK_EQ(xsimd::utils::make_low_mask<std::uint8_t>(4), 0x0F);
+    CHECK_EQ(xsimd::utils::make_low_mask<std::uint8_t>(7), 0x7F);
+    // Full width
+    CHECK_EQ(xsimd::utils::make_low_mask<std::uint8_t>(8), 0xFF);
+    CHECK_EQ(xsimd::utils::make_low_mask<std::uint32_t>(32), 0xFFFFFFFFu);
+    CHECK_EQ(xsimd::utils::make_low_mask<std::uint64_t>(64), 0xFFFFFFFFFFFFFFFFu);
+}
+
+enum class flag : std::uint32_t
+{
+    A = 0,
+    B = 1,
+    C = 4,
+    D = 31,
+};
+
+TEST_CASE("[utils::uint_bitset] default construction")
+{
+    xsimd::utils::uint_bitset<flag, std::uint32_t> bs;
+    CHECK_FALSE(bs.bit_is_set<flag::A>());
+    CHECK_FALSE(bs.bit_is_set<flag::B>());
+}
+
+TEST_CASE("[utils::uint_bitset] construction from raw value")
+{
+    xsimd::utils::uint_bitset<flag, std::uint32_t> bs(0b11u);
+    CHECK(bs.bit_is_set<flag::A>());
+    CHECK(bs.bit_is_set<flag::B>());
+    CHECK_FALSE(bs.bit_is_set<flag::C>());
+}
+
+TEST_CASE("[utils::uint_bitset] set_bit")
+{
+    xsimd::utils::uint_bitset<flag, std::uint32_t> bs;
+    bs.set_bit<flag::A>();
+    CHECK(bs.bit_is_set<flag::A>());
+    CHECK_FALSE(bs.bit_is_set<flag::B>());
+    bs.set_bit<flag::D>();
+    CHECK(bs.bit_is_set<flag::D>());
+}
+
+TEST_CASE("[utils::uint_bitset] all_bits_set")
+{
+    xsimd::utils::uint_bitset<flag, std::uint32_t> bs(0b11u);
+    CHECK((bs.all_bits_set<flag::A, flag::B>()));
+    CHECK_FALSE((bs.all_bits_set<flag::A, flag::C>()));
+}
+
+TEST_CASE("[utils::uint_bitset] get_range")
+{
+    enum class rk : std::uint32_t
+    {
+        lo = 0,
+        mid = 4,
+        hi = 8
+    };
+    xsimd::utils::uint_bitset<rk, std::uint32_t> bs(0b10101011u);
+    CHECK_EQ((bs.get_range<rk::lo, rk::mid>()), 0b1011u);
+    CHECK_EQ((bs.get_range<rk::mid, rk::hi>()), 0b1010u);
+}
diff --git a/test/test_xsimd_api.cpp b/test/test_xsimd_api.cpp
index 8c58543ad..454c360c9 100644
--- a/test/test_xsimd_api.cpp
+++ b/test/test_xsimd_api.cpp
@@ -12,7 +12,7 @@
 #include "xsimd/types/xsimd_utils.hpp"
 #include "xsimd/xsimd.hpp"
 
-#include "doctest/doctest.h"
+#include <doctest/doctest.h>
 
 template <class T>
 struct scalar_type
@@ -351,7 +351,7 @@ struct xsimd_api_integral_types_functions
 {
     using value_type = typename scalar_type<T>::type;
 
-    void test_bitwise_lshift()
+    void test_bitwise_lshift_single()
     {
         constexpr int shift = 3;
         value_type val0(12);
@@ -364,6 +364,35 @@ struct xsimd_api_integral_types_functions
         CHECK_EQ(extract(cr), r);
     }
 
+    /* Per-lane shift counts: this overload is only enabled when T is not an integral scalar. */
+    template <typename U = T>
+    void test_bitwise_lshift_multiple(T const& vals, std::enable_if_t<!std::is_integral<U>::value, int> = 0)
+    {
+#ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE
+        // Shift counts are the iota constant reduced modulo `digits`, so each one stays below `digits`.
+        constexpr auto bit_count = static_cast<value_type>(std::numeric_limits<value_type>::digits);
+        constexpr auto bit_count_cst = xsimd::make_batch_constant<value_type, bit_count>();
+        constexpr auto lane_shifts = xsimd::make_iota_batch_constant<value_type>() % bit_count_cst;
+
+        // The same counts are passed once as a runtime batch and once as a batch_constant.
+        auto from_batch = xsimd::bitwise_lshift(vals, lane_shifts.as_batch());
+        auto from_constant = xsimd::bitwise_lshift(vals, lane_shifts);
+
+        for (std::size_t lane = 0; lane < lane_shifts.size; ++lane)
+        {
+            const auto reference = static_cast<value_type>(vals.get(lane) << lane_shifts.get(lane));
+            CHECK_EQ(from_batch.get(lane), reference);
+            CHECK_EQ(from_constant.get(lane), reference);
+        }
+#endif
+    }
+
+    /* Integral-T overload: per-lane shift counts have no meaning for a scalar, so this is an intentional no-op. */
+    template <typename U = T>
+    void test_bitwise_lshift_multiple(T const&, std::enable_if_t<std::is_integral<U>::value, int> = 0)
+    {
+    }
+
     void test_bitwise_rshift()
     {
         constexpr int shift = 3;
@@ -424,11 +453,20 @@ struct xsimd_api_integral_types_functions
 
 TEST_CASE_TEMPLATE("[xsimd api | integral types functions]", B, INTEGRAL_TYPES)
 {
-    xsimd_api_integral_types_functions<B> Test;
+    using test_type = xsimd_api_integral_types_functions<B>;
+
+    test_type Test;
+
+    SUBCASE("bitwise_lshift_single") // named like the sibling subcases, which carry no `test_` prefix
+    {
+        Test.test_bitwise_lshift_single();
+    }
 
-    SUBCASE("bitwise_lshift")
+    SUBCASE("bitwise_lshift_multiple")
     {
-        Test.test_bitwise_lshift();
+        Test.test_bitwise_lshift_multiple({ 1 });
+        Test.test_bitwise_lshift_multiple({ 3 });
+        Test.test_bitwise_lshift_multiple({ 127 });
     }
 
     SUBCASE("bitwise_rshift")
@@ -553,12 +591,16 @@ struct xsimd_api_float_types_functions
     void test_exp()
     {
         value_type val(2);
+#if defined(__FAST_MATH__) || XSIMD_REASSOCIATIVE_MATH
+        CHECK_EQ(extract(xsimd::exp(T(val))), doctest::Approx(std::exp(val))); // approximate comparison
+#else
         CHECK_EQ(extract(xsimd::exp(T(val))), std::exp(val));
+#endif // defined(__FAST_MATH__) || XSIMD_REASSOCIATIVE_MATH
     }
     void test_exp10()
     {
         value_type val(2);
-#ifdef EMSCRIPTEN
+#if defined(EMSCRIPTEN) || defined(__FAST_MATH__)
         CHECK_EQ(extract(xsimd::exp10(T(val))), doctest::Approx(std::pow(value_type(10), val)));
 #else
         CHECK_EQ(extract(xsimd::exp10(T(val))), std::pow(value_type(10), val));
diff --git a/xsimdConfig.cmake.in b/xsimdConfig.cmake.in
index 1c395450d..6f12f60fc 100644
--- a/xsimdConfig.cmake.in
+++ b/xsimdConfig.cmake.in
@@ -18,12 +18,37 @@
 @PACKAGE_INIT@
 
 if(NOT TARGET @PROJECT_NAME@)
-    set(@PROJECT_NAME@_ENABLE_XTL_COMPLEX @ENABLE_XTL_COMPLEX@)
-    if(@PROJECT_NAME@_ENABLE_XTL_COMPLEX)
-        include(CMakeFindDependencyMacro)
-        find_dependency(xtl REQUIRED)
+    # Load the exported targets, then publish the target's interface include
+    # directories as @PROJECT_NAME@_INCLUDE_DIRS.
+    include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake")
+    get_target_property(@PROJECT_NAME@_INCLUDE_DIRS
+        @PROJECT_NAME@ INTERFACE_INCLUDE_DIRECTORIES)
+
+    # Backward compatibility with xsimd <= 14:
+    # when the package was configured with ENABLE_XTL_COMPLEX, xtl support is
+    # switched on automatically for consumers of this config file.
+    # Unlike the legacy behaviour, consumers can override this by defining
+    # XSIMD_ENABLE_XTL_COMPLEX themselves before the package is found.
+    # The version guard disables this branch from xsimd 15 onwards, so it is
+    # not forgotten when it can be removed at the next major release.
+    if(
+        @ENABLE_XTL_COMPLEX@
+        AND NOT DEFINED XSIMD_ENABLE_XTL_COMPLEX
+        AND @XSIMD_VERSION_MAJOR@ LESS_EQUAL 14
+    )
+        # Only set the switch here; the xtl lookup and target setup happen below.
+        set(XSIMD_ENABLE_XTL_COMPLEX @ENABLE_XTL_COMPLEX@)
     endif()
 
-    include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake")
-    get_target_property(@PROJECT_NAME@_INCLUDE_DIRS @PROJECT_NAME@ INTERFACE_INCLUDE_DIRECTORIES)
+    # Consumers opt in to xtl complex support by setting
+    # XSIMD_ENABLE_XTL_COMPLEX before this package is found.
+    if(XSIMD_ENABLE_XTL_COMPLEX)
+        include(CMakeFindDependencyMacro)
+        find_dependency(xtl 0.8.0 REQUIRED)
+        # Propagate both the xtl dependency and the feature macro to consumers.
+        target_link_libraries(@PROJECT_NAME@ INTERFACE xtl)
+        target_compile_definitions(
+            @PROJECT_NAME@ INTERFACE XSIMD_ENABLE_XTL_COMPLEX=1
+        )
+    endif()
 endif()