diff --git a/.github/workflows/android_cmake/start.sh b/.github/workflows/android_cmake/start.sh index 4b275f01a3af..e7e6a090c4e1 100755 --- a/.github/workflows/android_cmake/start.sh +++ b/.github/workflows/android_cmake/start.sh @@ -88,6 +88,10 @@ PKG_CONFIG_LIBDIR=/tmp/install/lib/pkgconfig cmake .. \ -DSFCGAL_CONFIG=disabled \ -DHDF5_C_COMPILER_EXECUTABLE=disabled \ -DHDF5_CXX_COMPILER_EXECUTABLE=disabled + +echo "Check that GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS:BOOL=ON" +(grep "GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS:BOOL=ON" CMakeCache.txt > /dev/null && echo "yes") || (echo "Missing" && /bin/false) + make -j$(nproc) make install cd .. diff --git a/.github/workflows/cmake_builds.yml b/.github/workflows/cmake_builds.yml index 910c84966de4..938179c23dbd 100644 --- a/.github/workflows/cmake_builds.yml +++ b/.github/workflows/cmake_builds.yml @@ -606,7 +606,7 @@ jobs: - name: Setup xcode uses: maxim-lobanov/setup-xcode@60606e260d2fc5762a71e64e74b2174e8ea3c8bd # v1.6.0 with: - xcode-version: 14.3 + xcode-version: '15.4.0' - name: Checkout GDAL uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 - name: Setup cache diff --git a/.github/workflows/ubuntu_24.04/expected_ogrinfo_formats.txt b/.github/workflows/ubuntu_24.04/expected_ogrinfo_formats.txt index ef95d09f9d76..cd35f8279933 100644 --- a/.github/workflows/ubuntu_24.04/expected_ogrinfo_formats.txt +++ b/.github/workflows/ubuntu_24.04/expected_ogrinfo_formats.txt @@ -86,4 +86,5 @@ Supported Formats: (ro:read-only, rw:read-write, +:update, v:virtual-I/O s:subda TIGER -vector- (rov): U.S. Census TIGER/Line AVCBin -vector- (rov): Arc/Info Binary Coverage AVCE00 -vector- (rov): Arc/Info E00 (ASCII) Coverage (*.e00) + AIVector -vector- (ro): Artificial Intelligence powered vector driver HTTP -raster,vector- (ro): HTTP Fetching Wrapper diff --git a/.github/workflows/windows_conda_expected_ogrinfo_formats.txt b/.github/workflows/windows_conda_expected_ogrinfo_formats.txt index 908f8769c49c..c910b318a6cf 100644 --- a/.github/workflows/windows_conda_expected_ogrinfo_formats.txt +++ b/.github/workflows/windows_conda_expected_ogrinfo_formats.txt @@ -81,4 +81,5 @@ Supported Formats: (ro:read-only, rw:read-write, +:update, v:virtual-I/O s:subda TIGER -vector- (rov): U.S. Census TIGER/Line AVCBin -vector- (rov): Arc/Info Binary Coverage AVCE00 -vector- (rov): Arc/Info E00 (ASCII) Coverage (*.e00) + AIVector -vector- (ro): Artificial Intelligence powered vector driver HTTP -raster,vector- (ro): HTTP Fetching Wrapper diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f45a709ff7d0..058d13cf2877 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -56,6 +56,7 @@ repos: frmts/pcidsk/sdk| frmts/grib/degrib/degrib| frmts/grib/degrib/g2clib| + gcore/sse2neon.h| port/utf8.h| ogr/ogrsf_frmts/adbc/ogr_adbc_internal.h| ogr/ogrsf_frmts/cad/libopencad/| diff --git a/CITATION.cff b/CITATION.cff index a4b3a817404c..f55417d0e5bb 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -2,8 +2,8 @@ cff-version: 1.2.0 message: Please cite this software using these metadata or in the CITATION file. 
type: software title: GDAL -version: 3.8.3 -date-released: 2024-01-02 +version: 3.10.0 +date-released: 2024-11-01 doi: 10.5281/zenodo.5884351 abstract: GDAL is a translator library for raster and vector geospatial data formats that is released under an MIT style Open Source License by the Open diff --git a/CMakeLists.txt b/CMakeLists.txt index 07d1a09d5f2c..460537e2fcda 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,6 +94,16 @@ if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(x86|AMD64)") endif () endif () +else() + + # Check ability to use Arm Neon optimizations + include(CheckCXXSourceCompiles) + include(CMakePushCheckState) + cmake_push_check_state(RESET) + set(CMAKE_REQUIRED_INCLUDES "${CMAKE_CURRENT_SOURCE_DIR}/gcore") + check_cxx_source_compiles("#include \"include_sse2neon.h\"\nint main() { return 0; }" SSE2NEON_COMPILES) + cmake_pop_check_state() + endif () # option(CLANG_TIDY_ENABLED "Run clang-tidy with the compiler." OFF) diff --git a/Doxyfile b/Doxyfile index b22e38ec20c7..4a896612e813 100644 --- a/Doxyfile +++ b/Doxyfile @@ -415,7 +415,9 @@ RECURSIVE = NO # subdirectory from a directory tree whose root is specified with the INPUT tag. EXCLUDE = gcore/rawdataset.cpp \ - gcore/rawdataset.h + gcore/rawdataset.h \ + gcore/include_sse2neon.h \ + gcore/sse2neon.h # The EXCLUDE_SYMLINKS tag can be used select whether or not files or # directories that are symbolic links (a Unix filesystem feature) are excluded diff --git a/HOWTO-RELEASE b/HOWTO-RELEASE index 684d3b589afd..46c735f003b6 100644 --- a/HOWTO-RELEASE +++ b/HOWTO-RELEASE @@ -358,3 +358,10 @@ or your message manually approved, with an administrator of the list. 23) For bugfixes releases, forward port to master changes done in doc/source/about_no_title.rst, doc/source/download.rst and doc/source/download_past.rst + +24) For a feature release, enable a new version in the ReadTheDocs administration panel. 
+ a) Go to https://readthedocs.org/projects/gdal/versions/ + b) In the "Activate a version" tab, enter "release/X.Y" in the text entry and click on the Filter button + c) Click on the Activate button + d) Go to https://readthedocs.org/projects/gdal/, and in the "Build a version" drop-down list, + select the new "release-X.Y" label and click on the "Build version" button diff --git a/apps/argparse/argparse.hpp b/apps/argparse/argparse.hpp index a52142848192..029fa03c2c42 100644 --- a/apps/argparse/argparse.hpp +++ b/apps/argparse/argparse.hpp @@ -2086,7 +2086,7 @@ class ArgumentParser { std::stringstream stream; std::string curline("Usage: "); - curline += this->m_program_name; + curline += this->m_parser_path; const bool multiline_usage = this->m_usage_max_line_width < std::numeric_limits<std::size_t>::max(); const size_t indent_size = curline.size(); diff --git a/apps/gdalargumentparser.cpp b/apps/gdalargumentparser.cpp index 724dcd91957b..3b795325345b 100644 --- a/apps/gdalargumentparser.cpp +++ b/apps/gdalargumentparser.cpp @@ -37,7 +37,7 @@ GDALArgumentParser::GDALArgumentParser(const std::string &program_name, [this](const auto &) { std::cout << usage() << std::endl << std::endl; - std::cout << _("Note: ") << m_program_name + std::cout << _("Note: ") << m_parser_path << _(" --long-usage for full help.") << std::endl; std::exit(0); }) diff --git a/autotest/cpp/CMakeLists.txt b/autotest/cpp/CMakeLists.txt index 493f1d59f9bd..d8420177a9ff 100644 --- a/autotest/cpp/CMakeLists.txt +++ b/autotest/cpp/CMakeLists.txt @@ -77,6 +77,7 @@ add_executable( test_gdal_aaigrid.cpp test_gdal_dted.cpp test_gdal_gtiff.cpp + test_gdal_minmax_element.cpp test_gdal_pixelfn.cpp test_gdal_typetraits.cpp test_ogr.cpp diff --git a/autotest/cpp/googletest/CMakeLists.txt.in b/autotest/cpp/googletest/CMakeLists.txt.in index 1cdcf819370b..5055e6628fae 100644 --- a/autotest/cpp/googletest/CMakeLists.txt.in +++ b/autotest/cpp/googletest/CMakeLists.txt.in @@ -1,5 +1,5 @@ # Source https://github.com/google/googletest/blob/master/googletest/README.md -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(googletest-download NONE) @@ -10,8 +10,8 @@ endif() include(ExternalProject) ExternalProject_Add(googletest - URL https://github.com/google/googletest/archive/release-1.12.1.zip - URL_HASH SHA1=973e464e8936d4b79bb24f27b058aaef4150b06e + URL https://github.com/google/googletest/releases/download/v1.15.2/googletest-1.15.2.tar.gz + URL_HASH SHA1=568d58e26bd4e838449ca7ab8ebc152b3cbd210d DOWNLOAD_NO_PROGRESS ON SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build" diff --git a/autotest/cpp/test_gdal.cpp b/autotest/cpp/test_gdal.cpp index 45ebfb4ab870..9be606d82c6b 100644 --- a/autotest/cpp/test_gdal.cpp +++ b/autotest/cpp/test_gdal.cpp @@ -4777,4 +4777,143 @@ TEST_F(test_gdal, ReadRaster) } } +// Test GDALComputeRasterMinMaxLocation +TEST_F(test_gdal, GDALComputeRasterMinMaxLocation) +{ + GDALDatasetH hDS = GDALOpen(GCORE_DATA_DIR "byte.tif", GA_ReadOnly); + ASSERT_NE(hDS, nullptr); + GDALRasterBandH hBand = GDALGetRasterBand(hDS, 1); + { + double dfMin = 0; + double dfMax = 0; + int nMinX = -1; + int nMinY = -1; + int nMaxX = -1; + int nMaxY = -1; + EXPECT_EQ(GDALComputeRasterMinMaxLocation(hBand, &dfMin, &dfMax, &nMinX, + &nMinY, &nMaxX, &nMaxY), + CE_None); + EXPECT_EQ(dfMin, 74.0); + EXPECT_EQ(dfMax, 255.0); + EXPECT_EQ(nMinX, 9); + EXPECT_EQ(nMinY, 17); + EXPECT_EQ(nMaxX, 2); + EXPECT_EQ(nMaxY, 18); + GByte val = 0; +
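+ // Cross-check: re-read the single pixel at the reported minimum location
+ // with a 1x1 GDALRasterIO request and verify it holds the value returned
+ // in dfMin; the maximum location is verified the same way just below.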
EXPECT_EQ(GDALRasterIO(hBand, GF_Read, nMinX, nMinY, 1, 1, &val, 1, 1, + GDT_Byte, 0, 0), + CE_None); + EXPECT_EQ(val, 74); + EXPECT_EQ(GDALRasterIO(hBand, GF_Read, nMaxX, nMaxY, 1, 1, &val, 1, 1, + GDT_Byte, 0, 0), + CE_None); + EXPECT_EQ(val, 255); + } + { + int nMinX = -1; + int nMinY = -1; + EXPECT_EQ(GDALComputeRasterMinMaxLocation(hBand, nullptr, nullptr, + &nMinX, &nMinY, nullptr, + nullptr), + CE_None); + EXPECT_EQ(nMinX, 9); + EXPECT_EQ(nMinY, 17); + } + { + int nMaxX = -1; + int nMaxY = -1; + EXPECT_EQ(GDALComputeRasterMinMaxLocation(hBand, nullptr, nullptr, + nullptr, nullptr, &nMaxX, + &nMaxY), + CE_None); + EXPECT_EQ(nMaxX, 2); + EXPECT_EQ(nMaxY, 18); + } + { + EXPECT_EQ(GDALComputeRasterMinMaxLocation(hBand, nullptr, nullptr, + nullptr, nullptr, nullptr, + nullptr), + CE_None); + } + GDALClose(hDS); +} + +// Test GDALComputeRasterMinMaxLocation +TEST_F(test_gdal, GDALComputeRasterMinMaxLocation_byte_min_max_optim) +{ + GDALDatasetUniquePtr poDS(GDALDriver::FromHandle(GDALGetDriverByName("MEM")) + ->Create("", 1, 4, 1, GDT_Byte, nullptr)); + std::array buffer = { + 1, ////////////////////////////////////////////////////////// + 0, ////////////////////////////////////////////////////////// + 255, ////////////////////////////////////////////////////////// + 1, ////////////////////////////////////////////////////////// + }; + GDALRasterIOExtraArg sExtraArg; + INIT_RASTERIO_EXTRA_ARG(sExtraArg); + EXPECT_EQ(poDS->GetRasterBand(1)->RasterIO( + GF_Write, 0, 0, 1, 4, buffer.data(), 1, 4, GDT_Byte, + sizeof(uint8_t), 1 * sizeof(uint8_t), &sExtraArg), + CE_None); + + double dfMin = 0; + double dfMax = 0; + int nMinX = -1; + int nMinY = -1; + int nMaxX = -1; + int nMaxY = -1; + EXPECT_EQ(poDS->GetRasterBand(1)->ComputeRasterMinMaxLocation( + &dfMin, &dfMax, &nMinX, &nMinY, &nMaxX, &nMaxY), + CE_None); + EXPECT_EQ(dfMin, 0); + EXPECT_EQ(dfMax, 255); + EXPECT_EQ(nMinX, 0); + EXPECT_EQ(nMinY, 1); + EXPECT_EQ(nMaxX, 0); + EXPECT_EQ(nMaxY, 2); +} + +// Test GDALComputeRasterMinMaxLocation +TEST_F(test_gdal, GDALComputeRasterMinMaxLocation_with_mask) +{ + GDALDatasetUniquePtr poDS(GDALDriver::FromHandle(GDALGetDriverByName("MEM")) + ->Create("", 2, 2, 1, GDT_Byte, nullptr)); + std::array buffer = { + 2, 10, ////////////////////////////////////////////////////////// + 4, 20, ////////////////////////////////////////////////////////// + }; + GDALRasterIOExtraArg sExtraArg; + INIT_RASTERIO_EXTRA_ARG(sExtraArg); + EXPECT_EQ(poDS->GetRasterBand(1)->RasterIO( + GF_Write, 0, 0, 2, 2, buffer.data(), 2, 2, GDT_Byte, + sizeof(uint8_t), 2 * sizeof(uint8_t), &sExtraArg), + CE_None); + + poDS->GetRasterBand(1)->CreateMaskBand(0); + std::array buffer_mask = { + 0, 255, ////////////////////////////////////////////////////////// + 255, 0, ////////////////////////////////////////////////////////// + }; + EXPECT_EQ(poDS->GetRasterBand(1)->GetMaskBand()->RasterIO( + GF_Write, 0, 0, 2, 2, buffer_mask.data(), 2, 2, GDT_Byte, + sizeof(uint8_t), 2 * sizeof(uint8_t), &sExtraArg), + CE_None); + + double dfMin = 0; + double dfMax = 0; + int nMinX = -1; + int nMinY = -1; + int nMaxX = -1; + int nMaxY = -1; + EXPECT_EQ(poDS->GetRasterBand(1)->ComputeRasterMinMaxLocation( + &dfMin, &dfMax, &nMinX, &nMinY, &nMaxX, &nMaxY), + CE_None); + EXPECT_EQ(dfMin, 4); + EXPECT_EQ(dfMax, 10); + EXPECT_EQ(nMinX, 0); + EXPECT_EQ(nMinY, 1); + EXPECT_EQ(nMaxX, 1); + EXPECT_EQ(nMaxY, 0); +} + } // namespace diff --git a/autotest/cpp/test_gdal_minmax_element.cpp b/autotest/cpp/test_gdal_minmax_element.cpp new file mode 100644 index 
000000000000..b6d681dda76d --- /dev/null +++ b/autotest/cpp/test_gdal_minmax_element.cpp @@ -0,0 +1,900 @@ +/////////////////////////////////////////////////////////////////////////////// +// +// Project: C++ Test Suite for GDAL/OGR +// Purpose: Test gdal_minmax_element.hpp +// Author: Even Rouault +// +/////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2023, Even Rouault +/* + * SPDX-License-Identifier: MIT + ****************************************************************************/ + +#include "gdal_unit_test.h" + +#include "gdal_minmax_element.hpp" + +#include "gtest_include.h" + +#include + +namespace +{ + +struct test_gdal_minmax_element : public ::testing::Test +{ +}; + +TEST_F(test_gdal_minmax_element, uint8) +{ + using T = uint8_t; + constexpr GDALDataType eDT = GDT_Byte; + T min_v = 3; + T max_v = 7; + { + T nodata = 0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), + static_cast(max_v - 1), + max_v, + static_cast(max_v - 1), + static_cast(min_v + 1), + min_v, + static_cast(min_v + 1)}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[125] = static_cast(min_v + 1); + v[126] = min_v; + v[127] = static_cast(min_v + 1); + v[128] = static_cast(max_v - 1); + v[129] = max_v; + v[130] = static_cast(max_v - 1); + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } + { + std::vector v(257, 0); + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, 0); + EXPECT_TRUE(idx_min == 0 || idx_min == 256) << idx_min; + } + { + std::vector v(257, 0); + v[127] = static_cast(min_v + 1); + v[255] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, 0); + EXPECT_EQ(v[idx_min], min_v); + } + { + std::vector v(259, static_cast((min_v + max_v) / 2)); + v[0] = min_v; + v[256] = static_cast(max_v - 1); + v[257] = max_v; + v[258] = static_cast(max_v - 1); + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + 
EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[0] = min_v; + v[127] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[127] = min_v; + v[0] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[0] = min_v; + v[129] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[129] = min_v; + v[0] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[129] = min_v; + v[256] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[256] = min_v; + v[129] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, 0); + v[65] = static_cast(max_v - 2); + v[66] = static_cast(max_v - 1); + v[129] = max_v; + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, 0); + EXPECT_EQ(v[idx_max], max_v); + } +} + +TEST_F(test_gdal_minmax_element, int8) +{ + using T = int8_t; + T min_v = -1; + T max_v = 3; + constexpr GDALDataType eDT = GDT_Int8; + { + T nodata = 0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = 
gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, uint16) +{ + using T = uint16_t; + constexpr GDALDataType eDT = GDT_UInt16; + T min_v = 1000; + T max_v = 2000; + { + T nodata = 0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, int16) +{ + using T = int16_t; + constexpr GDALDataType eDT = GDT_Int16; + T min_v = -1000; + T max_v = 2000; + { + T nodata = 0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + 
EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, uint32) +{ + using T = uint32_t; + constexpr GDALDataType eDT = GDT_UInt32; + T min_v = 10000000; + T max_v = 20000000; + { + T nodata = 0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, int32) +{ + using T = int32_t; + constexpr GDALDataType eDT = GDT_Int32; + T min_v = -10000000; + T max_v = 20000000; + { + T nodata = 0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto 
idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, uint64) +{ + using T = uint64_t; + constexpr GDALDataType eDT = GDT_UInt64; + T min_v = 100000000000000; + T max_v = 200000000000000; + { + double nodata = 0; + std::vector v{max_v, static_cast(nodata), min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + double nodata = 0; + std::vector v{static_cast(nodata), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(static_cast(min_v + 1))); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, int64) +{ + using T = int64_t; + constexpr GDALDataType eDT = GDT_Int64; + T min_v = -100000000000000; + T max_v = 200000000000000; + { + double nodata = 0; + std::vector v{max_v, static_cast(nodata), min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } 
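+ // gdal::minmax_element() locates both extremes in one call and returns a
+ // pair of indices, minimum first and maximum second, which structured
+ // bindings can unpack directly.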
+ { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + double nodata = 0; + std::vector v{static_cast(nodata), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), + max_v - 1, + max_v, + max_v - 1, + min_v + 1, + min_v, + min_v + 1}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(static_cast(min_v + 1))); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, float32) +{ + using T = float; + constexpr GDALDataType eDT = GDT_Float32; + T min_v = 1.0f; + T max_v = 1.5f; + { + T nodata = 2.0f; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 2.0f; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + T nodata = 2.0f; + std::vector v{std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), nodata, max_v, + min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + T nodata = std::numeric_limits::quiet_NaN(); + std::vector v{std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), nodata, max_v, + min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + max_v, + std::numeric_limits::quiet_NaN(), + min_v, + std::numeric_limits::quiet_NaN()}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + 
{ + std::vector v{max_v, std::numeric_limits::quiet_NaN(), min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, std::numeric_limits::quiet_NaN()); + v[125] = static_cast(min_v + 0.1f); + v[126] = min_v; + v[127] = static_cast(min_v + 0.1f); + v[128] = static_cast(max_v - 0.1f); + v[129] = max_v; + v[130] = static_cast(max_v - 0.1f); + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(33, 1.2f); + v[5] = min_v; + v[15] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(255, std::numeric_limits::quiet_NaN()); + v[v.size() - 2] = min_v; + v.back() = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 0.2f)); + v[128] = static_cast(min_v + 0.1f); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 0.1f)); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, float64) +{ + using T = double; + constexpr GDALDataType eDT = GDT_Float64; + T min_v = 1.0; + T max_v = 1.5; + { + T nodata = 2.0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 2.0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + T nodata = 2.0; + std::vector v{std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), nodata, max_v, + min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{max_v, std::numeric_limits::quiet_NaN(), min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + max_v, + std::numeric_limits::quiet_NaN(), + min_v, + std::numeric_limits::quiet_NaN()}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], 
min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(33, std::numeric_limits::quiet_NaN()); + v[5] = min_v; + v[15] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(255, std::numeric_limits::quiet_NaN()); + v[v.size() - 2] = min_v; + v.back() = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 0.2)); + v[128] = static_cast(min_v + 0.1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 0.1)); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, unsupported) +{ + float v[2] = {0, 0}; + CPLErrorHandlerPusher oErrorHandler(CPLQuietErrorHandler); + { + CPLErrorReset(); + EXPECT_EQ(gdal::min_element(v, 1, GDT_CFloat32, false, 0), 0); + EXPECT_EQ(CPLGetLastErrorNo(), CPLE_NotSupported); + } + { + CPLErrorReset(); + EXPECT_EQ(gdal::max_element(v, 1, GDT_CFloat32, false, 0), 0); + EXPECT_EQ(CPLGetLastErrorNo(), CPLE_NotSupported); + } + { + CPLErrorReset(); + auto [idx_min, idx_max] = + gdal::minmax_element(v, 1, GDT_CFloat32, false, 0); + EXPECT_EQ(idx_min, 0); + EXPECT_EQ(idx_max, 0); + EXPECT_EQ(CPLGetLastErrorNo(), CPLE_NotSupported); + } +} + +} // namespace diff --git a/autotest/cpp/test_ogr.cpp b/autotest/cpp/test_ogr.cpp index b43f40d1cd65..8a91c026f57b 100644 --- a/autotest/cpp/test_ogr.cpp +++ b/autotest/cpp/test_ogr.cpp @@ -69,6 +69,7 @@ void testSpatialReferenceLeakOnCopy(OGRSpatialReference *poSRS) ASSERT_GT(nCurCount, nLastCount); nLastCount = nCurCount; + // coverity[copy_assignment_call] value3 = value; ASSERT_EQ(nLastCount, poSRS->GetReferenceCount()); } @@ -327,6 +328,7 @@ TEST_F(test_ogr, OGRGeometryCollection_copy_constructor_illegal_use) CPLErrorReset(); { CPLErrorHandlerPusher oPusher(CPLQuietErrorHandler); + // coverity[copy_assignment_call] *mp_as_gc = gc; } EXPECT_STREQ(CPLGetLastErrorMsg(), @@ -360,6 +362,7 @@ TEST_F(test_ogr, OGRCurvePolygon_copy_constructor_illegal_use) CPLErrorReset(); { CPLErrorHandlerPusher oPusher(CPLQuietErrorHandler); + // coverity[copy_assignment_call] *poly_as_cp = cp; } EXPECT_STREQ(CPLGetLastErrorMsg(), @@ -368,6 +371,60 @@ TEST_F(test_ogr, OGRCurvePolygon_copy_constructor_illegal_use) EXPECT_TRUE(poly.IsEmpty()); } +template void testMove() +{ + auto poSRS = new OGRSpatialReference(); + { + auto poOrigin = std::unique_ptr(make()); + ASSERT_TRUE(nullptr != poOrigin); + poOrigin->assignSpatialReference(poSRS); + + T valueCopy(*poOrigin); + const int refCountBefore = poSRS->GetReferenceCount(); + T fromMoved(std::move(*poOrigin)); + EXPECT_EQ(poSRS->GetReferenceCount(), refCountBefore); + + ASSERT_TRUE(CPL_TO_BOOL(fromMoved.Equals(&valueCopy))) + << valueCopy.getGeometryName() + << ": move constructor changed a value"; + EXPECT_EQ(fromMoved.getSpatialReference(), poSRS); + + T valueCopy2(valueCopy); + EXPECT_EQ(valueCopy.getSpatialReference(), poSRS); + T value3; + const int refCountBefore2 = poSRS->GetReferenceCount(); + value3 = std::move(valueCopy); + EXPECT_EQ(poSRS->GetReferenceCount(), refCountBefore2); + + ASSERT_TRUE(CPL_TO_BOOL(value3.Equals(&valueCopy2))) + << 
valueCopy2.getGeometryName() + << ": move assignment operator changed a value"; + EXPECT_EQ(value3.getSpatialReference(), poSRS); + } + EXPECT_EQ(poSRS->GetReferenceCount(), 1); + poSRS->Release(); +} + +TEST_F(test_ogr, geometry_move) +{ + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); +} + TEST_F(test_ogr, geometry_get_point) { { @@ -4241,9 +4298,8 @@ TEST_F(test_ogr, OGRCurve_reversePoints) TEST_F(test_ogr, transformWithOptions) { // Projected CRS to national geographic CRS (not including poles or antimeridian) - OGRGeometry *poGeom = nullptr; - OGRGeometryFactory::createFromWkt( - "LINESTRING(700000 6600000, 700001 6600001)", nullptr, &poGeom); + auto [poGeom, err] = OGRGeometryFactory::createFromWkt( + "LINESTRING(700000 6600000, 700001 6600001)"); ASSERT_NE(poGeom, nullptr); OGRSpatialReference oEPSG_2154; @@ -4254,12 +4310,12 @@ TEST_F(test_ogr, transformWithOptions) auto poCT = std::unique_ptr( OGRCreateCoordinateTransformation(&oEPSG_2154, &oEPSG_4171)); OGRGeometryFactory::TransformWithOptionsCache oCache; - poGeom = OGRGeometryFactory::transformWithOptions(poGeom, poCT.get(), - nullptr, oCache); - EXPECT_NEAR(poGeom->toLineString()->getX(0), 3, 1e-8); - EXPECT_NEAR(poGeom->toLineString()->getY(0), 46.5, 1e-8); - - delete poGeom; + auto poNewGeom = + std::unique_ptr(OGRGeometryFactory::transformWithOptions( + poGeom.get(), poCT.get(), nullptr, oCache)); + ASSERT_NE(poNewGeom, nullptr); + EXPECT_NEAR(poNewGeom->toLineString()->getX(0), 3, 1e-8); + EXPECT_NEAR(poNewGeom->toLineString()->getY(0), 46.5, 1e-8); } #ifdef HAVE_GEOS @@ -4268,10 +4324,8 @@ TEST_F(test_ogr, transformWithOptions) TEST_F(test_ogr, transformWithOptions_GEOS) { // Projected CRS to national geographic CRS including antimeridian - OGRGeometry *poGeom = nullptr; - OGRGeometryFactory::createFromWkt( - "LINESTRING(657630.64 4984896.17,815261.43 4990738.26)", nullptr, - &poGeom); + auto [poGeom, err] = OGRGeometryFactory::createFromWkt( + "LINESTRING(657630.64 4984896.17,815261.43 4990738.26)"); ASSERT_NE(poGeom, nullptr); OGRSpatialReference oEPSG_6329; @@ -4282,12 +4336,14 @@ TEST_F(test_ogr, transformWithOptions_GEOS) auto poCT = std::unique_ptr( OGRCreateCoordinateTransformation(&oEPSG_6329, &oEPSG_6318)); OGRGeometryFactory::TransformWithOptionsCache oCache; - poGeom = OGRGeometryFactory::transformWithOptions(poGeom, poCT.get(), - nullptr, oCache); - EXPECT_EQ(poGeom->getGeometryType(), wkbMultiLineString); - if (poGeom->getGeometryType() == wkbMultiLineString) - { - const auto poMLS = poGeom->toMultiLineString(); + auto poNewGeom = + std::unique_ptr(OGRGeometryFactory::transformWithOptions( + poGeom.get(), poCT.get(), nullptr, oCache)); + ASSERT_NE(poNewGeom, nullptr); + EXPECT_EQ(poNewGeom->getGeometryType(), wkbMultiLineString); + if (poNewGeom->getGeometryType() == wkbMultiLineString) + { + const auto poMLS = poNewGeom->toMultiLineString(); EXPECT_EQ(poMLS->getNumGeometries(), 2); if (poMLS->getNumGeometries() == 2) { @@ -4302,8 +4358,6 @@ TEST_F(test_ogr, transformWithOptions_GEOS) } } } - - delete poGeom; } #endif diff --git a/autotest/gcore/basic_test.py b/autotest/gcore/basic_test.py index 3b399d820925..33f9e58a36ab 100755 --- a/autotest/gcore/basic_test.py +++ b/autotest/gcore/basic_test.py @@ -987,3 +987,22 @@ def test_colorinterp(): assert name not in d d[name] = c assert gdal.GetColorInterpretationByName(name) == c + + +def 
test_ComputeMinMaxLocation(): + + ds = gdal.Open("data/byte.tif") + ret = ds.GetRasterBand(1).ComputeMinMaxLocation() + assert ( + ret.min == 74 + and ret.max == 255 + and ret.minX == 9 + and ret.minY == 17 + and ret.maxX == 2 + and ret.maxY == 18 + ) + + ds = gdal.GetDriverByName("MEM").Create("", 1, 1, 1, gdal.GDT_Float64) + ds.GetRasterBand(1).Fill(float("nan")) + ret = ds.GetRasterBand(1).ComputeMinMaxLocation() + assert ret is None diff --git a/autotest/gcore/cog.py b/autotest/gcore/cog.py index 3ef850bb019c..0e58f0e18af7 100755 --- a/autotest/gcore/cog.py +++ b/autotest/gcore/cog.py @@ -13,6 +13,7 @@ # SPDX-License-Identifier: MIT ############################################################################### +import os import struct import sys @@ -1935,3 +1936,29 @@ def test_cog_mask_band_overviews(tmp_vsimem): assert ds.GetRasterBand(1).IsMaskBand() assert ds.GetRasterBand(1).GetOverview(0).IsMaskBand() assert ds.GetRasterBand(1).GetOverview(1).IsMaskBand() + + +############################################################################### +# Verify that we can generate an output that is byte-identical to the expected golden file. + + +@pytest.mark.parametrize( + "src_filename,creation_options", + [ + ("data/cog/byte_little_endian_golden.tif", []), + ( + "data/cog/byte_little_endian_blocksize_16_predictor_standard_golden.tif", + ["BLOCKSIZE=16", "PREDICTOR=STANDARD"], + ), + ], +) +def test_cog_write_check_golden_file(tmp_path, src_filename, creation_options): + + out_filename = str(tmp_path / "test.tif") + with gdal.config_option("GDAL_TIFF_ENDIANNESS", "LITTLE"): + with gdal.Open(src_filename) as src_ds: + gdal.GetDriverByName("COG").CreateCopy( + out_filename, src_ds, options=creation_options + ) + assert os.stat(src_filename).st_size == os.stat(out_filename).st_size + assert open(src_filename, "rb").read() == open(out_filename, "rb").read() diff --git a/autotest/gcore/data/cog/byte_little_endian_blocksize_16_predictor_standard_golden.tif b/autotest/gcore/data/cog/byte_little_endian_blocksize_16_predictor_standard_golden.tif new file mode 100644 index 000000000000..aaade76e71f7 Binary files /dev/null and b/autotest/gcore/data/cog/byte_little_endian_blocksize_16_predictor_standard_golden.tif differ diff --git a/autotest/gcore/data/cog/byte_little_endian_golden.tif b/autotest/gcore/data/cog/byte_little_endian_golden.tif new file mode 100644 index 000000000000..4ace013ced02 Binary files /dev/null and b/autotest/gcore/data/cog/byte_little_endian_golden.tif differ diff --git a/autotest/gcore/data/gtiff/byte_little_endian_golden.tif b/autotest/gcore/data/gtiff/byte_little_endian_golden.tif new file mode 100644 index 000000000000..a4cbc947432c Binary files /dev/null and b/autotest/gcore/data/gtiff/byte_little_endian_golden.tif differ diff --git a/autotest/gcore/data/gtiff/byte_little_endian_tiled_lzw_golden.tif b/autotest/gcore/data/gtiff/byte_little_endian_tiled_lzw_golden.tif new file mode 100644 index 000000000000..b43311f0e16a Binary files /dev/null and b/autotest/gcore/data/gtiff/byte_little_endian_tiled_lzw_golden.tif differ diff --git a/autotest/gcore/data/gtiff/float32_little_endian_golden.tif b/autotest/gcore/data/gtiff/float32_little_endian_golden.tif new file mode 100644 index 000000000000..96e3dfd31339 Binary files /dev/null and b/autotest/gcore/data/gtiff/float32_little_endian_golden.tif differ diff --git a/autotest/gcore/data/gtiff/uint16_little_endian_golden.tif b/autotest/gcore/data/gtiff/uint16_little_endian_golden.tif new file mode 100644 index 
000000000000..2ea26292221b Binary files /dev/null and b/autotest/gcore/data/gtiff/uint16_little_endian_golden.tif differ diff --git a/autotest/gcore/misc.py b/autotest/gcore/misc.py index 18d1f4c806c0..62f8b29a5663 100755 --- a/autotest/gcore/misc.py +++ b/autotest/gcore/misc.py @@ -13,6 +13,7 @@ # SPDX-License-Identifier: MIT ############################################################################### +import datetime import os import shutil @@ -721,6 +722,70 @@ def test_misc_13(): assert out_ds is None +############################################################################### +# Test parsing of CPL_DEBUG and CPL_TIMESTAMP + + +@pytest.fixture +def debug_output(): + + messages = [] + + def handle(ecls, ecode, emsg): + messages.append(emsg) + + def log_message(category, message): + messages.clear() + gdal.Debug(category, message) + return messages[0] if messages else None + + log_message.handle = handle + + with gdaltest.error_handler(handle): + yield log_message + + +@pytest.mark.parametrize( + "booleans", + [("YES", "NO"), ("TRUE", "FALSE"), ("ON", "OFF"), ("1", "0")], + ids="_".join, +) +def test_misc_cpl_debug(debug_output, booleans): + + on, off = booleans + + assert debug_output("GDAL", "msg") is None + + with gdal.config_option("CPL_DEBUG", off): + assert debug_output("GDAL", "msg") is None + + with gdal.config_option("CPL_DEBUG", on): + assert debug_output("GDAL", "message") == "GDAL: message" + + with gdal.config_option("CPL_TIMESTAMP", off): + assert debug_output("GDAL", "message") == "GDAL: message" + + with gdal.config_option("CPL_TIMESTAMP", on): + output = debug_output("GDAL", "message") + assert str(datetime.datetime.now().year) in output + assert output.endswith("GDAL: message") + + +def test_misc_cpl_debug_filtering(debug_output): + + with gdal.config_option("CPL_DEBUG", "GDAL"): + assert debug_output("GDAL", "msg") == "GDAL: msg" + assert debug_output("GDAL_WARP", "msg") is None + assert debug_output("", "msg") == ": msg" + + with gdal.config_option("CPL_DEBUG", "GDAL_WARP_TRANSLATE_ETC"): + assert debug_output("GDAL", "msg") == "GDAL: msg" + assert debug_output("TRANSLATE", "msg") == "TRANSLATE: msg" + + with gdal.config_option("CPL_DEBUG", ""): + assert debug_output("GDAL", "msg") == "GDAL: msg" + + ############################################################################### # Test ConfigureLogging() diff --git a/autotest/gcore/tiff_write.py b/autotest/gcore/tiff_write.py index f803b95d24b8..3e2c8d9de67b 100755 --- a/autotest/gcore/tiff_write.py +++ b/autotest/gcore/tiff_write.py @@ -11901,3 +11901,30 @@ def test_tiff_write_band_IMAGERY(tmp_vsimem): ) with gdal.Open(filename2) as ds: assert ds.GetRasterBand(1).GetMetadata_Dict("IMAGERY") == {"foo": "bar"} + + +############################################################################### +# Verify that we can generate an output that is byte-identical to the expected golden file. 
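The golden-file tests that follow (GTiff below, then JPEG2000 and netCDF) all share the same shape, distilled in this minimal sketch; the helper name and its parameters are illustrative, not part of the patch:

import os

from osgeo import gdal


def check_identical_to_golden(golden_path, out_path, driver_name, creation_options):
    # Regenerate the file from the golden copy itself, then require that the
    # size matches (a cheap first check) before comparing every byte.
    with gdal.Open(golden_path) as src_ds:
        gdal.GetDriverByName(driver_name).CreateCopy(
            out_path, src_ds, options=creation_options
        )
    assert os.stat(golden_path).st_size == os.stat(out_path).st_size
    assert open(golden_path, "rb").read() == open(out_path, "rb").read()

For GTiff, the test below additionally prepends ENDIANNESS=LITTLE to the creation options so that the byte-level comparison does not depend on the host byte order.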
+ + +@pytest.mark.parametrize( + "src_filename,creation_options", + [ + ("data/gtiff/byte_little_endian_golden.tif", []), + ("data/gtiff/uint16_little_endian_golden.tif", []), + ("data/gtiff/float32_little_endian_golden.tif", []), + ( + "data/gtiff/byte_little_endian_tiled_lzw_golden.tif", + ["TILED=YES", "BLOCKXSIZE=16", "BLOCKYSIZE=16", "COMPRESS=LZW"], + ), + ], +) +def test_tiff_write_check_golden_file(tmp_path, src_filename, creation_options): + + out_filename = str(tmp_path / "test.tif") + with gdal.Open(src_filename) as src_ds: + gdal.GetDriverByName("GTiff").CreateCopy( + out_filename, src_ds, options=["ENDIANNESS=LITTLE"] + creation_options + ) + assert os.stat(src_filename).st_size == os.stat(out_filename).st_size + assert open(src_filename, "rb").read() == open(out_filename, "rb").read() diff --git a/autotest/gdrivers/data/jpeg2000/byte_lossless_openjp2_golden.jp2 b/autotest/gdrivers/data/jpeg2000/byte_lossless_openjp2_golden.jp2 new file mode 100644 index 000000000000..049871c52ec8 Binary files /dev/null and b/autotest/gdrivers/data/jpeg2000/byte_lossless_openjp2_golden.jp2 differ diff --git a/autotest/gdrivers/data/netcdf/byte_nc3_golden.nc b/autotest/gdrivers/data/netcdf/byte_nc3_golden.nc new file mode 100644 index 000000000000..09b43e1a71fb Binary files /dev/null and b/autotest/gdrivers/data/netcdf/byte_nc3_golden.nc differ diff --git a/autotest/gdrivers/jp2openjpeg.py b/autotest/gdrivers/jp2openjpeg.py index 387a15cc0756..1aa3439bba66 100755 --- a/autotest/gdrivers/jp2openjpeg.py +++ b/autotest/gdrivers/jp2openjpeg.py @@ -3920,3 +3920,29 @@ def test_jp2openjpeg_unsupported_srs_for_gmljp2(tmp_vsimem): assert ds.GetSpatialRef().IsSame(ref_srs) # Check that we do *not* have a GMLJP2 box assert "xml:gml.root-instance" not in ds.GetMetadataDomainList() + + +############################################################################### +# Verify that we can generate an output that is byte-identical to the expected golden file. +# (might be risky depending on libopenjp2...) + + +@pytest.mark.parametrize( + "src_filename,creation_options", + [ + # Created with gdal_translate autotest/gcore/data/byte.tif autotest/gdrivers/data/jpeg2000/byte_lossless_openjp2_golden.jp2 -of jp2openjpeg -co QUALITY=100 -co REVERSIBLE=YES -co COMMENT= + ( + "data/jpeg2000/byte_lossless_openjp2_golden.jp2", + ["QUALITY=100", "REVERSIBLE=YES", "COMMENT="], + ), + ], +) +def test_jp2openjpeg_write_check_golden_file(tmp_path, src_filename, creation_options): + + out_filename = str(tmp_path / "test.jp2") + with gdal.Open(src_filename) as src_ds: + gdal.GetDriverByName("JP2OpenJPEG").CreateCopy( + out_filename, src_ds, options=creation_options + ) + assert os.stat(src_filename).st_size == os.stat(out_filename).st_size + assert open(src_filename, "rb").read() == open(out_filename, "rb").read() diff --git a/autotest/gdrivers/netcdf.py b/autotest/gdrivers/netcdf.py index 21c0b3026ca6..6622974f49b4 100755 --- a/autotest/gdrivers/netcdf.py +++ b/autotest/gdrivers/netcdf.py @@ -6583,3 +6583,36 @@ def test_netcdf_extra_dim_no_georef(tmp_path): ds = gdal.Open(fname) assert ds.RasterCount == 4 assert ds.ReadRaster() == src_ds.ReadRaster() + + +############################################################################### +# Verify that we can generate an output that is byte-identical to the expected golden file. +# (might be risky depending on libnetcdf...)
+ + +@pytest.mark.parametrize( + "src_filename,golden_file,creation_options", + [ + # Created with gdal_translate autotest/gcore/data/byte.tif autotest/gdrivers/data/netcdf/byte_nc3_golden.nc -co WRITE_GDAL_VERSION=NO -co WRITE_GDAL_HISTORY=NO -co FORMAT=NC + ( + "../gcore/data/byte.tif", + "data/netcdf/byte_nc3_golden.nc", + ["WRITE_GDAL_VERSION=NO", "WRITE_GDAL_HISTORY=NO", "FORMAT=NC"], + ), + ], +) +# I have a feeling that netCDF output might be host-endianness dependent... +@pytest.mark.skipif( + sys.byteorder != "little", reason="only supported on little-endian hosts" +) +def test_netcdf_write_check_golden_file( + tmp_path, src_filename, golden_file, creation_options +): + + out_filename = str(tmp_path / "test.nc") + with gdal.Open(src_filename) as src_ds: + gdal.GetDriverByName("netCDF").CreateCopy( + out_filename, src_ds, options=creation_options + ) + assert os.stat(golden_file).st_size == os.stat(out_filename).st_size + assert open(golden_file, "rb").read() == open(out_filename, "rb").read() diff --git a/autotest/gnm/gnm_test.py b/autotest/gnm/gnm_test.py index 530d2db8fdfd..f82cbaf71270 100755 --- a/autotest/gnm/gnm_test.py +++ b/autotest/gnm/gnm_test.py @@ -36,7 +36,7 @@ def test_gnm_filenetwork_create(): pass drv = gdal.GetDriverByName("GNMFile") - ds = drv.Create( + with drv.Create( "tmp/", 0, 0, @@ -47,17 +47,15 @@ "net_description=Test file based GNM", "net_srs=EPSG:4326", ], - ) - # cast to GNM - dn = gnm.CastToNetwork(ds) - assert dn is not None - assert dn.GetVersion() == 100, "GNM: Check GNM version failed" - assert dn.GetName() == "test_gnm", "GNM: Check GNM name failed" - assert ( - dn.GetDescription() == "Test file based GNM" - ), "GNM: Check GNM description failed" - - dn = None + ) as ds: + # cast to GNM + dn = gnm.CastToNetwork(ds) + assert dn is not None + assert dn.GetVersion() == 100, "GNM: Check GNM version failed" + assert dn.GetName() == "test_gnm", "GNM: Check GNM name failed" + assert ( + dn.GetDescription() == "Test file based GNM" + ), "GNM: Check GNM description failed" ############################################################################### diff --git a/autotest/ogr/data/gpkg/poly_golden.gpkg b/autotest/ogr/data/gpkg/poly_golden.gpkg new file mode 100644 index 000000000000..ab45c9747eee Binary files /dev/null and b/autotest/ogr/data/gpkg/poly_golden.gpkg differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtable new file mode 100644 index 000000000000..ca18a72cc503 Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtable differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtablx new file mode 100644 index 000000000000..d3c0d9e20c02 Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtablx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000002.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000002.gdbtable new file mode 100644 index 000000000000..b4ed54fbfb33 Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000002.gdbtable differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000002.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000002.gdbtablx new file mode 100644 index 000000000000..29323984e52a Binary files /dev/null and
b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000002.gdbtablx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtable new file mode 100644 index 000000000000..29b5907954ae Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtable differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtablx new file mode 100644 index 000000000000..4cf22da58b1f Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtablx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000004.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000004.gdbtable new file mode 100644 index 000000000000..e731d3f0c724 Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000004.gdbtable differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000004.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000004.gdbtablx new file mode 100644 index 000000000000..0209415ef5ba Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000004.gdbtablx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000005.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000005.gdbtable new file mode 100644 index 000000000000..d717f9fa1a7c Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000005.gdbtable differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000005.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000005.gdbtablx new file mode 100644 index 000000000000..259a71f00d2c Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000005.gdbtablx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000006.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000006.gdbtable new file mode 100644 index 000000000000..7c8b607cb27b Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000006.gdbtable differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000006.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000006.gdbtablx new file mode 100644 index 000000000000..92567c81d8c1 Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000006.gdbtablx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtable new file mode 100644 index 000000000000..3e11ac8a531f Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtable differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtablx new file mode 100644 index 000000000000..8a2df70bfbfb Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtablx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbindexes b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbindexes new file mode 100644 index 000000000000..cc24e2a06b9b Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbindexes differ diff --git 
a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtable new file mode 100644 index 000000000000..5cde63fc1c8e Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtable differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtablx new file mode 100644 index 000000000000..c8318b15c834 Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtablx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.spx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.spx new file mode 100644 index 000000000000..44769469081c Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.spx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/gdb b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/gdb new file mode 100644 index 000000000000..506f9c628294 Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/gdb differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/timestamps b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/timestamps new file mode 100644 index 000000000000..05d2b9440ec0 --- /dev/null +++ b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/timestamps @@ -0,0 +1 @@ +ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿ \ No newline at end of file diff --git a/autotest/ogr/data/shp/poly_golden/poly.dbf b/autotest/ogr/data/shp/poly_golden/poly.dbf new file mode 100644 index 000000000000..ad76f9f42a5b Binary files /dev/null and b/autotest/ogr/data/shp/poly_golden/poly.dbf differ diff --git a/autotest/ogr/data/shp/poly_golden/poly.prj b/autotest/ogr/data/shp/poly_golden/poly.prj new file mode 100644 index 000000000000..fec0ee28909b --- /dev/null +++ b/autotest/ogr/data/shp/poly_golden/poly.prj @@ -0,0 +1 @@ +PROJCS["British_National_Grid",GEOGCS["GCS_OSGB_1936",DATUM["D_OSGB_1936",SPHEROID["Airy_1830",6377563.396,299.3249646]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Transverse_Mercator"],PARAMETER["False_Easting",400000.0],PARAMETER["False_Northing",-100000.0],PARAMETER["Central_Meridian",-2.0],PARAMETER["Scale_Factor",0.9996012717],PARAMETER["Latitude_Of_Origin",49.0],UNIT["Meter",1.0]] \ No newline at end of file diff --git a/autotest/ogr/data/shp/poly_golden/poly.shp b/autotest/ogr/data/shp/poly_golden/poly.shp new file mode 100644 index 000000000000..98951531b18c Binary files /dev/null and b/autotest/ogr/data/shp/poly_golden/poly.shp differ diff --git a/autotest/ogr/data/shp/poly_golden/poly.shx b/autotest/ogr/data/shp/poly_golden/poly.shx new file mode 100644 index 000000000000..134898b3895e Binary files /dev/null and b/autotest/ogr/data/shp/poly_golden/poly.shx differ diff --git a/autotest/ogr/ogr_aivector.py b/autotest/ogr/ogr_aivector.py new file mode 100755 index 000000000000..32c194666cc9 --- /dev/null +++ b/autotest/ogr/ogr_aivector.py @@ -0,0 +1,33 @@ +#!/usr/bin/env pytest 
+###############################################################################
+# $Id$
+#
+# Project:  GDAL/OGR Test Suite
+# Purpose:  Test read functionality for OGR AIVector driver.
+# Author:   Even Rouault
+#
+###############################################################################
+# Copyright (c) 2024, Even Rouault
+#
+# SPDX-License-Identifier: MIT
+###############################################################################
+
+import gdaltest
+import pytest
+
+pytestmark = pytest.mark.require_driver("AIVector")
+
+
+def test_ogr_aivector_test_ogrsf():
+
+    import test_cli_utilities
+
+    if test_cli_utilities.get_test_ogrsf_path() is None:
+        pytest.skip()
+
+    ret = gdaltest.runexternal(
+        test_cli_utilities.get_test_ogrsf_path() + " -ro AIVector:foo.bin"
+    )
+
+    assert "INFO" in ret
+    assert "ERROR" not in ret
diff --git a/autotest/ogr/ogr_gpkg.py b/autotest/ogr/ogr_gpkg.py
index 78e882d94892..a14fc22c98f6 100755
--- a/autotest/ogr/ogr_gpkg.py
+++ b/autotest/ogr/ogr_gpkg.py
@@ -10766,3 +10766,45 @@ def test_gpkg_secure_delete(tmp_vsimem):
         with ds.ExecuteSQL("PRAGMA secure_delete") as sql_lyr:
             f = sql_lyr.GetNextFeature()
             assert f.GetField(0) == 0
+
+
+###############################################################################
+# Verify that we can generate an output that is byte-identical to the expected golden file.
+
+
+@pytest.mark.parametrize(
+    "src_filename",
+    [
+        # Generated with: ogr2ogr autotest/ogr/data/gpkg/poly_golden.gpkg autotest/ogr/data/poly.shp --config OGR_CURRENT_DATE="2000-01-01T:00:00:00.000Z" -nomd
+        "data/gpkg/poly_golden.gpkg",
+    ],
+)
+def test_ogr_gpkg_write_check_golden_file(tmp_path, src_filename):
+
+    out_filename = str(tmp_path / "test.gpkg")
+    with gdal.config_option("OGR_CURRENT_DATE", "2000-01-01T:00:00:00.000Z"):
+        gdal.VectorTranslate(out_filename, src_filename)
+
+    # First compare sqlite3 dumps, if the sqlite3 binary is available
+    import subprocess
+
+    try:
+        golden_dump = subprocess.check_output(
+            ["sqlite3", src_filename, ".dump"]
+        ).decode("utf-8")
+        got_dump = subprocess.check_output(["sqlite3", out_filename, ".dump"]).decode(
+            "utf-8"
+        )
+        assert got_dump == golden_dump
+        # print("Identical sqlite3 dump")
+    except Exception:
+        pass
+
+    if get_sqlite_version() >= (3, 46, 0):
+        assert os.stat(src_filename).st_size == os.stat(out_filename).st_size
+        golden_data = bytearray(open(src_filename, "rb").read())
+        got_data = bytearray(open(out_filename, "rb").read())
+        # Zero out the SQLite version number at bytes 96-99. Cf https://www.sqlite.org/fileformat.html
+        golden_data[96] = golden_data[97] = golden_data[98] = golden_data[99] = 0
+        got_data[96] = got_data[97] = got_data[98] = got_data[99] = 0
+        assert got_data == golden_data
diff --git a/autotest/ogr/ogr_mem.py b/autotest/ogr/ogr_mem.py
index 3d26ac31a00b..e763eb031d7c 100755
--- a/autotest/ogr/ogr_mem.py
+++ b/autotest/ogr/ogr_mem.py
@@ -2872,9 +2872,12 @@ def test_ogr_mem_arrow_json():
     lyr.CreateField(field_def)
 
     stream = lyr.GetArrowStreamAsPyArrow()
-    md = stream.schema["field_json"].metadata
-    assert b"ARROW:extension:name" in md
-    assert md[b"ARROW:extension:name"] == b"arrow.json"
+    field_schema = stream.schema["field_json"]
+    # Since pyarrow 18, the field type is reported as "extension"
+    if str(field_schema.type) != "extension":
+        md = field_schema.metadata
+        assert b"ARROW:extension:name" in md
+        assert md[b"ARROW:extension:name"] == b"arrow.json"
 
 
 ###############################################################################
diff --git a/autotest/ogr/ogr_openfilegdb_write.py b/autotest/ogr/ogr_openfilegdb_write.py
index ad2ccf036bb6..d2b0486267a4 100755
--- a/autotest/ogr/ogr_openfilegdb_write.py
+++ b/autotest/ogr/ogr_openfilegdb_write.py
@@ -13,6 +13,7 @@
 # SPDX-License-Identifier: MIT
 ###############################################################################
 
+import os
 import struct
 import sys
 
@@ -4571,3 +4572,32 @@ def test_ogr_openfilegdb_write_OGRUnsetMarker(tmp_vsimem):
     lyr = ds.GetLayer(0)
     f = lyr.GetNextFeature()
     assert f["i32"] == -21121
+
+
+###############################################################################
+# Verify that we can generate an output that is byte-identical to the expected golden file.
+
+
+@pytest.mark.parametrize(
+    "src_directory",
+    [
+        # Generated with:
+        # ogr2ogr autotest/ogr/data/openfilegdb/polygon_golden.gdb '{"type":"Feature","properties":{"foo":"bar"},"geometry":{"type":"Polygon","coordinates":[[[0,0],[0,1],[1,0],[0,0]]]}}' --config OPENFILEGDB_CREATOR GDAL --config OPENFILEGDB_REPRODUCIBLE_UUID YES -f openfilegdb
+        "data/openfilegdb/polygon_golden.gdb",
+    ],
+)
+def test_ogr_openfilegdb_write_check_golden_file(tmp_path, src_directory):
+
+    out_directory = str(tmp_path / "test.gdb")
+    with gdaltest.config_options(
+        {"OPENFILEGDB_CREATOR": "GDAL", "OPENFILEGDB_REPRODUCIBLE_UUID": "YES"}
+    ):
+        gdal.VectorTranslate(out_directory, src_directory, format="OpenFileGDB")
+    for filename in os.listdir(src_directory):
+        src_filename = os.path.join(src_directory, filename)
+        out_filename = os.path.join(out_directory, filename)
+
+        assert os.stat(src_filename).st_size == os.stat(out_filename).st_size, filename
+        assert (
+            open(src_filename, "rb").read() == open(out_filename, "rb").read()
+        ), filename
diff --git a/autotest/ogr/ogr_shape.py b/autotest/ogr/ogr_shape.py
index 889d0c688abd..d777a11e41bc 100755
--- a/autotest/ogr/ogr_shape.py
+++ b/autotest/ogr/ogr_shape.py
@@ -6134,3 +6134,34 @@ def test_ogr_shape_read_date_empty_string():
     lyr = ds.GetLayer(0)
     f = lyr.GetNextFeature()
     assert f["date"] is None
+
+
+###############################################################################
+# Verify that we can generate an output that is byte-identical to the expected golden file.
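# Note that byte-identity sometimes only holds after masking fields that
# legitimately vary: the GeoPackage test above zeroes the SQLite library
# version stamp stored at offsets 96-99 of the database header before
# comparing. A minimal sketch of that masking step (hypothetical helper,
# not part of the test suite):
#
#   def mask_sqlite_version(raw: bytes) -> bytes:
#       data = bytearray(raw)
#       data[96:100] = b"\x00\x00\x00\x00"  # sqlite version number field
#       return bytes(data)
#
# For multi-file formats such as shapefiles, determinism instead comes from
# pinning metadata like DBF_DATE_LAST_UPDATE, as done below.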
+ + +@pytest.mark.parametrize( + "src_directory", + [ + # Generated with: + # ogr2ogr autotest/ogr/data/shp/poly_golden autotest/ogr/data/poly.shp -lco DBF_DATE_LAST_UPDATE=2000-01-01 + "data/shp/poly_golden", + ], +) +def test_ogr_shape_write_check_golden_file(tmp_path, src_directory): + + out_directory = str(tmp_path / "test") + gdal.VectorTranslate( + out_directory, + src_directory, + format="ESRI Shapefile", + layerCreationOptions=["DBF_DATE_LAST_UPDATE=2000-01-01"], + ) + for filename in os.listdir(src_directory): + src_filename = os.path.join(src_directory, filename) + out_filename = os.path.join(out_directory, filename) + + assert os.stat(src_filename).st_size == os.stat(out_filename).st_size, filename + assert ( + open(src_filename, "rb").read() == open(out_filename, "rb").read() + ), filename diff --git a/autotest/postinstall/test_gdal-config.sh b/autotest/postinstall/test_gdal-config.sh index 19a656ee04ae..e77088e12e7b 100755 --- a/autotest/postinstall/test_gdal-config.sh +++ b/autotest/postinstall/test_gdal-config.sh @@ -102,7 +102,7 @@ set -eu CXX="${CXX:-c++}" echo "Test that we can compile all headers with C++11 using ${CXX}" for i in $prefix/include/*.h; do - ${CXX} -std=c++11 -c $(${GDAL_CONFIG} --cflags) $i; + ${CXX} -Wall -Wpedantic -std=c++11 -c $(${GDAL_CONFIG} --cflags) $i; done echo "$ERRORS tests failed out of $NTESTS" diff --git a/autotest/pyscripts/test_pct.py b/autotest/pyscripts/test_pct.py index 88212f866c43..c549685ea5ff 100755 --- a/autotest/pyscripts/test_pct.py +++ b/autotest/pyscripts/test_pct.py @@ -40,6 +40,9 @@ def script_path(): def test_rgb2pct_help(script_path): + if gdaltest.is_travis_branch("sanitize"): + pytest.skip("fails on sanitize for unknown reason") + assert "ERROR" not in test_py_scripts.run_py_script( script_path, "rgb2pct", "--help" ) @@ -51,6 +54,9 @@ def test_rgb2pct_help(script_path): def test_rgb2pct_version(script_path): + if gdaltest.is_travis_branch("sanitize"): + pytest.skip("fails on sanitize for unknown reason") + assert "ERROR" not in test_py_scripts.run_py_script( script_path, "rgb2pct", "--version" ) diff --git a/ci/travis/osx/install.sh b/ci/travis/osx/install.sh index 00e392ae0633..95f435e7e052 100755 --- a/ci/travis/osx/install.sh +++ b/ci/travis/osx/install.sh @@ -27,6 +27,9 @@ CFLAGS="-Wextra -Werror" CXXFLAGS="-Wextra -Werror" cmake .. \ -DBUILD_CSHARP_BINDINGS=OFF \ -DCMAKE_UNITY_BUILD=ON +echo "Check that GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS:BOOL=ON" +(grep "GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS:BOOL=ON" CMakeCache.txt > /dev/null && echo "yes") || (echo "Missing" && /bin/false) + NPROC=$(sysctl -n hw.ncpu) echo "NPROC=${NPROC}" make -j${NPROC} diff --git a/ci/travis/osx/script.sh b/ci/travis/osx/script.sh index a28b9eeac6be..7b212dfc9deb 100755 --- a/ci/travis/osx/script.sh +++ b/ci/travis/osx/script.sh @@ -7,6 +7,9 @@ export PROJ_NETWORK=ON echo 'Running CPP unit tests' (cd build && make quicktest) +echo 'Running CPP perftests' +(cd build && ctest -V -R perf) + echo 'Running Python unit tests' # install test dependencies sudo -H pip3 install -r autotest/requirements.txt diff --git a/cmake/modules/thirdparty/FindDotnet.cmake b/cmake/modules/thirdparty/FindDotnet.cmake index 945688b6eb47..499634228d63 100644 --- a/cmake/modules/thirdparty/FindDotnet.cmake +++ b/cmake/modules/thirdparty/FindDotnet.cmake @@ -5,24 +5,24 @@ # # FindDotnet # ---------- -# +# # Find DotNet executable, and initialize functions for adding dotnet projects. 
-# +# # Results are reported in the following variables:: -# +# # DOTNET_FOUND - True if dotnet executable is found # DOTNET_EXE - Dotnet executable # DOTNET_VERSION - Dotnet version as reported by dotnet executable # DOTNET_SDKS - Dotnet SDKs loaded as reported by dotnet executable # NUGET_EXE - Nuget executable (WIN32 only) # NUGET_CACHE_PATH - Nuget package cache path -# +# # The following functions are defined to add dotnet/msbuild projects: -# +# # ADD_DOTNET -- add a project to be built by dotnet. -# +# # ``` -# ADD_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] +# ADD_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] # [CONFIG configuration] # [PLATFORM platform] # [PACKAGE nuget_package_dependencies... ] @@ -34,12 +34,12 @@ # [ARGUMENTS additional_build_args...] # [PACK_ARGUMENTS additional_pack_args...]) # ``` -# -# RUN_DOTNET -- Run a project with `dotnet run`. The `OUTPUT` argument represents artifacts +# +# RUN_DOTNET -- Run a project with `dotnet run`. The `OUTPUT` argument represents artifacts # produced by running the .NET program, and can be consumed from other build steps. -# +# # ``` -# RUN_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] +# RUN_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] # [ARGUMENTS program_args...] # [OUTPUT outputs...] # [CONFIG configuration] @@ -49,11 +49,11 @@ # [CUSTOM_BUILDPROPS value....] # [SOURCES additional_file_dependencies... ]) # ``` -# +# # ADD_MSBUILD -- add a project to be built by msbuild. Windows-only. When building in Unix systems, msbuild targets are skipped. -# +# # ``` -# ADD_MSBUILD( [RELEASE|DEBUG] [X86|X64|ANYCPU] +# ADD_MSBUILD( [RELEASE|DEBUG] [X86|X64|ANYCPU] # [CONFIG configuration] # [PLATFORM platform] # [PACKAGE output_nuget_packages... ] @@ -68,7 +68,7 @@ # and if the program fails to build or run, the build fails. Currently only .NET Core App framework is supported. # Multiple smoke tests will be run one-by-one to avoid global resource conflicts. # -# SMOKETEST_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] +# SMOKETEST_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] # [ARGUMENTS program_args...] # [CONFIG configuration] # [PLATFORM platform] @@ -76,12 +76,12 @@ # [OUTPUT_PATH output_path relative to cmake binary output dir] # [CUSTOM_BUILDPROPS value....] # [SOURCES additional_file_dependencies... ]) -# +# # For all the above functions, `RELEASE|DEBUG` overrides `CONFIG`, `X86|X64|ANYCPU` overrides PLATFORM. # # # DOTNET_REGISTER_LOCAL_REPOSITORY -- register a local NuGet package repository. -# +# # ``` # DOTNET_REGISTER_LOCAL_REPOSITORY(repo_name repo_path) # ``` @@ -97,7 +97,7 @@ # [ARGUMENTS additional_dotnet_test_args...] # [OUTPUT_PATH output_path relative to cmake binary output dir]) # ``` -# +# # GEN_DOTNET_PROPS -- Generates a Directory.Build.props file. The created file is populated with MSBuild properties: # - DOTNET_PACKAGE_VERSION: a version string that can be referenced in the actual project file as $(DOTNET_PACKAGE_VERSION). # The version string value can be set with PACKAGE_VERSION argument, and defaults to '1.0.0'. 
@@ -111,10 +111,7 @@ # [PACKAGE_VERSION version] # [XML_INJECT xml_injection]) # ``` -# -# Require 3.5 for batch copy multiple files - -cmake_minimum_required(VERSION 3.5.0) +# IF(DOTNET_FOUND) RETURN() @@ -184,11 +181,11 @@ ENDFUNCTION() FUNCTION(DOTNET_GET_DEPS _DN_PROJECT arguments) CMAKE_PARSE_ARGUMENTS( # prefix - _DN + _DN # options (flags) - "RELEASE;DEBUG;X86;X64;ANYCPU;NETCOREAPP" + "RELEASE;DEBUG;X86;X64;ANYCPU;NETCOREAPP" # oneValueArgs - "NAME;CONFIG;PLATFORM;VERSION;OUTPUT_PATH" + "NAME;CONFIG;PLATFORM;VERSION;OUTPUT_PATH" # multiValueArgs "PACKAGE;DEPENDS;ARGUMENTS;PACK_ARGUMENTS;OUTPUT;SOURCES;CUSTOM_BUILDPROPS,BUILD_OPTIONS" # the input arguments @@ -199,7 +196,7 @@ FUNCTION(DOTNET_GET_DEPS _DN_PROJECT arguments) GET_FILENAME_COMPONENT(_DN_projname "${_DN_PROJECT}" NAME) STRING(REGEX REPLACE "\\.[^.]*$" "" _DN_projname_noext ${_DN_projname}) - FILE(GLOB_RECURSE DOTNET_deps + FILE(GLOB_RECURSE DOTNET_deps ${_DN_proj_dir}/*.cs ${_DN_proj_dir}/*.fs ${_DN_proj_dir}/*.vb @@ -328,14 +325,14 @@ ENDMACRO() MACRO(DOTNET_BUILD_COMMANDS) IF(${DOTNET_IS_MSBUILD}) - SET(build_dotnet_cmds + SET(build_dotnet_cmds COMMAND ${CMAKE_COMMAND} -E echo "======= Building msbuild project ${DOTNET_PROJNAME} [${DOTNET_CONFIG} ${DOTNET_PLATFORM}]" COMMAND ${NUGET_EXE} restore -Force ${DOTNET_PROJPATH} COMMAND ${DOTNET_EXE} msbuild ${DOTNET_PROJPATH} /t:Clean ${DOTNET_BUILD_PROPERTIES} /p:Configuration="${DOTNET_CONFIG}" COMMAND ${DOTNET_EXE} msbuild ${DOTNET_PROJPATH} /t:Build ${DOTNET_BUILD_PROPERTIES} /p:Configuration="${DOTNET_CONFIG}" ${DOTNET_ARGUMENTS}) SET(build_dotnet_type "msbuild") ELSE() - SET(build_dotnet_cmds + SET(build_dotnet_cmds COMMAND ${CMAKE_COMMAND} -E echo "======= Building .NET project ${DOTNET_PROJNAME} [${DOTNET_CONFIG} ${DOTNET_PLATFORM}]") foreach (_src ${DOTNET_SOURCES} ) LIST(APPEND build_dotnet_cmds COMMAND ${DOTNET_EXE} add ${DOTNET_PROJPATH} reference ${_src}) @@ -362,10 +359,10 @@ MACRO(DOTNET_BUILD_COMMANDS) MESSAGE("-- Adding ${build_dotnet_type} project ${DOTNET_PROJPATH} (no nupkg)") ENDIF() endif() - - LIST(APPEND build_dotnet_cmds COMMAND ${DOTNET_EXE} pack - --no-build --no-restore ${DOTNET_PROJPATH} - -c ${DOTNET_CONFIG} ${DOTNET_BUILD_PROPERTIES} ${DOTNET_PACK_OPTIONS} + + LIST(APPEND build_dotnet_cmds COMMAND ${DOTNET_EXE} pack + --no-build --no-restore ${DOTNET_PROJPATH} + -c ${DOTNET_CONFIG} ${DOTNET_BUILD_PROPERTIES} ${DOTNET_PACK_OPTIONS} --output ${CMAKE_CURRENT_BINARY_DIR} -p:PackageVersion=${DOTNET_PACKAGE_VERSION} ) LIST(APPEND DOTNET_OUTPUTS ${CMAKE_CURRENT_BINARY_DIR}/${DOTNET_PROJNAME}.buildtimestamp) LIST(APPEND build_dotnet_cmds COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/${DOTNET_PROJNAME}.buildtimestamp) @@ -416,7 +413,7 @@ FUNCTION(RUN_DOTNET DOTNET_PROJECT) COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/${DOTNET_PROJNAME}.runtimestamp WORKING_DIRECTORY ${DOTNET_OUTPUT_PATH}) ADD_CUSTOM_TARGET( - ${DOTNET_PROJNAME} + ${DOTNET_PROJNAME} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${DOTNET_PROJNAME}.runtimestamp ${DOTNET_RUN_OUTPUT}) ADD_DOTNET_DEPENDENCY_TARGETS() ENDFUNCTION() @@ -469,9 +466,9 @@ FUNCTION(GEN_DOTNET_PROPS target_props_file) # prefix _DNP # options (flags) - "" + "" # oneValueArgs - "PACKAGE_VERSION;XML_INJECT" + "PACKAGE_VERSION;XML_INJECT" # multiValueArgs "" # the input arguments @@ -496,4 +493,4 @@ ENDFUNCTION() MESSAGE("-- Found .NET toolchain: ${DOTNET_EXE} (version ${DOTNET_VERSION})") -SET(DOTNET_FOUND TRUE) \ No newline at end of file +SET(DOTNET_FOUND TRUE) diff --git 
a/doc/source/about_no_title.rst b/doc/source/about_no_title.rst
index c60c858cce3e..819c4aca9f4a 100644
--- a/doc/source/about_no_title.rst
+++ b/doc/source/about_no_title.rst
@@ -1,4 +1,4 @@
-GDAL is a translator library for raster and vector geospatial data formats that is released under an MIT style Open Source :ref:`license` by the `Open Source Geospatial Foundation`_. As a library, it presents a single raster abstract data model and single vector abstract data model to the calling application for all supported formats. It also comes with a variety of useful command line utilities for data translation and processing. The `NEWS`_ page describes the October 2024 GDAL/OGR 3.9.3 release.
+GDAL is a translator library for raster and vector geospatial data formats that is released under an MIT style Open Source :ref:`license` by the `Open Source Geospatial Foundation`_. As a library, it presents a single raster abstract data model and single vector abstract data model to the calling application for all supported formats. It also comes with a variety of useful command line utilities for data translation and processing. The `NEWS`_ page describes the November 2024 GDAL/OGR 3.10.0 release.
 
 .. note::
@@ -17,7 +17,7 @@ GDAL is a translator library for raster and vector geospatial data formats that
     :target: `Open Source Geospatial Foundation`_
 
 .. _`Open Source Geospatial Foundation`: http://www.osgeo.org/
-.. _`NEWS`: https://github.com/OSGeo/gdal/blob/v3.9.3/NEWS.md
+.. _`NEWS`: https://github.com/OSGeo/gdal/blob/v3.10.0/NEWS.md
 
 See :ref:`software_using_gdal`
diff --git a/doc/source/api/python_samples.rst b/doc/source/api/python_samples.rst
index a2831f63ffaa..294a2bce48c4 100644
--- a/doc/source/api/python_samples.rst
+++ b/doc/source/api/python_samples.rst
@@ -47,6 +47,7 @@ Python Raster Sample scripts
    - hsv_merge: Merge greyscale image into RGB image as intensity in HSV space.
    - gdal_ls: Display the list of files in a virtual directory, like /vsicurl or /vsizip
    - gdal_cp: Copy a virtual file
+   - gdal_minmax_location: Report the locations where the minimum/maximum values of a raster are reached.
 
 Python Vector Sample scripts
 ------------------------------
diff --git a/doc/source/community/code_of_conduct.rst b/doc/source/community/code_of_conduct.rst
index 9dc38d42ac5b..b151edb5daf9 100644
--- a/doc/source/community/code_of_conduct.rst
+++ b/doc/source/community/code_of_conduct.rst
@@ -19,7 +19,7 @@ claims any affiliation with the GDAL project.
 
 It applies to in-person events (such as conferences and related social
 events), IRC, public and private mailing lists, the issue tracker, the wiki, blogs,
-Twitter, and any other forums which the community uses for communication and
+social media, and any other forums which the community uses for communication and
 interactions.
 
 This code is not exhaustive or complete. It serves to distill our common
diff --git a/doc/source/download.rst b/doc/source/download.rst
index ca336557fdc3..847f3a8e3b1d 100644
--- a/doc/source/download.rst
+++ b/doc/source/download.rst
@@ -18,11 +18,11 @@ Source Code
 Current Release
 ...............
 
-* **2024-10-14** `gdal-3.9.3.tar.gz`_ `3.9.3 Release Notes`_ (`3.9.3 md5`_)
+* **2024-11-01** `gdal-3.10.0.tar.gz`_ `3.10.0 Release Notes`_ (`3.10.0 md5`_)
 
-.. _`3.9.3 Release Notes`: https://github.com/OSGeo/gdal/blob/v3.9.3/NEWS.md
-.. _`gdal-3.9.3.tar.gz`: https://github.com/OSGeo/gdal/releases/download/v3.9.3/gdal-3.9.3.tar.gz
-.. _`3.9.3 md5`: https://github.com/OSGeo/gdal/releases/download/v3.9.3/gdal-3.9.3.tar.gz.md5
+.. _`3.10.0 Release Notes`: https://github.com/OSGeo/gdal/blob/v3.10.0/NEWS.md
+.. _`gdal-3.10.0.tar.gz`: https://github.com/OSGeo/gdal/releases/download/v3.10.0/gdal-3.10.0.tar.gz
+.. _`3.10.0 md5`: https://github.com/OSGeo/gdal/releases/download/v3.10.0/gdal-3.10.0.tar.gz.md5
 
 Past Releases
 .............
diff --git a/doc/source/download_past.rst b/doc/source/download_past.rst
index f732485793ef..afff6413df04 100644
--- a/doc/source/download_past.rst
+++ b/doc/source/download_past.rst
@@ -5,6 +5,12 @@
 Past Releases
 =============
 
+* **2024-10-14** `gdal-3.9.3.tar.gz`_ `3.9.3 Release Notes`_ (`3.9.3 md5`_)
+
+.. _`3.9.3 Release Notes`: https://github.com/OSGeo/gdal/blob/v3.9.3/NEWS.md
+.. _`gdal-3.9.3.tar.gz`: https://github.com/OSGeo/gdal/releases/download/v3.9.3/gdal-3.9.3.tar.gz
+.. _`3.9.3 md5`: https://github.com/OSGeo/gdal/releases/download/v3.9.3/gdal-3.9.3.tar.gz.md5
+
 * **2024-08-16** `gdal-3.9.2.tar.gz`_ `3.9.2 Release Notes`_ (`3.9.2 md5`_)
 
 .. _`3.9.2 Release Notes`: https://github.com/OSGeo/gdal/blob/v3.9.2/NEWS.md
diff --git a/doc/source/drivers/vector/aivector.rst b/doc/source/drivers/vector/aivector.rst
new file mode 100644
index 000000000000..41c230f45dd2
--- /dev/null
+++ b/doc/source/drivers/vector/aivector.rst
@@ -0,0 +1,32 @@
+.. _vector.aivector:
+
+Artificial intelligence powered vector driver
+=============================================
+
+.. versionadded:: 3.11
+
+.. shortname:: AIVector
+
+.. built_in_by_default::
+
+This driver builds on many years of self-funded investment by the GDAL team
+in AI technologies to bring you the ultimate driver that can read any vector
+format. After that one, no need for any new vector driver!
+
+The open syntax is ``AIVector:{filename}``. Alternatively, specify the filename
+directly and force the use of the AIVector driver with the ``-if`` flag of
+ogrinfo or ogr2ogr. There are no options at all. Just enjoy the true power of AI.
+
+.. note:: We are open to external investors to develop the write side of the driver.
+
+Examples
+--------
+
+::
+
+    ogrinfo -if AIVector undocumented_proprietary_format.bin -al
+
+.. note::
+
+    The above works even if you make a typo in the filename. The driver will
+    automatically figure out the filename you meant.
diff --git a/doc/source/drivers/vector/index.rst b/doc/source/drivers/vector/index.rst
index 859bba292217..411189072692 100644
--- a/doc/source/drivers/vector/index.rst
+++ b/doc/source/drivers/vector/index.rst
@@ -23,6 +23,7 @@ Vector drivers
    :hidden:
 
    adbc
+   aivector
    amigocloud
    arrow
    avcbin
diff --git a/doc/source/software_using_gdal.rst b/doc/source/software_using_gdal.rst
index 7886d4df8b77..e5ae5e161eab 100644
--- a/doc/source/software_using_gdal.rst
+++ b/doc/source/software_using_gdal.rst
@@ -44,6 +44,7 @@ Free and open source
 - `NextGIS Web `_ Server-side Web GIS and a framework for storage, visualization and permissions management of all kinds
 - `Ogr2 GUI `_ Graphical user interface for ogr2ogr
 - `OpenCPN `_ A concise ChartPlotter/Navigator. A cross-platform ship-borne GUI application.
+- `OpenDataCube `_ FOSS software package that simplifies the management and analysis of large amounts of satellite imagery and other Earth observation data.
 - `OpenEV `_ An OpenGL/GTK/Python based graphical viewer which exclusively uses GDAL for raster access.
 - `OFGT `_ a collection of utilities for multipurpose forest monitoring under the `Open Foris Initiative `_ Open Foris Initiative.
- `OpenFLUID `_ a software platform for spatial modelling of landscape dynamics
diff --git a/docker/README.md b/docker/README.md
index 390cea55b7f7..74d3a1d865a3 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -47,7 +47,7 @@ See [alpine-normal/Dockerfile](alpine-normal/Dockerfile)
 # Ubuntu based
 
 Ubuntu version:
-* 24.04 for GDAL 3.9
+* 24.04 for GDAL 3.9 and 3.10
 * 22.04 for GDAL 3.6, 3.7 and 3.8
 * 20.04 for GDAL 3.4 and 3.5
 
@@ -100,11 +100,11 @@ If you are getting a ``<jemalloc>: arena 0 background thread creation failed (1)
 # Images of releases
 
-Tagged images of recent past releases are available. The last ones (at time of writing) are for GDAL 3.9.3 and PROJ 9.5.0, for linux/amd64 and linux/arm64:
-* ghcr.io/osgeo/gdal:alpine-small-3.9.3
-* ghcr.io/osgeo/gdal:alpine-normal-3.9.3
-* ghcr.io/osgeo/gdal:ubuntu-small-3.9.3
-* ghcr.io/osgeo/gdal:ubuntu-full-3.9.3
+Tagged images of recent past releases are available. The most recent ones (at the time of writing) are for GDAL 3.10.0 and PROJ 9.5.0, for linux/amd64 and linux/arm64:
+* ghcr.io/osgeo/gdal:alpine-small-3.10.0
+* ghcr.io/osgeo/gdal:alpine-normal-3.10.0
+* ghcr.io/osgeo/gdal:ubuntu-small-3.10.0
+* ghcr.io/osgeo/gdal:ubuntu-full-3.10.0
 
 ## Multi-arch Images
diff --git a/docker/ubuntu-full/bh-gdal.sh b/docker/ubuntu-full/bh-gdal.sh
index 3c66862a6c73..b4a7659463c6 100755
--- a/docker/ubuntu-full/bh-gdal.sh
+++ b/docker/ubuntu-full/bh-gdal.sh
@@ -46,7 +46,9 @@ wget -q "https://github.com/${GDAL_REPOSITORY}/archive/${GDAL_VERSION}.tar.gz" \
     cd build
     # GDAL_USE_TIFF_INTERNAL=ON to use JXL
     export GDAL_CMAKE_EXTRA_OPTS=""
-    if test "${GCC_ARCH}" != "x86_64"; then
+    if test "${GCC_ARCH}" = "x86_64"; then
+        export GDAL_CMAKE_EXTRA_OPTS="${GDAL_CMAKE_EXTRA_OPTS} -DENABLE_IPO=ON"
+    else
         export GDAL_CMAKE_EXTRA_OPTS="${GDAL_CMAKE_EXTRA_OPTS} -DPDFIUM_INCLUDE_DIR="
     fi
     export JAVA_ARCH=""
diff --git a/docker/ubuntu-small/Dockerfile b/docker/ubuntu-small/Dockerfile
index 524f52b35488..65cf4fd17b3a 100644
--- a/docker/ubuntu-small/Dockerfile
+++ b/docker/ubuntu-small/Dockerfile
@@ -153,6 +153,11 @@ RUN --mount=type=cache,id=ubuntu-small-gdal,target=$HOME/.cache \
     && if test "x${GDAL_BUILD_IS_RELEASE:-}" = "x"; then \
         export GDAL_SHA1SUM=${GDAL_VERSION}; \
     fi \
+    && if test "${GCC_ARCH}" = "x86_64"; then \
+        export GDAL_CMAKE_EXTRA_OPTS="-DENABLE_IPO=ON"; \
+    else \
+        export GDAL_CMAKE_EXTRA_OPTS=""; \
+    fi \
     && mkdir gdal \
     && wget -q https://github.com/${GDAL_REPOSITORY}/archive/${GDAL_VERSION}.tar.gz -O - \
         | tar xz -C gdal --strip-components=1 \
@@ -183,7 +188,7 @@ RUN --mount=type=cache,id=ubuntu-small-gdal,target=$HOME/.cache \
     -DPROJ_INCLUDE_DIR="/build${PROJ_INSTALL_PREFIX-/usr/local}/include" \
     -DPROJ_LIBRARY="/build${PROJ_INSTALL_PREFIX-/usr/local}/lib/libinternalproj.so" \
     -DGDAL_USE_TIFF_INTERNAL=ON \
-    -DGDAL_USE_GEOTIFF_INTERNAL=ON \
+    -DGDAL_USE_GEOTIFF_INTERNAL=ON ${GDAL_CMAKE_EXTRA_OPTS} \
     -DBUILD_TESTING=OFF \
     && ninja \
     && DESTDIR="/build" ninja install \
diff --git a/frmts/drivers.ini b/frmts/drivers.ini
index 65b3a6666dbd..1e59d34eea35 100644
--- a/frmts/drivers.ini
+++ b/frmts/drivers.ini
@@ -273,6 +273,9 @@ Tiger
 AVCBin
 AVCE00
 
+# Last but not least
+AIVector
+
 # End of OGR drivers
 
 # Put here drivers that absolutely need to look for side car
diff --git a/frmts/gtiff/tifvsi.cpp b/frmts/gtiff/tifvsi.cpp
index afc77e171e2e..a889f3cad534 100644
--- a/frmts/gtiff/tifvsi.cpp
+++ b/frmts/gtiff/tifvsi.cpp
@@ -439,7 +439,10 @@ static void VSI_TIFFSetOpenOptions(TIFFOpenOptions *opts)
 {
     const auto nUsableRAM = CPLGetUsablePhysicalRAM();
     if (nUsableRAM > 0)
+    {
+        // coverity[return_overflow]
         return nUsableRAM / 10 * 9;
+    }
     else
         return 0;
 }
diff --git a/frmts/rcm/rcmdataset.cpp b/frmts/rcm/rcmdataset.cpp
index 087521b98345..46248c14c7c1 100644
--- a/frmts/rcm/rcmdataset.cpp
+++ b/frmts/rcm/rcmdataset.cpp
@@ -79,7 +79,10 @@ static double *InterpolateValues(CSLConstList papszList, int tableSize,
                                  int pixelFirstLutValue)
 {
     /* Allocate the right LUT size according to the product range pixel */
-    double *table = static_cast<double *>(CPLCalloc(sizeof(double), tableSize));
+    double *table =
+        static_cast<double *>(VSI_CALLOC_VERBOSE(sizeof(double), tableSize));
+    if (!table)
+        return nullptr;
 
     if (stepSize <= 0)
     {
@@ -301,38 +304,16 @@ RCMRasterBand::~RCMRasterBand()
 
 CPLErr RCMRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
 {
-    int nRequestYSize;
-    int nRequestXSize;
-
-    /* -------------------------------------------------------------------- */
-    /*      If the last strip is partial, we need to avoid                  */
-    /*      over-requesting.  We also need to initialize the extra part     */
-    /*      of the block to zero.                                           */
-    /* -------------------------------------------------------------------- */
-    if ((nBlockYOff + 1) * nBlockYSize > nRasterYSize)
-    {
-        nRequestYSize = nRasterYSize - nBlockYOff * nBlockYSize;
-        memset(pImage, 0,
-               GDALGetDataTypeSizeBytes(eDataType) * nBlockXSize * nBlockYSize);
-    }
-    else
-    {
-        nRequestYSize = nBlockYSize;
-    }
+    int nRequestXSize = 0;
+    int nRequestYSize = 0;
+    GetActualBlockSize(nBlockXOff, nBlockYOff, &nRequestXSize, &nRequestYSize);
 
-    /*-------------------------------------------------------------------- */
-    /*      If the input imagery is tiled, also need to avoid over-        */
-    /*      requesting in the X-direction.                                 */
-    /* ------------------------------------------------------------------- */
-    if ((nBlockXOff + 1) * nBlockXSize > nRasterXSize)
+    // Zero-initialize partial right-most and bottom-most blocks
+    if (nRequestXSize < nBlockXSize || nRequestYSize < nBlockYSize)
     {
-        nRequestXSize = nRasterXSize - nBlockXOff * nBlockXSize;
         memset(pImage, 0,
-               GDALGetDataTypeSizeBytes(eDataType) * nBlockXSize * nBlockYSize);
-    }
-    else
-    {
-        nRequestXSize = nBlockXSize;
+               static_cast<size_t>(GDALGetDataTypeSizeBytes(eDataType)) *
+                   nBlockXSize * nBlockYSize);
     }
 
     int dataTypeSize = GDALGetDataTypeSizeBytes(eDataType);
@@ -355,14 +336,16 @@ CPLErr RCMRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
             GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
             nRequestXSize, nRequestYSize, pImage, nRequestXSize,
             nRequestYSize, bandFileType, 2, nullptr, dataTypeSize,
-            dataTypeSize * nBlockXSize, bandFileSize, nullptr);
+            static_cast<GSpacing>(dataTypeSize) * nBlockXSize, bandFileSize,
+            nullptr);
     }
     else if (twoBandComplex && this->isNITF)
     {
         return poBand->RasterIO(
             GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
             nRequestXSize, nRequestYSize, pImage, nRequestXSize, nRequestYSize,
-            eDataType, 0, dataTypeSize * nBlockXSize, nullptr);
+            eDataType, 0, static_cast<GSpacing>(dataTypeSize) * nBlockXSize,
+            nullptr);
     }
 
     if (poRCMDataset->IsComplexData())
@@ -377,7 +360,8 @@ CPLErr RCMRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
             GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
             nRequestXSize, nRequestYSize, pImage, nRequestXSize,
             nRequestYSize, bandFileType, 2, nullptr, dataTypeSize,
-            nBlockXSize * dataTypeSize, bandFileSize, nullptr);
+            static_cast<GSpacing>(dataTypeSize) * nBlockXSize, bandFileSize,
+            nullptr);
     }
 
     // case: band file == this band
@@ -388,7 +372,8 @@ CPLErr RCMRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
         return poBand->RasterIO(
             GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
             nRequestXSize, nRequestYSize, pImage, nRequestXSize, nRequestYSize,
-            eDataType, 0, dataTypeSize * nBlockXSize, nullptr);
+            eDataType, 0, static_cast<GSpacing>(dataTypeSize) * nBlockXSize,
+            nullptr);
     }
     else
     {
@@ -457,7 +442,8 @@ void RCMCalibRasterBand::ReadLUT()
     }
 
     /* Get the Pixel Per range */
-    if (this->stepSize == INT_MIN || this->numberOfValues == INT_MIN ||
+    if (this->stepSize == 0 || this->stepSize == INT_MIN ||
+        this->numberOfValues == INT_MIN ||
         abs(this->stepSize) > INT_MAX / abs(this->numberOfValues))
     {
         CPLError(CE_Failure, CPLE_AppDefined,
@@ -476,15 +462,26 @@ void RCMCalibRasterBand::ReadLUT()
         return;
     }
 
+    // Avoid excessive memory allocation
+    if (this->m_nTableSize > 1000 * 1000)
+    {
+        CPLError(CE_Failure, CPLE_NotSupported, "Too many elements in LUT: %d",
+                 this->m_nTableSize);
+        return;
+    }
+
     /* Allocate the right LUT size according to the product range pixel */
     this->m_nfTable =
         InterpolateValues(aosLUTList.List(), this->m_nTableSize,
                           this->stepSize, this->numberOfValues,
                           this->pixelFirstLutValue);
+    if (!this->m_nfTable)
+        return;
 
-    const size_t nLen =
-        this->m_nTableSize * max_space_for_string;  // 32 max + space
-    char *lut_gains = static_cast<char *>(CPLMalloc(nLen));
-    memset(lut_gains, 0, nLen);
+    // 32 max + space
+    char *lut_gains = static_cast<char *>(
+        VSI_CALLOC_VERBOSE(this->m_nTableSize, max_space_for_string));
+    if (!lut_gains)
+        return;
 
     for (int i = 0; i < this->m_nTableSize; i++)
     {
@@ -588,29 +585,35 @@ void RCMCalibRasterBand::ReadNoiseLevels()
                     atoi(CPLGetXMLValue(psNumberOfValues, "", "0"));
                 const char *noiseLevelValues =
                     CPLGetXMLValue(psNoiseLevelValues, "", "");
-                char **papszNoiseLevelList = CSLTokenizeString2(
-                    noiseLevelValues, " ", CSLT_HONOURSTRINGS);
-                /* Get the Pixel Per range */
-                this->m_nTableNoiseLevelsSize =
-                    abs(this->stepSizeNoiseLevels) *
-                    abs(this->numberOfValuesNoiseLevels);
-
-                if ((EQUAL(calibType, "Beta Nought") &&
-                     this->m_eCalib == Beta0) ||
-                    (EQUAL(calibType, "Sigma Nought") &&
-                     this->m_eCalib == Sigma0) ||
-                    (EQUAL(calibType, "Gamma") && this->m_eCalib == Gamma))
+                if (this->stepSizeNoiseLevels > 0 &&
+                    this->numberOfValuesNoiseLevels != INT_MIN &&
+                    abs(this->numberOfValuesNoiseLevels) <
+                        INT_MAX / this->stepSizeNoiseLevels)
                 {
-                    /* Allocate the right Noise Levels size according to the
-                     * product range pixel */
-                    this->m_nfTableNoiseLevels = InterpolateValues(
-                        papszNoiseLevelList, this->m_nTableNoiseLevelsSize,
-                        this->stepSizeNoiseLevels,
-                        this->numberOfValuesNoiseLevels,
-                        this->pixelFirstLutValueNoiseLevels);
-                }
+                    char **papszNoiseLevelList = CSLTokenizeString2(
+                        noiseLevelValues, " ", CSLT_HONOURSTRINGS);
+                    /* Get the Pixel Per range */
+                    this->m_nTableNoiseLevelsSize =
+                        abs(this->stepSizeNoiseLevels) *
+                        abs(this->numberOfValuesNoiseLevels);
+
+                    if ((EQUAL(calibType, "Beta Nought") &&
+                         this->m_eCalib == Beta0) ||
+                        (EQUAL(calibType, "Sigma Nought") &&
+                         this->m_eCalib == Sigma0) ||
+                        (EQUAL(calibType, "Gamma") && this->m_eCalib == Gamma))
+                    {
+                        /* Allocate the right Noise Levels size according to the
+                         * product range pixel */
+                        this->m_nfTableNoiseLevels = InterpolateValues(
+                            papszNoiseLevelList, this->m_nTableNoiseLevelsSize,
+                            this->stepSizeNoiseLevels,
+                            this->numberOfValuesNoiseLevels,
+                            this->pixelFirstLutValueNoiseLevels);
+                    }
 
-                CSLDestroy(papszNoiseLevelList);
+                    CSLDestroy(papszNoiseLevelList);
+                }
 
                 if (this->m_nfTableNoiseLevels != nullptr)
                 {
@@ -673,53 +676,32 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
 {
     CPLErr eErr;
-    int nRequestYSize;
-    int nRequestXSize;
+    int nRequestXSize = 0;
+    int nRequestYSize = 0;
+    GetActualBlockSize(nBlockXOff, nBlockYOff, &nRequestXSize, &nRequestYSize);
 
-    /* -------------------------------------------------------------------- */
-    /*      If the last strip is partial, we need to avoid                  */
-    /*      over-requesting.  We also need to initialize the extra part     */
-    /*      of the block to zero.                                           */
-    /* -------------------------------------------------------------------- */
-    if ((nBlockYOff + 1) * nBlockYSize > nRasterYSize)
+    // Zero-initialize partial right-most and bottom-most blocks
+    if (nRequestXSize < nBlockXSize || nRequestYSize < nBlockYSize)
     {
-        nRequestYSize = nRasterYSize - nBlockYOff * nBlockYSize;
         memset(pImage, 0,
-               GDALGetDataTypeSizeBytes(eDataType) * nBlockXSize * nBlockYSize);
-    }
-    else
-    {
-        nRequestYSize = nBlockYSize;
-    }
-
-    /*-------------------------------------------------------------------- */
-    /*      If the input imagery is tiled, also need to avoid over-        */
-    /*      requesting in the X-direction.                                 */
-    /* ------------------------------------------------------------------- */
-    if ((nBlockXOff + 1) * nBlockXSize > nRasterXSize)
-    {
-        nRequestXSize = nRasterXSize - nBlockXOff * nBlockXSize;
-        memset(pImage, 0,
-               GDALGetDataTypeSizeBytes(eDataType) * nBlockXSize * nBlockYSize);
-    }
-    else
-    {
-        nRequestXSize = nBlockXSize;
+               static_cast<size_t>(GDALGetDataTypeSizeBytes(eDataType)) *
+                   nBlockXSize * nBlockYSize);
     }
 
     if (this->m_eOriginalType == GDT_CInt16)
     {
-        GInt16 *pnImageTmp;
         /* read in complex values */
-        pnImageTmp = static_cast<GInt16 *>(
-            CPLMalloc(nBlockXSize * nBlockYSize *
-                      GDALGetDataTypeSizeBytes(m_eOriginalType)));
+        GInt16 *panImageTmp = static_cast<GInt16 *>(
+            VSI_MALLOC3_VERBOSE(nBlockXSize, nBlockYSize,
+                                GDALGetDataTypeSizeBytes(m_eOriginalType)));
+        if (!panImageTmp)
+            return CE_Failure;
         if (m_poBandDataset->GetRasterCount() == 2)
         {
             eErr = m_poBandDataset->RasterIO(
                 GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
-                nRequestXSize, nRequestYSize, pnImageTmp, nRequestXSize,
+                nRequestXSize, nRequestYSize, panImageTmp, nRequestXSize,
                 nRequestYSize, this->m_eOriginalType, 2, nullptr, 4,
                 nBlockXSize * 4, 4, nullptr);
@@ -728,7 +710,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
                                 nBlockXOff * nBlockXSize,
                                 nBlockYOff * nBlockYSize,
                                 nRequestXSize, nRequestYSize,
-                                pnImageTmp, nRequestXSize, nRequestYSize,
+                                panImageTmp, nRequestXSize, nRequestYSize,
                                 GDT_Int32, 2, nullptr, 4, nBlockXSize * 4, 2,
                                 nullptr);
             */
@@ -737,7 +719,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
        {
            eErr = m_poBandDataset->RasterIO(
                GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
-                nRequestXSize, nRequestYSize, pnImageTmp, nRequestXSize,
+                nRequestXSize, nRequestYSize, panImageTmp, nRequestXSize,
                nRequestYSize, this->m_eOriginalType, 1, nullptr, 4,
                nBlockXSize * 4, 0, nullptr);
@@ -747,7 +729,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
                                 nBlockXOff * nBlockXSize,
                                 nBlockYOff * nBlockYSize,
                                 nRequestXSize, nRequestYSize,
-                                pnImageTmp, nRequestXSize, nRequestYSize,
+                                panImageTmp, nRequestXSize, nRequestYSize,
                                 GDT_UInt32, 1, nullptr, 4, nBlockXSize * 4, 0,
                                 nullptr);
            */
@@ -771,8 +753,8 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
                const int nTruePixOff = (i * nBlockXSize) + j;
 
                // Formula for Complex Q+J
-                const float real = static_cast<float>(pnImageTmp[nPixOff]);
-                const float img = static_cast<float>(pnImageTmp[nPixOff + 1]);
+                const float real = static_cast<float>(panImageTmp[nPixOff]);
+                const float img = static_cast<float>(panImageTmp[nPixOff + 1]);
                const float digitalValue = (real * real) + (img * img);
                const float lutValue =
                    static_cast<float>(m_nfTable[nBlockXOff * nBlockXSize + j]);
@@ -782,7 +764,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
            }
        }
 
-        CPLFree(pnImageTmp);
+        CPLFree(panImageTmp);
    }
 
    // If the underlying file is NITF CFloat32
@@ -790,25 +772,26 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
             this->m_eOriginalType == GDT_CFloat64)
    {
        /* read in complex values */
-        float *pnImageTmp;
-
        const int dataTypeSize =
            GDALGetDataTypeSizeBytes(this->m_eOriginalType);
        const GDALDataType bandFileType = this->m_eOriginalType;
-        const int bandFileSize = GDALGetDataTypeSizeBytes(bandFileType);
+        const int bandFileDataTypeSize = GDALGetDataTypeSizeBytes(bandFileType);
 
        /* read the original image complex values in a temporary image space */
-        pnImageTmp = static_cast<float *>(
-            CPLMalloc(2 * nBlockXSize * nBlockYSize * bandFileSize));
+        float *pafImageTmp = static_cast<float *>(VSI_MALLOC3_VERBOSE(
+            nBlockXSize, nBlockYSize, 2 * bandFileDataTypeSize));
+        if (!pafImageTmp)
+            return CE_Failure;
 
        eErr =
            // I and Q from each band are pixel-interleaved into this complex
            // band
            m_poBandDataset->RasterIO(
                GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
-                nRequestXSize, nRequestYSize, pnImageTmp, nRequestXSize,
+                nRequestXSize, nRequestYSize, pafImageTmp, nRequestXSize,
                nRequestYSize, bandFileType, 2, nullptr, dataTypeSize,
-                nBlockXSize * dataTypeSize, bandFileSize, nullptr);
+                static_cast<GSpacing>(dataTypeSize) * nBlockXSize,
+                bandFileDataTypeSize, nullptr);
 
        /* calibrate the complex values */
        for (int i = 0; i < nRequestYSize; i++)
@@ -820,8 +803,8 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
                const int nTruePixOff = (i * nBlockXSize) + j;
 
                // Formula for Complex Q+J
-                const float real = static_cast<float>(pnImageTmp[nPixOff]);
-                const float img = static_cast<float>(pnImageTmp[nPixOff + 1]);
+                const float real = pafImageTmp[nPixOff];
+                const float img = pafImageTmp[nPixOff + 1];
                const float digitalValue = (real * real) + (img * img);
                const float lutValue =
                    static_cast<float>(m_nfTable[nBlockXOff * nBlockXSize + j]);
@@ -831,7 +814,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
            }
        }
 
-        CPLFree(pnImageTmp);
+        CPLFree(pafImageTmp);
    }
 
    else if (this->m_eOriginalType == GDT_Float32)
@@ -900,13 +883,14 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
    else if (this->m_eOriginalType == GDT_UInt16)
    {
-        GUInt16 *pnImageTmp;
        /* read in detected values */
-        pnImageTmp = static_cast<GUInt16 *>(CPLMalloc(
-            nBlockXSize * nBlockYSize * GDALGetDataTypeSizeBytes(GDT_UInt16)));
+        GUInt16 *panImageTmp = static_cast<GUInt16 *>(VSI_MALLOC3_VERBOSE(
+            nBlockXSize, nBlockYSize, GDALGetDataTypeSizeBytes(GDT_UInt16)));
+        if (!panImageTmp)
+            return CE_Failure;
        eErr = m_poBandDataset->RasterIO(
            GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
-            nRequestXSize, nRequestYSize, pnImageTmp, nRequestXSize,
+            nRequestXSize, nRequestYSize, panImageTmp, nRequestXSize,
            nRequestYSize, GDT_UInt16, 1, nullptr, 2, nBlockXSize * 2, 0,
            nullptr);
@@ -918,7 +902,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
                const int nPixOff = (i * nBlockXSize) + j;
 
                const float digitalValue =
-                    static_cast<float>(pnImageTmp[nPixOff]);
+                    static_cast<float>(panImageTmp[nPixOff]);
                const float A =
                    static_cast<float>(m_nfTable[nBlockXOff * nBlockXSize + j]);
                reinterpret_cast<float *>(pImage)[nPixOff] =
@@ -927,16 +911,18 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
                    A;
            }
        }
-        CPLFree(pnImageTmp);
+        CPLFree(panImageTmp);
    }
 
    /* Ticket #2104: Support for ScanSAR products */
    else if (this->m_eOriginalType == GDT_Byte)
    {
-        GByte *pnImageTmp;
-        pnImageTmp = static_cast<GByte *>(CPLMalloc(nBlockXSize * nBlockYSize));
+        GByte *pabyImageTmp =
+            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nBlockXSize, nBlockYSize));
+        if (!pabyImageTmp)
+            return CE_Failure;
        eErr = m_poBandDataset->RasterIO(
            GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
-            nRequestXSize, nRequestYSize, pnImageTmp, nRequestXSize,
+            nRequestXSize, nRequestYSize, pabyImageTmp, nRequestXSize,
            nRequestYSize, GDT_Byte, 1, nullptr, 1, nBlockXSize, 0, nullptr);
 
        /* iterate over detected values */
@@ -947,7 +933,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
                const int nPixOff = (i * nBlockXSize) + j;
 
                const float digitalValue =
-                    static_cast<float>(pnImageTmp[nPixOff]);
+                    static_cast<float>(pabyImageTmp[nPixOff]);
                const float A =
                    static_cast<float>(m_nfTable[nBlockXOff * nBlockXSize + j]);
                reinterpret_cast<float *>(pImage)[nPixOff] =
@@ -956,7 +942,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
                    A;
            }
        }
-        CPLFree(pnImageTmp);
+        CPLFree(pabyImageTmp);
    }
    else
    {
@@ -1213,14 +1199,8 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo)
        psSceneAttributes, "imageAttributes.samplesPerLine", "-1"));
    poDS->nRasterYSize = atoi(
        CPLGetXMLValue(psSceneAttributes, "imageAttributes.numLines", "-1"));
-    if (poDS->nRasterXSize <= 1 || poDS->nRasterYSize <= 1)
+    if (!GDALCheckDatasetDimensions(poDS->nRasterXSize, poDS->nRasterYSize))
    {
-        CPLError(
-            CE_Failure, CPLE_OpenFailed,
-            "ERROR: Non-sane raster dimensions provided in product.xml. If "
-            "this is "
-            "a valid RCM scene, please contact your data provider for "
-            "a corrected dataset.");
        return nullptr;
    }
@@ -1582,47 +1562,52 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo)
                CPLGetXMLValue(psIncidenceAngle.get(),
                               "=incidenceAngles.pixelFirstAnglesValue", "0"));
 
-            int stepSize = atoi(CPLGetXMLValue(
+            const int stepSize = atoi(CPLGetXMLValue(
                psIncidenceAngle.get(), "=incidenceAngles.stepSize", "0"));
-
-            int numberOfValues =
+            const int numberOfValues =
                atoi(CPLGetXMLValue(psIncidenceAngle.get(),
                                    "=incidenceAngles.numberOfValues", "0"));
 
-            /* Get the Pixel Per range */
-            int tableSize = abs(stepSize) * abs(numberOfValues);
+            if (!(stepSize == 0 || stepSize == INT_MIN ||
+                  numberOfValues == INT_MIN ||
+                  abs(numberOfValues) > INT_MAX / abs(stepSize)))
+            {
+                /* Get the Pixel Per range */
+                const int tableSize = abs(stepSize) * abs(numberOfValues);
 
-            CPLString angles;
-            // Loop through all nodes with spaces
-            CPLXMLNode *psNextNode =
-                CPLGetXMLNode(psIncidenceAngle.get(), "=incidenceAngles");
+                CPLString angles;
+                // Loop through all nodes with spaces
+                CPLXMLNode *psNextNode =
+                    CPLGetXMLNode(psIncidenceAngle.get(), "=incidenceAngles");
 
-            CPLXMLNode *psNodeInc;
-            for (psNodeInc = psNextNode->psChild; psNodeInc != nullptr;
-                 psNodeInc = psNodeInc->psNext)
-            {
-                if (EQUAL(psNodeInc->pszValue, "angles"))
+                CPLXMLNode *psNodeInc;
+                for (psNodeInc = psNextNode->psChild; psNodeInc != nullptr;
+                     psNodeInc = psNodeInc->psNext)
                {
-                    if (angles.length() > 0)
+                    if (EQUAL(psNodeInc->pszValue, "angles"))
                    {
-                        angles.append(" "); /* separator */
+                        if (angles.length() > 0)
+                        {
+                            angles.append(" "); /* separator */
+                        }
+                        const char *valAngle =
+                            CPLGetXMLValue(psNodeInc, "", "");
+                        angles.append(valAngle);
                    }
-                    const char *valAngle = CPLGetXMLValue(psNodeInc, "", "");
-                    angles.append(valAngle);
                }
-            }
 
-            char **papszAngleList =
-                CSLTokenizeString2(angles, " ", CSLT_HONOURSTRINGS);
+                char **papszAngleList =
+                    CSLTokenizeString2(angles, " ", CSLT_HONOURSTRINGS);
 
-            /* Allocate the right LUT size according to the product range pixel
-             */
-            poDS->m_IncidenceAngleTableSize = tableSize;
-            poDS->m_nfIncidenceAngleTable =
-                InterpolateValues(papszAngleList, tableSize, stepSize,
-                                  numberOfValues, pixelFirstLutValue);
+                /* Allocate the right LUT size according to the product range pixel
+                 */
+                poDS->m_IncidenceAngleTableSize = tableSize;
+                poDS->m_nfIncidenceAngleTable =
+                    InterpolateValues(papszAngleList, tableSize, stepSize,
+                                      numberOfValues, pixelFirstLutValue);
 
-            CSLDestroy(papszAngleList);
+                CSLDestroy(papszAngleList);
+            }
        }
    }
@@ -1962,6 +1947,12 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo)
                    /* we should bomb gracefully... */
                    pszLUT = pszSigma0LUT;
                }
+                if (!pszLUT)
+                {
+                    CPLFree(pszFullname);
+                    CPLError(CE_Failure, CPLE_AppDefined, "LUT missing.");
+                    return nullptr;
+                }
 
                // The variable 'osNoiseLevelsValues' is always the same for a band
                // name, except that the XML contains a different calibration name
@@ -1970,10 +1961,7 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo)
                    // If Complex, always 32 bits
                    RCMCalibRasterBand *poBand = new RCMCalibRasterBand(
                        poDS.get(), pszPole, GDT_Float32, poBandFile.release(),
-                        eCalib, CPLFormFilename(osPath, pszLUT, nullptr),
-                        CPLFormFilename(osPath, osNoiseLevelsValues.c_str(),
-                                        nullptr),
-                        eDataType);
+                        eCalib, pszLUT, osNoiseLevelsValues.c_str(), eDataType);
                    poDS->SetBand(poDS->GetRasterCount() + 1, poBand);
                }
                else
@@ -1981,10 +1969,7 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo)
                    // Whatever the datatype was previously set
                    RCMCalibRasterBand *poBand = new RCMCalibRasterBand(
                        poDS.get(), pszPole, eDataType, poBandFile.release(),
-                        eCalib, CPLFormFilename(osPath, pszLUT, nullptr),
-                        CPLFormFilename(osPath, osNoiseLevelsValues.c_str(),
-                                        nullptr),
-                        eDataType);
+                        eCalib, pszLUT, osNoiseLevelsValues.c_str(), eDataType);
                    poDS->SetBand(poDS->GetRasterCount() + 1, poBand);
                }
            }
@@ -2310,7 +2295,7 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo)
        if (bUseProjInfo)
        {
-            poDS->m_oSRS = oPrj;
+            poDS->m_oSRS = std::move(oPrj);
        }
        else
        {
@@ -2320,7 +2305,7 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo)
        }
    }
 
-    poDS->m_oGCPSRS = oLL;
+    poDS->m_oGCPSRS = std::move(oLL);
 }
 
 /* -------------------------------------------------------------------- */
@@ -2448,33 +2433,25 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo)
        case Sigma0:
        {
            osSubdatasetName = szSIGMA0;
-            CPLString pszDescriptionSigma =
-                FormatCalibration(szSIGMA0, osMDFilename.c_str());
-            osDescription = pszDescriptionSigma;
+            osDescription = FormatCalibration(szSIGMA0, osMDFilename.c_str());
        }
        break;
        case Beta0:
        {
            osSubdatasetName = szBETA0;
-            CPLString pszDescriptionBeta =
-                FormatCalibration(szBETA0, osMDFilename.c_str());
-            osDescription = pszDescriptionBeta;
+            osDescription = FormatCalibration(szBETA0, osMDFilename.c_str());
        }
        break;
        case Gamma:
        {
            osSubdatasetName = szGAMMA;
-            CPLString pszDescriptionGamma =
-                FormatCalibration(szGAMMA, osMDFilename.c_str());
-            osDescription = pszDescriptionGamma;
+            osDescription = FormatCalibration(szGAMMA, osMDFilename.c_str());
        }
        break;
        case Uncalib:
        {
            osSubdatasetName = szUNCALIB;
-            CPLString pszDescriptionUncalib =
-                FormatCalibration(szUNCALIB, osMDFilename.c_str());
-            osDescription = pszDescriptionUncalib;
+            osDescription = FormatCalibration(szUNCALIB, osMDFilename.c_str());
        }
        break;
        default:
diff --git
a/gcore/CMakeLists.txt b/gcore/CMakeLists.txt
index 57a114bc3205..0923511caada 100644
--- a/gcore/CMakeLists.txt
+++ b/gcore/CMakeLists.txt
@@ -104,7 +104,10 @@ if (NOT GDAL_AUTOLOAD_PLUGINS)
               PROPERTY COMPILE_DEFINITIONS GDAL_NO_AUTOLOAD)
 endif ()
 
-if (HAVE_SSSE3_AT_COMPILE_TIME)
+if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS)
+  target_compile_definitions(gcore PRIVATE -DHAVE_SSSE3_AT_COMPILE_TIME -DUSE_NEON_OPTIMIZATIONS)
+  target_sources(gcore PRIVATE rasterio_ssse3.cpp)
+elseif (HAVE_SSSE3_AT_COMPILE_TIME)
   target_compile_definitions(gcore PRIVATE -DHAVE_SSSE3_AT_COMPILE_TIME)
   target_sources(gcore PRIVATE rasterio_ssse3.cpp)
   set_property(
@@ -209,6 +212,8 @@ target_public_header(
   gdalsubdatasetinfo.h
   gdal_typetraits.h
   gdal_adbc.h
+  gdal_minmax_element.hpp
+  gdal_priv_templates.hpp # Required by gdal_minmax_element.hpp
 )
 
 set(GDAL_DATA_FILES
diff --git a/gcore/gdal.h b/gcore/gdal.h
index f7a411c393cb..8638f8501b27 100644
--- a/gcore/gdal.h
+++ b/gcore/gdal.h
@@ -1670,6 +1670,10 @@ CPLErr CPL_DLL CPL_STDCALL GDALSetRasterScale(GDALRasterBandH hBand,
 CPLErr CPL_DLL CPL_STDCALL GDALComputeRasterMinMax(GDALRasterBandH hBand,
                                                    int bApproxOK,
                                                    double adfMinMax[2]);
+CPLErr CPL_DLL GDALComputeRasterMinMaxLocation(GDALRasterBandH hBand,
+                                               double *pdfMin, double *pdfMax,
+                                               int *pnMinX, int *pnMinY,
+                                               int *pnMaxX, int *pnMaxY);
 CPLErr CPL_DLL CPL_STDCALL GDALFlushRasterCache(GDALRasterBandH hBand);
 CPLErr CPL_DLL CPL_STDCALL GDALDropRasterCache(GDALRasterBandH hBand);
 CPLErr CPL_DLL CPL_STDCALL GDALGetRasterHistogram(
diff --git a/gcore/gdal_minmax_element.hpp b/gcore/gdal_minmax_element.hpp
new file mode 100644
index 000000000000..9ceb304553b3
--- /dev/null
+++ b/gcore/gdal_minmax_element.hpp
@@ -0,0 +1,1411 @@
+/******************************************************************************
+ * Project:  GDAL Core
+ * Purpose:  Utility functions to find minimum and maximum values in a buffer
+ * Author:   Even Rouault,
+ *
+ ******************************************************************************
+ * Copyright (c) 2024, Even Rouault
+ *
+ * SPDX-License-Identifier: MIT
+ ****************************************************************************/
+
+#ifndef GDAL_MINMAX_ELEMENT_INCLUDED
+#define GDAL_MINMAX_ELEMENT_INCLUDED
+
+// NOTE: This header requires C++17
+
+// This file may be vendored by other applications than GDAL
+// WARNING: if modifying this file, please also update the upstream GDAL version
+// at https://github.com/OSGeo/gdal/blob/master/gcore/gdal_minmax_element.hpp
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+
+#include "gdal.h"
+
+#ifdef GDAL_COMPILATION
+#define GDAL_MINMAXELT_NS gdal
+#elif !defined(GDAL_MINMAXELT_NS)
+#error "Please define the GDAL_MINMAXELT_NS macro to define the namespace"
+#endif
+
+#ifdef USE_NEON_OPTIMIZATIONS
+#include "include_sse2neon.h"
+#define GDAL_MINMAX_ELEMENT_USE_SSE2
+#else
+#if defined(__x86_64) || defined(_M_X64)
+#define GDAL_MINMAX_ELEMENT_USE_SSE2
+#endif
+#ifdef GDAL_MINMAX_ELEMENT_USE_SSE2
+// SSE2 header
+#include <emmintrin.h>
+#ifdef __SSE4_1__
+#include <smmintrin.h>
+#endif
+#endif
+#endif
+
+#include "gdal_priv_templates.hpp"
+
+namespace GDAL_MINMAXELT_NS
+{
+namespace detail
+{
+
+#ifdef GDAL_MINMAX_ELEMENT_USE_SSE2
+/************************************************************************/
+/*                           compScalar()                               */
+/************************************************************************/
+
+template <bool IS_MAX, class T> inline static bool compScalar(T x, T y)
+{
+    if constexpr (IS_MAX)
+        return x > y;
+    else
+        return x < y;
+}
+
+/************************************************************************/
+/*                        extremum_element()                            */
+/************************************************************************/
+
+template <class T, bool IS_MAX>
+size_t extremum_element(const T *v, size_t size, T noDataValue)
+{
+    static_assert(!(std::is_floating_point_v<T>));
+    if (size == 0)
+        return 0;
+    size_t idx_of_extremum = 0;
+    T extremum = v[0];
+    bool extremum_is_nodata = extremum == noDataValue;
+    size_t i = 1;
+    for (; i < size; ++i)
+    {
+        if (v[i] != noDataValue &&
+            (compScalar<IS_MAX>(v[i], extremum) || extremum_is_nodata))
+        {
+            extremum = v[i];
+            idx_of_extremum = i;
+            extremum_is_nodata = false;
+        }
+    }
+    return idx_of_extremum;
+}
+
+/************************************************************************/
+/*                        extremum_element()                            */
+/************************************************************************/
+
+template <class T, bool IS_MAX> size_t extremum_element(const T *v, size_t size)
+{
+    static_assert(!(std::is_floating_point_v<T>));
+    if (size == 0)
+        return 0;
+    size_t idx_of_extremum = 0;
+    T extremum = v[0];
+    size_t i = 1;
+    for (; i < size; ++i)
+    {
+        if (compScalar<IS_MAX>(v[i], extremum))
+        {
+            extremum = v[i];
+            idx_of_extremum = i;
+        }
+    }
+    return idx_of_extremum;
+}
+
+#ifdef GDAL_MINMAX_ELEMENT_USE_SSE2
+
+/************************************************************************/
+/*                    extremum_element_with_nan()                       */
+/************************************************************************/
+
+static inline int8_t Shift8(uint8_t x)
+{
+    return static_cast<int8_t>(x + std::numeric_limits<int8_t>::min());
+}
+
+static inline int16_t Shift16(uint16_t x)
+{
+    return static_cast<int16_t>(x + std::numeric_limits<int16_t>::min());
+}
+
+CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW
+static inline int32_t Shift32(uint32_t x)
+{
+    x += static_cast<uint32_t>(std::numeric_limits<int32_t>::min());
+    int32_t ret;
+    memcpy(&ret, &x, sizeof(x));
+    return ret;
+}
+
+// Return a __m128[i|d] register with all its elements set to x
+template <class T> static inline auto set1(T x)
+{
+    if constexpr (std::is_same_v<T, uint8_t>)
+        return _mm_set1_epi8(Shift8(x));
+    else if constexpr (std::is_same_v<T, int8_t>)
+        return _mm_set1_epi8(x);
+    else if constexpr (std::is_same_v<T, uint16_t>)
+        return _mm_set1_epi16(Shift16(x));
+    else if constexpr (std::is_same_v<T, int16_t>)
+        return _mm_set1_epi16(x);
+    else if constexpr (std::is_same_v<T, uint32_t>)
+        return _mm_set1_epi32(Shift32(x));
+    else if constexpr (std::is_same_v<T, int32_t>)
+        return _mm_set1_epi32(x);
+    else if constexpr (std::is_same_v<T, float>)
+        return _mm_set1_ps(x);
+    else
+        return _mm_set1_pd(x);
+}
+
+// Return a __m128[i|d] register with all its elements set to x
+template <class T> static inline auto set1_unshifted(T x)
+{
+    if constexpr (std::is_same_v<T, uint8_t>)
+    {
+        int8_t xSigned;
+        memcpy(&xSigned, &x, sizeof(xSigned));
+        return _mm_set1_epi8(xSigned);
+    }
+    else if constexpr (std::is_same_v<T, int8_t>)
+        return _mm_set1_epi8(x);
+    else if constexpr (std::is_same_v<T, uint16_t>)
+    {
+        int16_t xSigned;
+        memcpy(&xSigned, &x, sizeof(xSigned));
+        return _mm_set1_epi16(xSigned);
+    }
+    else if constexpr (std::is_same_v<T, int16_t>)
+        return _mm_set1_epi16(x);
+    else if constexpr (std::is_same_v<T, uint32_t>)
+    {
+        int32_t xSigned;
+        memcpy(&xSigned, &x, sizeof(xSigned));
+        return _mm_set1_epi32(xSigned);
+    }
+    else if constexpr (std::is_same_v<T, int32_t>)
+        return _mm_set1_epi32(x);
+    else if constexpr (std::is_same_v<T, float>)
+        return _mm_set1_ps(x);
+    else
+        return _mm_set1_pd(x);
+}
+
+// Load as many values of type T as a __m128[i|d] register can contain from x
+template <class T> static inline auto loadv(const T *x)
+{
+    if constexpr (std::is_same_v<T, float>)
+        return _mm_loadu_ps(x);
+    else if constexpr (std::is_same_v<T, double>)
+        return _mm_loadu_pd(x);
+    else
+        return _mm_loadu_si128(reinterpret_cast<const __m128i *>(x));
+}
+
+// Return a __m128i register with bits set when x[i] < y[i] when !IS_MAX
+// or x[i] > y[i] when IS_MAX
+template <class T, bool IS_MAX, class SSE_T>
+static inline __m128i comp(SSE_T x, SSE_T y)
+{
+    if constexpr (IS_MAX)
+    {
+        if constexpr (std::is_same_v<T, uint8_t>)
+            return _mm_cmpgt_epi8(
+                _mm_add_epi8(x,
+                             _mm_set1_epi8(std::numeric_limits<int8_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int8_t>)
+            return _mm_cmpgt_epi8(x, y);
+        else if constexpr (std::is_same_v<T, uint16_t>)
+            return _mm_cmpgt_epi16(
+                _mm_add_epi16(
+                    x, _mm_set1_epi16(std::numeric_limits<int16_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int16_t>)
+            return _mm_cmpgt_epi16(x, y);
+        else if constexpr (std::is_same_v<T, uint32_t>)
+            return _mm_cmpgt_epi32(
+                _mm_add_epi32(
+                    x, _mm_set1_epi32(std::numeric_limits<int32_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int32_t>)
+            return _mm_cmpgt_epi32(x, y);
+        // We could use _mm_cmpgt_pX() if there were no NaN values
+        else if constexpr (std::is_same_v<T, float>)
+            return _mm_castps_si128(_mm_cmpnle_ps(x, y));
+        else
+            return _mm_castpd_si128(_mm_cmpnle_pd(x, y));
+    }
+    else
+    {
+        if constexpr (std::is_same_v<T, uint8_t>)
+            return _mm_cmplt_epi8(
+                _mm_add_epi8(x,
+                             _mm_set1_epi8(std::numeric_limits<int8_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int8_t>)
+            return _mm_cmplt_epi8(x, y);
+        else if constexpr (std::is_same_v<T, uint16_t>)
+            return _mm_cmplt_epi16(
+                _mm_add_epi16(
+                    x, _mm_set1_epi16(std::numeric_limits<int16_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int16_t>)
+            return _mm_cmplt_epi16(x, y);
+        else if constexpr (std::is_same_v<T, uint32_t>)
+            return _mm_cmplt_epi32(
+                _mm_add_epi32(
+                    x, _mm_set1_epi32(std::numeric_limits<int32_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int32_t>)
+            return _mm_cmplt_epi32(x, y);
+        // We could use _mm_cmplt_pX() if there were no NaN values
+        else if constexpr (std::is_same_v<T, float>)
+            return _mm_castps_si128(_mm_cmpnge_ps(x, y));
+        else
+            return _mm_castpd_si128(_mm_cmpnge_pd(x, y));
+    }
+}
+
+template <class T, class SSE_T> static inline SSE_T compeq(SSE_T a, SSE_T b);
+
+template <> __m128i compeq<uint8_t>(__m128i a, __m128i b)
+{
+    return _mm_cmpeq_epi8(a, b);
+}
+
+template <> __m128i compeq<int8_t>(__m128i a, __m128i b)
+{
+    return _mm_cmpeq_epi8(a, b);
+}
+
+template <> __m128i compeq<uint16_t>(__m128i a, __m128i b)
+{
+    return _mm_cmpeq_epi16(a, b);
+}
+
+template <> __m128i compeq<int16_t>(__m128i a, __m128i b)
+{
+    return _mm_cmpeq_epi16(a, b);
+}
+
+template <> __m128i compeq<uint32_t>(__m128i a, __m128i b)
+{
+    return _mm_cmpeq_epi32(a, b);
+}
+
+template <> __m128i compeq<int32_t>(__m128i a, __m128i b)
+{
+    return _mm_cmpeq_epi32(a, b);
+}
+
+template <> __m128 compeq<float>(__m128 a, __m128 b)
+{
+    return _mm_cmpeq_ps(a, b);
+}
+
+template <> __m128d compeq<double>(__m128d a, __m128d b)
+{
+    return _mm_cmpeq_pd(a, b);
+}
+
+template <class T> static inline T blendv(T a, T b, T mask);
+
+template <> __m128i blendv(__m128i a, __m128i b, __m128i mask)
+{
+#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
+    return _mm_blendv_epi8(a, b, mask);
+#else
+    return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
+#endif
+}
+
+template <> __m128 blendv(__m128 a, __m128 b, __m128 mask)
+{
+#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
+    return _mm_blendv_ps(a, b, mask);
+#else
+    return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b));
+#endif
+}
+
+template <> __m128d blendv(__m128d a, __m128d b, __m128d mask)
+{
+#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
+    return _mm_blendv_pd(a, b, mask);
+#else
+    return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, b));
+#endif
+}
+
+// Using SSE2
+template <class T, bool IS_MAX>
+inline size_t extremum_element_with_nan(const T *v, size_t size, T noDataValue)
+{
static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_floating_point_v); + if (size == 0) + return 0; + size_t idx_of_extremum = 0; + T extremum = v[0]; + [[maybe_unused]] bool extremum_is_invalid = false; + if constexpr (std::is_floating_point_v) + { + extremum_is_invalid = std::isnan(extremum); + } + if constexpr (HAS_NODATA) + { + if (extremum == noDataValue) + extremum_is_invalid = true; + } + size_t i = 1; + + constexpr size_t VALS_PER_REG = sizeof(set1(extremum)) / sizeof(extremum); + constexpr int LOOP_UNROLLING = 4; + // If changing the value, then we need to adjust the number of sse_valX + // loading in the loop. + static_assert(LOOP_UNROLLING == 4); + constexpr size_t VALS_PER_ITER = VALS_PER_REG * LOOP_UNROLLING; + + const auto update = [v, noDataValue, &extremum, &idx_of_extremum, + &extremum_is_invalid](size_t idx) + { + if constexpr (HAS_NODATA) + { + if (v[idx] == noDataValue) + return; + if (extremum_is_invalid) + { + if constexpr (std::is_floating_point_v) + { + if (std::isnan(v[idx])) + return; + } + extremum = v[idx]; + idx_of_extremum = idx; + extremum_is_invalid = false; + return; + } + } + else + { + CPL_IGNORE_RET_VAL(noDataValue); + } + if (compScalar(v[idx], extremum)) + { + extremum = v[idx]; + idx_of_extremum = idx; + extremum_is_invalid = false; + } + else if constexpr (std::is_floating_point_v) + { + if (extremum_is_invalid && !std::isnan(v[idx])) + { + extremum = v[idx]; + idx_of_extremum = idx; + extremum_is_invalid = false; + } + } + }; + + for (; i < VALS_PER_ITER && i < size; ++i) + { + update(i); + } + + [[maybe_unused]] auto sse_neutral = set1_unshifted(static_cast(0)); + [[maybe_unused]] auto sse_nodata = set1_unshifted(noDataValue); + if constexpr (HAS_NODATA) + { + for (; i < size && extremum_is_invalid; ++i) + { + update(i); + } + if (!extremum_is_invalid) + { + for (; i < size && (i % VALS_PER_ITER) != 0; ++i) + { + update(i); + } + sse_neutral = set1_unshifted(extremum); + } + } + + auto sse_extremum = set1(extremum); + + [[maybe_unused]] size_t hits = 0; + const auto sse_iter_count = (size / VALS_PER_ITER) * VALS_PER_ITER; + for (; i < sse_iter_count; i += VALS_PER_ITER) + { + // A bit of loop unrolling to save 3/4 of slow movemask operations. + auto sse_val0 = loadv(v + i + 0 * VALS_PER_REG); + auto sse_val1 = loadv(v + i + 1 * VALS_PER_REG); + auto sse_val2 = loadv(v + i + 2 * VALS_PER_REG); + auto sse_val3 = loadv(v + i + 3 * VALS_PER_REG); + + if constexpr (HAS_NODATA) + { + // Replace all components that are at the nodata value by a + // neutral value (current minimum) + const auto replaceNoDataByNeutral = + [sse_neutral, sse_nodata](auto sse_val) + { + const auto eq_nodata = compeq(sse_val, sse_nodata); + return blendv(sse_val, sse_neutral, eq_nodata); + }; + + sse_val0 = replaceNoDataByNeutral(sse_val0); + sse_val1 = replaceNoDataByNeutral(sse_val1); + sse_val2 = replaceNoDataByNeutral(sse_val2); + sse_val3 = replaceNoDataByNeutral(sse_val3); + } + + if (_mm_movemask_epi8(_mm_or_si128( + _mm_or_si128(comp(sse_val0, sse_extremum), + comp(sse_val1, sse_extremum)), + _mm_or_si128(comp(sse_val2, sse_extremum), + comp(sse_val3, sse_extremum)))) != 0) + { + if constexpr (!std::is_same_v && + !std::is_same_v) + { + // The above tests excluding int8_t/uint8_t is due to the fact + // with those small ranges of values we will quickly converge + // to the minimum, so no need to do the below "smart" test. 
+ + if (++hits == size / 16) + { + // If we have an almost sorted array, then using this code path + // will hurt performance. Arbitrary give up if we get here + // more than 1. / 16 of the size of the array. + // fprintf(stderr, "going to non-vector path\n"); + break; + } + } + for (size_t j = 0; j < VALS_PER_ITER; j++) + { + update(i + j); + } + sse_extremum = set1(extremum); + if constexpr (HAS_NODATA) + { + sse_neutral = set1_unshifted(extremum); + } + } + } + for (; i < size; ++i) + { + update(i); + } + return idx_of_extremum; +} + +#else + +/************************************************************************/ +/* extremum_element_with_nan() */ +/************************************************************************/ + +template +inline size_t extremum_element_with_nan(const T *v, size_t size, T /* nodata */) +{ + static_assert(!HAS_NODATA); + if (size == 0) + return 0; + size_t idx_of_extremum = 0; + auto extremum = v[0]; + bool extremum_is_nan = std::isnan(extremum); + size_t i = 1; + for (; i < size; ++i) + { + if (compScalar(v[i], extremum) || + (extremum_is_nan && !std::isnan(v[i]))) + { + extremum = v[i]; + idx_of_extremum = i; + extremum_is_nan = false; + } + } + return idx_of_extremum; +} +#endif + +/************************************************************************/ +/* extremum_element() */ +/************************************************************************/ + +#ifdef GDAL_MINMAX_ELEMENT_USE_SSE2 + +template <> +size_t extremum_element(const uint8_t *v, size_t size, + uint8_t noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const uint8_t *v, size_t size, + uint8_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); +} + +template <> +size_t extremum_element(const uint8_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const uint8_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const int8_t *v, size_t size, + int8_t noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const int8_t *v, size_t size, + int8_t noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> size_t extremum_element(const int8_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> size_t extremum_element(const int8_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const uint16_t *v, size_t size, + uint16_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); +} + +template <> +size_t extremum_element(const uint16_t *v, size_t size, + uint16_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); +} + +template <> +size_t extremum_element(const uint16_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const uint16_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const int16_t *v, size_t size, + int16_t noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const int16_t *v, size_t size, + int16_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); +} + +template <> +size_t extremum_element(const int16_t *v, size_t size) +{ + 
return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const int16_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const uint32_t *v, size_t size, + uint32_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); +} + +template <> +size_t extremum_element(const uint32_t *v, size_t size, + uint32_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); +} + +template <> +size_t extremum_element(const uint32_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const uint32_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const int32_t *v, size_t size, + int32_t noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const int32_t *v, size_t size, + int32_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); +} + +template <> +size_t extremum_element(const int32_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const int32_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const float *v, size_t size, + float noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const float *v, size_t size, + float noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const double *v, size_t size, + double noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const double *v, size_t size, + double noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +#endif + +template <> size_t extremum_element(const float *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> size_t extremum_element(const double *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> size_t extremum_element(const float *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> size_t extremum_element(const double *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +/************************************************************************/ +/* extremum_element_with_nan() */ +/************************************************************************/ + +template +inline size_t extremum_element_with_nan(const T *v, size_t size, T noDataValue) +{ + if (std::isnan(noDataValue)) + return extremum_element_with_nan(v, size, 0); + if (size == 0) + return 0; + size_t idx_of_extremum = 0; + auto extremum = v[0]; + bool extremum_is_nan_or_nodata = + std::isnan(extremum) || (extremum == noDataValue); + size_t i = 1; + for (; i < size; ++i) + { + if (v[i] != noDataValue && + (compScalar(v[i], extremum) || + (extremum_is_nan_or_nodata && !std::isnan(v[i])))) + { + extremum = v[i]; + idx_of_extremum = i; + extremum_is_nan_or_nodata = false; + } + } + return idx_of_extremum; +} + +/************************************************************************/ +/* extremum_element() */ +/************************************************************************/ + +#if !defined(GDAL_MINMAX_ELEMENT_USE_SSE2) + +template <> +size_t extremum_element(const float *v, size_t size, + float 
noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const float *v, size_t size, + float noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const double *v, size_t size, + double noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const double *v, size_t size, + double noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +#endif + +template +inline size_t extremum_element(const T *buffer, size_t size, bool bHasNoData, + T noDataValue) +{ + if (bHasNoData) + return extremum_element(buffer, size, noDataValue); + else + return extremum_element(buffer, size); +} + +#else + +template +inline size_t extremum_element(const T *buffer, size_t size, bool bHasNoData, + T noDataValue) +{ + if (bHasNoData) + { + if constexpr (std::is_floating_point_v) + { + if (std::isnan(noDataValue)) + { + if constexpr (IS_MAX) + { + return std::max_element(buffer, buffer + size, + [](T a, T b) { + return std::isnan(b) ? false + : std::isnan(a) ? true + : a < b; + }) - + buffer; + } + else + { + return std::min_element(buffer, buffer + size, + [](T a, T b) { + return std::isnan(b) ? true + : std::isnan(a) ? false + : a < b; + }) - + buffer; + } + } + else + { + if constexpr (IS_MAX) + { + return std::max_element(buffer, buffer + size, + [noDataValue](T a, T b) + { + return std::isnan(b) ? false + : std::isnan(a) ? true + : (b == noDataValue) + ? false + : (a == noDataValue) + ? true + : a < b; + }) - + buffer; + } + else + { + return std::min_element(buffer, buffer + size, + [noDataValue](T a, T b) + { + return std::isnan(b) ? true + : std::isnan(a) ? false + : (b == noDataValue) + ? true + : (a == noDataValue) + ? false + : a < b; + }) - + buffer; + } + } + } + else + { + if constexpr (IS_MAX) + { + return std::max_element(buffer, buffer + size, + [noDataValue](T a, T b) { + return (b == noDataValue) ? false + : (a == noDataValue) ? true + : a < b; + }) - + buffer; + } + else + { + return std::min_element(buffer, buffer + size, + [noDataValue](T a, T b) { + return (b == noDataValue) ? true + : (a == noDataValue) ? false + : a < b; + }) - + buffer; + } + } + } + else + { + if constexpr (std::is_floating_point_v) + { + if constexpr (IS_MAX) + { + return std::max_element(buffer, buffer + size, + [](T a, T b) { + return std::isnan(b) ? false + : std::isnan(a) ? true + : a < b; + }) - + buffer; + } + else + { + return std::min_element(buffer, buffer + size, + [](T a, T b) { + return std::isnan(b) ? true + : std::isnan(a) ? false + : a < b; + }) - + buffer; + } + } + else + { + if constexpr (IS_MAX) + { + return std::max_element(buffer, buffer + size) - buffer; + } + else + { + return std::min_element(buffer, buffer + size) - buffer; + } + } + } +} +#endif + +template +size_t extremum_element(const void *buffer, size_t nElts, GDALDataType eDT, + bool bHasNoData, double dfNoDataValue) +{ + switch (eDT) + { +#if GDAL_VERSION_NUM >= GDAL_COMPUTE_VERSION(3, 7, 0) + case GDT_Int8: + { + using T = int8_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } +#endif + case GDT_Byte: + { + using T = uint8_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? 
static_cast(dfNoDataValue) : 0); + } + case GDT_Int16: + { + using T = int16_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt16: + { + using T = uint16_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Int32: + { + using T = int32_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt32: + { + using T = uint32_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } +#if GDAL_VERSION_NUM >= GDAL_COMPUTE_VERSION(3, 5, 0) + case GDT_Int64: + { + using T = int64_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt64: + { + using T = uint64_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } +#endif + case GDT_Float32: + { + using T = float; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Float64: + { + using T = double; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + default: + break; + } + CPLError(CE_Failure, CPLE_NotSupported, + "%s not supported for this data type.", __FUNCTION__); + return 0; +} + +} // namespace detail + +/************************************************************************/ +/* max_element() */ +/************************************************************************/ + +/** Return the index of the element where the maximum value is hit. + * + * If it is hit in several locations, it is not specified which one will be + * returned. + * + * @param buffer Vector of nElts elements of type eDT. + * @param nElts Number of elements in buffer. + * @param eDT Data type of the elements of buffer. + * @param bHasNoData Whether dfNoDataValue is valid. + * @param dfNoDataValue Nodata value, only taken into account if bHasNoData == true + * + * @since GDAL 3.11 + */ +inline size_t max_element(const void *buffer, size_t nElts, GDALDataType eDT, + bool bHasNoData, double dfNoDataValue) +{ + return detail::extremum_element(buffer, nElts, eDT, bHasNoData, + dfNoDataValue); +} + +/************************************************************************/ +/* min_element() */ +/************************************************************************/ + +/** Return the index of the element where the minimum value is hit. + * + * If it is hit in several locations, it is not specified which one will be + * returned. + * + * @param buffer Vector of nElts elements of type eDT. + * @param nElts Number of elements in buffer. + * @param eDT Data type of the elements of buffer. + * @param bHasNoData Whether dfNoDataValue is valid. 
+ * @param dfNoDataValue Nodata value, only taken into account if bHasNoData == true + * + * @since GDAL 3.11 + */ +inline size_t min_element(const void *buffer, size_t nElts, GDALDataType eDT, + bool bHasNoData, double dfNoDataValue) +{ + return detail::extremum_element(buffer, nElts, eDT, bHasNoData, + dfNoDataValue); +} + +namespace detail +{ + +#ifdef NOT_EFFICIENT + +/************************************************************************/ +/* minmax_element() */ +/************************************************************************/ + +template +std::pair minmax_element(const T *v, size_t size, T noDataValue) +{ + static_assert(!(std::is_floating_point_v)); + if (size == 0) + return std::pair(0, 0); + size_t idx_of_min = 0; + size_t idx_of_max = 0; + T vmin = v[0]; + T vmax = v[0]; + bool extremum_is_nodata = vmin == noDataValue; + size_t i = 1; + for (; i < size; ++i) + { + if (v[i] != noDataValue && (v[i] < vmin || extremum_is_nodata)) + { + vmin = v[i]; + idx_of_min = i; + extremum_is_nodata = false; + } + if (v[i] != noDataValue && (v[i] > vmax || extremum_is_nodata)) + { + vmax = v[i]; + idx_of_max = i; + extremum_is_nodata = false; + } + } + return std::pair(idx_of_min, idx_of_max); +} + +template +std::pair minmax_element(const T *v, size_t size) +{ + static_assert(!(std::is_floating_point_v)); + if (size == 0) + return std::pair(0, 0); + size_t idx_of_min = 0; + size_t idx_of_max = 0; + T vmin = v[0]; + T vmax = v[0]; + size_t i = 1; + for (; i < size; ++i) + { + if (v[i] < vmin) + { + vmin = v[i]; + idx_of_min = i; + } + if (v[i] > vmax) + { + vmax = v[i]; + idx_of_max = i; + } + } + return std::pair(idx_of_min, idx_of_max); +} + +template +inline std::pair minmax_element_with_nan(const T *v, + size_t size) +{ + if (size == 0) + return std::pair(0, 0); + size_t idx_of_min = 0; + size_t idx_of_max = 0; + T vmin = v[0]; + T vmax = v[0]; + size_t i = 1; + if (std::isnan(v[0])) + { + for (; i < size; ++i) + { + if (!std::isnan(v[i])) + { + vmin = v[i]; + idx_of_min = i; + vmax = v[i]; + idx_of_max = i; + break; + } + } + } + for (; i < size; ++i) + { + if (v[i] < vmin) + { + vmin = v[i]; + idx_of_min = i; + } + if (v[i] > vmax) + { + vmax = v[i]; + idx_of_max = i; + } + } + return std::pair(idx_of_min, idx_of_max); +} + +template <> +std::pair minmax_element(const float *v, size_t size) +{ + return minmax_element_with_nan(v, size); +} + +template <> +std::pair minmax_element(const double *v, size_t size) +{ + return minmax_element_with_nan(v, size); +} + +template +inline std::pair minmax_element(const T *buffer, size_t size, + bool bHasNoData, T noDataValue) +{ + if (bHasNoData) + { + return minmax_element(buffer, size, noDataValue); + } + else + { + return minmax_element(buffer, size); + } +} +#else + +/************************************************************************/ +/* minmax_element() */ +/************************************************************************/ + +template +inline std::pair minmax_element(const T *buffer, size_t size, + bool bHasNoData, T noDataValue) +{ +#ifdef NOT_EFFICIENT + if (bHasNoData) + { + return minmax_element(buffer, size, noDataValue); + } + else + { + return minmax_element(buffer, size); + //auto [imin, imax] = std::minmax_element(buffer, buffer + size); + //return std::pair(imin - buffer, imax - buffer); + } +#else + +#if !defined(GDAL_MINMAX_ELEMENT_USE_SSE2) + if constexpr (!std::is_floating_point_v) + { + if (!bHasNoData) + { + auto [min_iter, max_iter] = + std::minmax_element(buffer, buffer + size); + return 
std::pair(min_iter - buffer, max_iter - buffer); + } + } +#endif + + // Using separately min and max is more efficient than computing them + // within the same loop + return std::pair( + extremum_element(buffer, size, bHasNoData, noDataValue), + extremum_element(buffer, size, bHasNoData, noDataValue)); + +#endif +} +#endif + +} // namespace detail + +/************************************************************************/ +/* minmax_element() */ +/************************************************************************/ + +/** Return the index of the elements where the minimum and maximum values are hit. + * + * If they are hit in several locations, it is not specified which one will be + * returned (contrary to std::minmax_element). + * + * @param buffer Vector of nElts elements of type eDT. + * @param nElts Number of elements in buffer. + * @param eDT Data type of the elements of buffer. + * @param bHasNoData Whether dfNoDataValue is valid. + * @param dfNoDataValue Nodata value, only taken into account if bHasNoData == true + * + * @since GDAL 3.11 + */ +inline std::pair minmax_element(const void *buffer, + size_t nElts, GDALDataType eDT, + bool bHasNoData, + double dfNoDataValue) +{ + switch (eDT) + { +#if GDAL_VERSION_NUM >= GDAL_COMPUTE_VERSION(3, 7, 0) + case GDT_Int8: + { + using T = int8_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } +#endif + case GDT_Byte: + { + using T = uint8_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Int16: + { + using T = int16_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt16: + { + using T = uint16_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Int32: + { + using T = int32_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt32: + { + using T = uint32_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } +#if GDAL_VERSION_NUM >= GDAL_COMPUTE_VERSION(3, 5, 0) + case GDT_Int64: + { + using T = int64_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt64: + { + using T = uint64_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } +#endif + case GDT_Float32: + { + using T = float; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? 
static_cast(dfNoDataValue) : 0); + } + case GDT_Float64: + { + using T = double; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + default: + break; + } + CPLError(CE_Failure, CPLE_NotSupported, + "%s not supported for this data type.", __FUNCTION__); + return std::pair(0, 0); +} + +} // namespace GDAL_MINMAXELT_NS + +#endif // GDAL_MINMAX_ELEMENT_INCLUDED diff --git a/gcore/gdal_priv.h b/gcore/gdal_priv.h index 20dcc3734cf3..c615a3a90319 100644 --- a/gcore/gdal_priv.h +++ b/gcore/gdal_priv.h @@ -1810,6 +1810,9 @@ class CPL_DLL GDALRasterBand : public GDALMajorObject virtual CPLErr SetStatistics(double dfMin, double dfMax, double dfMean, double dfStdDev); virtual CPLErr ComputeRasterMinMax(int bApproxOK, double *adfMinMax); + virtual CPLErr ComputeRasterMinMaxLocation(double *pdfMin, double *pdfMax, + int *pnMinX, int *pnMinY, + int *pnMaxX, int *pnMaxY); // Only defined when Doxygen enabled #ifdef DOXYGEN_SKIP diff --git a/gcore/gdal_priv_templates.hpp b/gcore/gdal_priv_templates.hpp index 3c20c055687b..cb1631485cf1 100644 --- a/gcore/gdal_priv_templates.hpp +++ b/gcore/gdal_priv_templates.hpp @@ -585,9 +585,14 @@ inline void GDALCopy8Words(const Tin *pValueIn, Tout *const pValueOut) } // Needs SSE2 -#if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) +#if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) || \ + defined(USE_NEON_OPTIMIZATIONS) +#ifdef USE_NEON_OPTIMIZATIONS +#include "include_sse2neon.h" +#else #include +#endif static inline void GDALCopyXMMToInt32(const __m128i xmm, void *pDest) { diff --git a/gcore/gdalrasterband.cpp b/gcore/gdalrasterband.cpp index bf2da7a2893f..87c814efdedf 100644 --- a/gcore/gdalrasterband.cpp +++ b/gcore/gdalrasterband.cpp @@ -38,6 +38,7 @@ #include "gdal_rat.h" #include "gdal_priv_templates.hpp" #include "gdal_interpolateatpoint.h" +#include "gdal_minmax_element.hpp" /************************************************************************/ /* GDALRasterBand() */ @@ -7415,6 +7416,288 @@ CPLErr CPL_STDCALL GDALComputeRasterMinMax(GDALRasterBandH hBand, int bApproxOK, return poBand->ComputeRasterMinMax(bApproxOK, adfMinMax); } +/************************************************************************/ +/* ComputeRasterMinMaxLocation() */ +/************************************************************************/ + +/** + * \brief Compute the min/max values for a band, and their location. + * + * Pixels whose value matches the nodata value or are masked by the mask + * band are ignored. + * + * If the minimum or maximum value is hit in several locations, it is not + * specified which one will be returned. + * + * @param[out] pdfMin Pointer to the minimum value. + * @param[out] pdfMax Pointer to the maximum value. + * @param[out] pnMinX Pointer to the column where the minimum value is hit. + * @param[out] pnMinY Pointer to the line where the minimum value is hit. + * @param[out] pnMaxX Pointer to the column where the maximum value is hit. + * @param[out] pnMaxY Pointer to the line where the maximum value is hit. + * + * @return CE_None in case of success, CE_Warning if there are no valid values, + * CE_Failure in case of error. 
+ *
+ * @since GDAL 3.11
+ */
+
+CPLErr GDALRasterBand::ComputeRasterMinMaxLocation(double *pdfMin,
+                                                   double *pdfMax, int *pnMinX,
+                                                   int *pnMinY, int *pnMaxX,
+                                                   int *pnMaxY)
+{
+    int nMinX = -1;
+    int nMinY = -1;
+    int nMaxX = -1;
+    int nMaxY = -1;
+    double dfMin = std::numeric_limits<double>::infinity();
+    double dfMax = -std::numeric_limits<double>::infinity();
+    if (pdfMin)
+        *pdfMin = dfMin;
+    if (pdfMax)
+        *pdfMax = dfMax;
+    if (pnMinX)
+        *pnMinX = nMinX;
+    if (pnMinY)
+        *pnMinY = nMinY;
+    if (pnMaxX)
+        *pnMaxX = nMaxX;
+    if (pnMaxY)
+        *pnMaxY = nMaxY;
+
+    if (GDALDataTypeIsComplex(eDataType))
+    {
+        CPLError(CE_Failure, CPLE_NotSupported,
+                 "Complex data type not supported");
+        return CE_Failure;
+    }
+
+    if (!InitBlockInfo())
+        return CE_Failure;
+
+    int bGotNoDataValue = FALSE;
+    const double dfNoDataValue = GetNoDataValue(&bGotNoDataValue);
+    bGotNoDataValue = bGotNoDataValue && !std::isnan(dfNoDataValue);
+    bool bGotFloatNoDataValue = false;
+    float fNoDataValue = 0.0f;
+    ComputeFloatNoDataValue(eDataType, dfNoDataValue, bGotNoDataValue,
+                            fNoDataValue, bGotFloatNoDataValue);
+
+    GDALRasterBand *poMaskBand = nullptr;
+    if (!bGotNoDataValue)
+    {
+        const int l_nMaskFlags = GetMaskFlags();
+        if (l_nMaskFlags != GMF_ALL_VALID && l_nMaskFlags != GMF_NODATA &&
+            GetColorInterpretation() != GCI_AlphaBand)
+        {
+            poMaskBand = GetMaskBand();
+        }
+    }
+
+    bool bSignedByte = false;
+    if (eDataType == GDT_Byte)
+    {
+        EnablePixelTypeSignedByteWarning(false);
+        const char *pszPixelType =
+            GetMetadataItem("PIXELTYPE", "IMAGE_STRUCTURE");
+        EnablePixelTypeSignedByteWarning(true);
+        bSignedByte =
+            pszPixelType != nullptr && EQUAL(pszPixelType, "SIGNEDBYTE");
+    }
+
+    GByte *pabyMaskData = nullptr;
+    if (poMaskBand)
+    {
+        pabyMaskData =
+            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nBlockXSize, nBlockYSize));
+        if (!pabyMaskData)
+        {
+            return CE_Failure;
+        }
+    }
+
+    const GIntBig nTotalBlocks =
+        static_cast<GIntBig>(nBlocksPerRow) * nBlocksPerColumn;
+    bool bNeedsMin = pdfMin || pnMinX || pnMinY;
+    bool bNeedsMax = pdfMax || pnMaxX || pnMaxY;
+    for (GIntBig iBlock = 0; iBlock < nTotalBlocks; ++iBlock)
+    {
+        const int iYBlock = static_cast<int>(iBlock / nBlocksPerRow);
+        const int iXBlock = static_cast<int>(iBlock % nBlocksPerRow);
+
+        GDALRasterBlock *poBlock = GetLockedBlockRef(iXBlock, iYBlock);
+        if (poBlock == nullptr)
+        {
+            CPLFree(pabyMaskData);
+            return CE_Failure;
+        }
+
+        void *const pData = poBlock->GetDataRef();
+
+        int nXCheck = 0, nYCheck = 0;
+        GetActualBlockSize(iXBlock, iYBlock, &nXCheck, &nYCheck);
+
+        if (poMaskBand &&
+            poMaskBand->RasterIO(GF_Read, iXBlock * nBlockXSize,
+                                 iYBlock * nBlockYSize, nXCheck, nYCheck,
+                                 pabyMaskData, nXCheck, nYCheck, GDT_Byte, 0,
+                                 nBlockXSize, nullptr) != CE_None)
+        {
+            poBlock->DropLock();
+            CPLFree(pabyMaskData);
+            return CE_Failure;
+        }
+
+        if (poMaskBand || nYCheck < nBlockYSize || nXCheck < nBlockXSize)
+        {
+            for (int iY = 0; iY < nYCheck; ++iY)
+            {
+                for (int iX = 0; iX < nXCheck; ++iX)
+                {
+                    const GPtrDiff_t iOffset =
+                        iX + static_cast<GPtrDiff_t>(iY) * nBlockXSize;
+                    if (pabyMaskData && pabyMaskData[iOffset] == 0)
+                        continue;
+                    bool bValid = true;
+                    double dfValue = GetPixelValue(
+                        eDataType, bSignedByte, pData, iOffset, bGotNoDataValue,
+                        dfNoDataValue, bGotFloatNoDataValue, fNoDataValue,
+                        bValid);
+                    if (!bValid)
+                        continue;
+                    if (dfValue < dfMin)
+                    {
+                        dfMin = dfValue;
+                        nMinX = iXBlock * nBlockXSize + iX;
+                        nMinY = iYBlock * nBlockYSize + iY;
+                    }
+                    if (dfValue > dfMax)
+                    {
+                        dfMax = dfValue;
+                        nMaxX = iXBlock * nBlockXSize + iX;
+                        nMaxY = iYBlock * nBlockYSize + iY;
+                    }
+                }
+            }
+        }
+        else
+        {
+            size_t pos_min = 0;
+            size_t pos_max = 0;
+            const auto eEffectiveDT = bSignedByte ? GDT_Int8 : eDataType;
+            if (bNeedsMin && bNeedsMax)
+            {
+                std::tie(pos_min, pos_max) = gdal::minmax_element(
+                    pData, static_cast<size_t>(nBlockXSize) * nBlockYSize,
+                    eEffectiveDT, bGotNoDataValue, dfNoDataValue);
+            }
+            else if (bNeedsMin)
+            {
+                pos_min = gdal::min_element(
+                    pData, static_cast<size_t>(nBlockXSize) * nBlockYSize,
+                    eEffectiveDT, bGotNoDataValue, dfNoDataValue);
+            }
+            else if (bNeedsMax)
+            {
+                pos_max = gdal::max_element(
+                    pData, static_cast<size_t>(nBlockXSize) * nBlockYSize,
+                    eEffectiveDT, bGotNoDataValue, dfNoDataValue);
+            }
+
+            if (bNeedsMin)
+            {
+                const int nMinXBlock = static_cast<int>(pos_min % nBlockXSize);
+                const int nMinYBlock = static_cast<int>(pos_min / nBlockXSize);
+                bool bValid = true;
+                const double dfMinValueBlock = GetPixelValue(
+                    eDataType, bSignedByte, pData, pos_min, bGotNoDataValue,
+                    dfNoDataValue, bGotFloatNoDataValue, fNoDataValue, bValid);
+                if (bValid && dfMinValueBlock < dfMin)
+                {
+                    dfMin = dfMinValueBlock;
+                    nMinX = iXBlock * nBlockXSize + nMinXBlock;
+                    nMinY = iYBlock * nBlockYSize + nMinYBlock;
+                }
+            }
+
+            if (bNeedsMax)
+            {
+                const int nMaxXBlock = static_cast<int>(pos_max % nBlockXSize);
+                const int nMaxYBlock = static_cast<int>(pos_max / nBlockXSize);
+                bool bValid = true;
+                const double dfMaxValueBlock = GetPixelValue(
+                    eDataType, bSignedByte, pData, pos_max, bGotNoDataValue,
+                    dfNoDataValue, bGotFloatNoDataValue, fNoDataValue, bValid);
+                if (bValid && dfMaxValueBlock > dfMax)
+                {
+                    dfMax = dfMaxValueBlock;
+                    nMaxX = iXBlock * nBlockXSize + nMaxXBlock;
+                    nMaxY = iYBlock * nBlockYSize + nMaxYBlock;
+                }
+            }
+        }
+
+        poBlock->DropLock();
+
+        if (eDataType == GDT_Byte)
+        {
+            if (bNeedsMin && dfMin == 0)
+            {
+                bNeedsMin = false;
+            }
+            if (bNeedsMax && dfMax == 255)
+            {
+                bNeedsMax = false;
+            }
+            if (!bNeedsMin && !bNeedsMax)
+            {
+                break;
+            }
+        }
+    }
+
+    CPLFree(pabyMaskData);
+
+    if (pdfMin)
+        *pdfMin = dfMin;
+    if (pdfMax)
+        *pdfMax = dfMax;
+    if (pnMinX)
+        *pnMinX = nMinX;
+    if (pnMinY)
+        *pnMinY = nMinY;
+    if (pnMaxX)
+        *pnMaxX = nMaxX;
+    if (pnMaxY)
+        *pnMaxY = nMaxY;
+    return ((bNeedsMin && nMinX < 0) || (bNeedsMax && nMaxX < 0)) ? CE_Warning
+                                                                  : CE_None;
+}
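Full block interiors take the fast path above: the whole block is handed to the new header as one flat buffer, and only the winning index is re-read through GetPixelValue(). A minimal standalone sketch of that header API, assuming compilation within GDAL where GDAL_MINMAXELT_NS resolves to gdal (vendoring applications define their own namespace):

    #include "gdal_minmax_element.hpp"

    #include <cstdio>
    #include <vector>

    int main()
    {
        const std::vector<float> v{3.f, 1.f, 4.f, 1.f, 5.f, 9.f, 2.f, 6.f};
        // bHasNoData == false: every element takes part in the search.
        const auto [idxMin, idxMax] = gdal::minmax_element(
            v.data(), v.size(), GDT_Float32, /*bHasNoData=*/false,
            /*dfNoDataValue=*/0.0);
        std::printf("min %g at index %u, max %g at index %u\n", v[idxMin],
                    static_cast<unsigned>(idxMin), v[idxMax],
                    static_cast<unsigned>(idxMax));
        return 0;
    }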
+
+/************************************************************************/
+/*                  GDALComputeRasterMinMaxLocation()                   */
+/************************************************************************/
+
+/**
+ * \brief Compute the min/max values for a band, and their location.
+ *
+ * @see GDALRasterBand::ComputeRasterMinMaxLocation()
+ * @since GDAL 3.11
+ */
+
+CPLErr GDALComputeRasterMinMaxLocation(GDALRasterBandH hBand, double *pdfMin,
+                                       double *pdfMax, int *pnMinX, int *pnMinY,
+                                       int *pnMaxX, int *pnMaxY)
+
+{
+    VALIDATE_POINTER1(hBand, "GDALComputeRasterMinMaxLocation", CE_Failure);
+
+    GDALRasterBand *poBand = GDALRasterBand::FromHandle(hBand);
+    return poBand->ComputeRasterMinMaxLocation(pdfMin, pdfMax, pnMinX, pnMinY,
+                                               pnMaxX, pnMaxY);
+}
+
 /************************************************************************/
 /*                        SetDefaultHistogram()                         */
 /************************************************************************/
diff --git a/gcore/gdalsse_priv.h b/gcore/gdalsse_priv.h
index 3c7ec7ba8cdd..ade33367ee55 100644
--- a/gcore/gdalsse_priv.h
+++ b/gcore/gdalsse_priv.h
@@ -23,13 +23,18 @@
 #if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && \
     !defined(USE_SSE2_EMULATION)
 
+#include <string.h>
+
+#ifdef USE_NEON_OPTIMIZATIONS
+#include "include_sse2neon.h"
+#else
 /* Requires SSE2 */
 #include <emmintrin.h>
-#include <string.h>
 
 #ifdef __SSE4_1__
 #include <smmintrin.h>
 #endif
+#endif
 
 #include "gdal_priv_templates.hpp"
diff --git a/gcore/include_sse2neon.h b/gcore/include_sse2neon.h
new file mode 100644
index 000000000000..fd1dbb927cb8
--- /dev/null
+++ b/gcore/include_sse2neon.h
@@ -0,0 +1,32 @@
+/******************************************************************************
+ *
+ * Project:  GDAL
+ * Purpose:  Includes sse2neon.h headers
+ * Author:   Even Rouault <even dot rouault at spatialys.com>
+ *
+ ******************************************************************************
+ * Copyright (c) 2024, Even Rouault <even dot rouault at spatialys.com>
+ *
+ * SPDX-License-Identifier: MIT
+ *****************************************************************************/
+
+#ifndef INCLUDE_SSE2NEON_H
+#define INCLUDE_SSE2NEON_H
+
+#if defined(__GNUC__)
+#pragma GCC system_header
+#endif
+
+// This check is done in sse2neon.h just as a warning. Turn that into an
+// error, so that gdal.cmake doesn't try to use it.
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10
+#error "sse2neon.h: GCC versions earlier than 10 are not supported."
+#endif
+
+#include "sse2neon.h"
+
+#ifndef _MM_SHUFFLE2
+#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0))
+#endif
+
+#endif /* INCLUDE_SSE2NEON_H */
diff --git a/gcore/overview.cpp b/gcore/overview.cpp
index 5867ac11b04f..846c89a91e4e 100644
--- a/gcore/overview.cpp
+++ b/gcore/overview.cpp
@@ -36,9 +36,15 @@
 #include "gdal_thread_pool.h"
 #include "gdalwarper.h"
 
+#ifdef USE_NEON_OPTIMIZATIONS
+#include "include_sse2neon.h"
+#define USE_SSE2
+
+#include "gdalsse_priv.h"
+
 // Restrict to 64bit processors because they are guaranteed to have SSE2,
 // or if __AVX2__ is defined.
-#if defined(__x86_64) || defined(_M_X64) || defined(__AVX2__) +#elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__) #define USE_SSE2 #include "gdalsse_priv.h" @@ -335,7 +341,7 @@ inline GUInt16 ComputeIntegerRMS_4values(double sumSquares) /* QuadraticMeanByteSSE2OrAVX2() */ /************************************************************************/ -#ifdef __SSE4_1__ +#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS) #define sse2_packus_epi32 _mm_packus_epi32 #else inline __m128i sse2_packus_epi32(__m128i a, __m128i b) @@ -350,7 +356,7 @@ inline __m128i sse2_packus_epi32(__m128i a, __m128i b) } #endif -#ifdef __SSSE3__ +#if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS) #define sse2_hadd_epi16 _mm_hadd_epi16 #else inline __m128i sse2_hadd_epi16(__m128i a, __m128i b) diff --git a/gcore/rasterio.cpp b/gcore/rasterio.cpp index 880e851204b9..c46df08b9b88 100644 --- a/gcore/rasterio.cpp +++ b/gcore/rasterio.cpp @@ -41,6 +41,18 @@ #include "memdataset.h" #include "vrtdataset.h" +#if defined(__x86_64) || defined(_M_X64) +#include +#define HAVE_SSE2 +#elif defined(USE_NEON_OPTIMIZATIONS) +#include "include_sse2neon.h" +#define HAVE_SSE2 +#endif + +#ifdef HAVE_SSSE3_AT_COMPILE_TIME +#include "rasterio_ssse3.h" +#endif + static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData, int nSrcPixelStride, GByte *CPL_RESTRICT pDstData, int nDstPixelStride, GPtrDiff_t nWordCount); @@ -2217,9 +2229,7 @@ static void inline GDALCopyWordsT_8atatime( } } -#if defined(__x86_64) || defined(_M_X64) - -#include +#ifdef HAVE_SSE2 template void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData, @@ -2630,7 +2640,7 @@ void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData, nDstPixelStride, nWordCount); } -#endif // defined(__x86_64) || defined(_M_X64) +#endif // HAVE_SSE2 template <> void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData, @@ -3068,13 +3078,7 @@ static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest, GDALUnrolledCopyGeneric(pDest, pSrc, nIters); } -#if (defined(__x86_64) || defined(_M_X64)) - -#ifdef HAVE_SSSE3_AT_COMPILE_TIME - -#include "rasterio_ssse3.h" - -#endif +#ifdef HAVE_SSE2 template <> void GDALUnrolledCopy(GByte *CPL_RESTRICT pDest, @@ -3175,7 +3179,7 @@ void GDALUnrolledCopy(GByte *CPL_RESTRICT pDest, pSrc += 4; } } -#endif // defined(__x86_64) || defined(_M_X64) +#endif // HAVE_SSE2 /************************************************************************/ /* GDALFastCopy() */ @@ -5299,13 +5303,7 @@ bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue, return false; } -#if defined(__x86_64) || defined(_M_X64) - -#include - -#ifdef HAVE_SSSE3_AT_COMPILE_TIME -#include "rasterio_ssse3.h" -#endif +#ifdef HAVE_SSE2 /************************************************************************/ /* GDALDeinterleave3Byte() */ @@ -5319,6 +5317,12 @@ GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0, GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, size_t nIters) +#ifdef USE_NEON_OPTIMIZATIONS +{ + return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2, + nIters); +} +#else { #ifdef HAVE_SSSE3_AT_COMPILE_TIME if (CPLHaveRuntimeSSSE3()) @@ -5366,6 +5370,7 @@ GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc, pabyDest2[i] = pabySrc[3 * i + 2]; } } +#endif /************************************************************************/ /* GDALDeinterleave4Byte() */ @@ -5421,6 +5426,12 @@ static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc, 
GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, GByte *CPL_RESTRICT pabyDest3, size_t nIters) +#ifdef USE_NEON_OPTIMIZATIONS +{ + return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2, + pabyDest3, nIters); +} +#else { #ifdef HAVE_SSSE3_AT_COMPILE_TIME if (CPLHaveRuntimeSSSE3()) @@ -5469,6 +5480,7 @@ static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc, pabyDest3[i] = pabySrc[4 * i + 3]; } } +#endif #else // GCC autovectorizer does an excellent job __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte( @@ -5596,8 +5608,7 @@ void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT, } #if ((defined(__GNUC__) && !defined(__clang__)) || \ defined(__INTEL_CLANG_COMPILER)) && \ - (defined(__x86_64) || defined(_M_X64)) && \ - defined(HAVE_SSSE3_AT_COMPILE_TIME) + defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME) else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) && CPLHaveRuntimeSSSE3()) { diff --git a/gcore/rasterio_ssse3.cpp b/gcore/rasterio_ssse3.cpp index 7b770f6030b5..fa9cd6ab24e4 100644 --- a/gcore/rasterio_ssse3.cpp +++ b/gcore/rasterio_ssse3.cpp @@ -12,12 +12,18 @@ #include "cpl_port.h" -#if defined(HAVE_SSSE3_AT_COMPILE_TIME) && \ - (defined(__x86_64) || defined(_M_X64)) +#if (defined(HAVE_SSSE3_AT_COMPILE_TIME) && \ + (defined(__x86_64) || defined(_M_X64))) || \ + defined(USE_NEON_OPTIMIZATIONS) #include "rasterio_ssse3.h" +#ifdef USE_NEON_OPTIMIZATIONS +#include "include_sse2neon.h" +#else #include +#endif + #include "gdal_priv_templates.hpp" void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest, diff --git a/gcore/rasterio_ssse3.h b/gcore/rasterio_ssse3.h index 57f5d556fd10..ac20e45c6c57 100644 --- a/gcore/rasterio_ssse3.h +++ b/gcore/rasterio_ssse3.h @@ -16,7 +16,7 @@ #include "cpl_port.h" #if defined(HAVE_SSSE3_AT_COMPILE_TIME) && \ - (defined(__x86_64) || defined(_M_X64)) + (defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS)) void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest, const GByte *CPL_RESTRICT pSrc, diff --git a/gcore/sse2neon.h b/gcore/sse2neon.h new file mode 100644 index 000000000000..7754a2dc574b --- /dev/null +++ b/gcore/sse2neon.h @@ -0,0 +1,9402 @@ +#ifndef SSE2NEON_H +#define SSE2NEON_H + +/* + * sse2neon is freely redistributable under the MIT License. + * + * Copyright (c) 2015-2024 SSE2NEON Contributors. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +// This header file provides a simple API translation layer +// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions +// +// Contributors to this work are: +// John W. Ratcliff +// Brandon Rowlett +// Ken Fast +// Eric van Beurden +// Alexander Potylitsin +// Hasindu Gamaarachchi +// Jim Huang +// Mark Cheng +// Malcolm James MacLeod +// Devin Hussey (easyaspi314) +// Sebastian Pop +// Developer Ecosystem Engineering +// Danila Kutenin +// François Turban (JishinMaster) +// Pei-Hsuan Hung +// Yang-Hao Yuan +// Syoyo Fujita +// Brecht Van Lommel +// Jonathan Hue +// Cuda Chen +// Aymen Qader +// Anthony Roberts + +/* Tunable configurations */ + +/* Enable precise implementation of math operations + * This would slow down the computation a bit, but gives consistent result with + * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result) + */ +/* _mm_min|max_ps|ss|pd|sd */ +#ifndef SSE2NEON_PRECISE_MINMAX +#define SSE2NEON_PRECISE_MINMAX (0) +#endif +/* _mm_rcp_ps */ +#ifndef SSE2NEON_PRECISE_DIV +#define SSE2NEON_PRECISE_DIV (0) +#endif +/* _mm_sqrt_ps and _mm_rsqrt_ps */ +#ifndef SSE2NEON_PRECISE_SQRT +#define SSE2NEON_PRECISE_SQRT (0) +#endif +/* _mm_dp_pd */ +#ifndef SSE2NEON_PRECISE_DP +#define SSE2NEON_PRECISE_DP (0) +#endif + +/* Enable inclusion of windows.h on MSVC platforms + * This makes _mm_clflush functional on windows, as there is no builtin. + */ +#ifndef SSE2NEON_INCLUDE_WINDOWS_H +#define SSE2NEON_INCLUDE_WINDOWS_H (0) +#endif + +/* compiler specific definitions */ +#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#define _sse2neon_likely(x) __builtin_expect(!!(x), 1) +#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0) +#elif defined(_MSC_VER) +#if _MSVC_TRADITIONAL +#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead. +#endif +#ifndef FORCE_INLINE +#define FORCE_INLINE static inline +#endif +#ifndef ALIGN_STRUCT +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif +#define _sse2neon_likely(x) (x) +#define _sse2neon_unlikely(x) (x) +#else +#pragma message("Macro name collisions may happen with unsupported compilers.") +#endif + +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10 +#warning "GCC versions earlier than 10 are not supported." +#endif + +// Disabled by GDAL to avoid issues with -Werror +#if 0 +#ifdef __OPTIMIZE__ +#warning \ + "Report any potential compiler optimization issues when using SSE2NEON. See the 'Optimization' section at https://github.com/DLTcollab/sse2neon." +#endif +#endif + +/* C language does not allow initializing a variable with a function call. */ +#ifdef __cplusplus +#define _sse2neon_const static const +#else +#define _sse2neon_const const +#endif + +#include +#include +#include +#include + +FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t u64) +{ + double f64; + memcpy(&f64, &u64, sizeof(uint64_t)); + return f64; +} +FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64) +{ + int64_t i64; + memcpy(&i64, &f64, sizeof(uint64_t)); + return i64; +} + +#if defined(_WIN32) +/* Definitions for _mm_{malloc,free} are provided by + * from both MinGW-w64 and MSVC. 
+ */ +#define SSE2NEON_ALLOC_DEFINED +#endif + +/* If using MSVC */ +#ifdef _MSC_VER +#include +#if SSE2NEON_INCLUDE_WINDOWS_H +#include +#include +#endif + +#if !defined(__cplusplus) +#error SSE2NEON only supports C++ compilation with this compiler +#endif + +#ifdef SSE2NEON_ALLOC_DEFINED +#include +#endif + +#if (defined(_M_AMD64) || defined(__x86_64__)) || \ + (defined(_M_ARM64) || defined(__arm64__)) +#define SSE2NEON_HAS_BITSCAN64 +#endif +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define _sse2neon_define0(type, s, body) \ + __extension__({ \ + type _a = (s); \ + body \ + }) +#define _sse2neon_define1(type, s, body) \ + __extension__({ \ + type _a = (s); \ + body \ + }) +#define _sse2neon_define2(type, a, b, body) \ + __extension__({ \ + type _a = (a), _b = (b); \ + body \ + }) +#define _sse2neon_return(ret) (ret) +#else +#define _sse2neon_define0(type, a, body) [=](type _a) { body }(a) +#define _sse2neon_define1(type, a, body) [](type _a) { body }(a) +#define _sse2neon_define2(type, a, b, body) \ + [](type _a, type _b) { body }((a), (b)) +#define _sse2neon_return(ret) return ret +#endif + +#define _sse2neon_init(...) \ + { \ + __VA_ARGS__ \ + } + +/* Compiler barrier */ +#if defined(_MSC_VER) && !defined(__clang__) +#define SSE2NEON_BARRIER() _ReadWriteBarrier() +#else +#define SSE2NEON_BARRIER() \ + do { \ + __asm__ __volatile__("" ::: "memory"); \ + (void) 0; \ + } while (0) +#endif + +/* Memory barriers + * __atomic_thread_fence does not include a compiler barrier; instead, + * the barrier is part of __atomic_load/__atomic_store's "volatile-like" + * semantics. + */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) +#include +#endif + +FORCE_INLINE void _sse2neon_smp_mb(void) +{ + SSE2NEON_BARRIER(); +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ + !defined(__STDC_NO_ATOMICS__) + atomic_thread_fence(memory_order_seq_cst); +#elif defined(__GNUC__) || defined(__clang__) + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#else /* MSVC */ + __dmb(_ARM64_BARRIER_ISH); +#endif +} + +/* Architecture-specific build options */ +/* FIXME: #pragma GCC push_options is only available on GCC */ +#if defined(__GNUC__) +#if defined(__arm__) && __ARM_ARCH == 7 +/* According to ARM C Language Extensions Architecture specification, + * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) + * architecture supported. + */ +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." +#endif +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("fpu=neon") +#endif +#elif defined(__aarch64__) || defined(_M_ARM64) +#if !defined(__clang__) && !defined(_MSC_VER) +#pragma GCC push_options +#pragma GCC target("+simd") +#endif +#elif __ARM_ARCH == 8 +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error \ + "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON." +#endif +#if !defined(__clang__) && !defined(_MSC_VER) +#pragma GCC push_options +#endif +#else +#error \ + "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A \ +(you could try setting target explicitly with -march or -mcpu)" +#endif +#endif + +#include +#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8) +#if defined __has_include && __has_include() +#include +#endif +#endif + +/* Apple Silicon cache lines are double of what is commonly used by Intel, AMD + * and other Arm microarchitectures use. 
+ * From sysctl -a on Apple M1:
+ * hw.cachelinesize: 128
+ */
+#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
+#define SSE2NEON_CACHELINE_SIZE 128
+#else
+#define SSE2NEON_CACHELINE_SIZE 64
+#endif
+
+/* Rounding functions require either Aarch64 instructions or libm fallback */
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+#include <math.h>
+#endif
+
+/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
+ * or even not accessible in user mode.
+ * To write to or access these registers in user mode, we have to perform a
+ * syscall instead.
+ */
+#if (!defined(__aarch64__) && !defined(_M_ARM64))
+#include <sys/time.h>
+#endif
+
+/* "__has_builtin" can be used to query support for built-in functions
+ * provided by gcc/clang and other compilers that support it.
+ */
+#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
+/* Compatibility with gcc <= 9 */
+#if defined(__GNUC__) && (__GNUC__ <= 9)
+#define __has_builtin(x) HAS##x
+#define HAS__builtin_popcount 1
+#define HAS__builtin_popcountll 1
+
+// __builtin_shuffle introduced in GCC 4.7.0
+#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
+#define HAS__builtin_shuffle 1
+#else
+#define HAS__builtin_shuffle 0
+#endif
+
+#define HAS__builtin_shufflevector 0
+#define HAS__builtin_nontemporal_store 0
+#else
+#define __has_builtin(x) 0
+#endif
+#endif
+
+/**
+ * MACRO for the shuffle parameter of _mm_shuffle_ps().
+ * Argument fp3 is a digit[0123] that represents the fp from argument "b"
+ * of mm_shuffle_ps that will be placed in fp3 of the result. fp2 is the same
+ * for fp2 in the result. fp1 is a digit[0123] that represents the fp from
+ * argument "a" of mm_shuffle_ps that will be placed in fp1 of the result.
+ * fp0 is the same for fp0 of the result.
+ */
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+#if __has_builtin(__builtin_shufflevector)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __builtin_shufflevector(a, b, __VA_ARGS__)
+#elif __has_builtin(__builtin_shuffle)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __extension__({ \
+        type tmp = {__VA_ARGS__}; \
+        __builtin_shuffle(a, b, tmp); \
+    })
+#endif
+
+#ifdef _sse2neon_shuffle
+#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
+#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
+#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
+#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
+#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
+#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
+#endif
+
+/* Rounding mode macros.
*/ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 +#define _MM_FROUND_NO_EXC 0x08 +#define _MM_FROUND_RAISE_EXC 0x00 +#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) +#define _MM_ROUND_NEAREST 0x0000 +#define _MM_ROUND_DOWN 0x2000 +#define _MM_ROUND_UP 0x4000 +#define _MM_ROUND_TOWARD_ZERO 0x6000 +/* Flush zero mode macros. */ +#define _MM_FLUSH_ZERO_MASK 0x8000 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_FLUSH_ZERO_OFF 0x0000 +/* Denormals are zeros mode macros. */ +#define _MM_DENORMALS_ZERO_MASK 0x0040 +#define _MM_DENORMALS_ZERO_ON 0x0040 +#define _MM_DENORMALS_ZERO_OFF 0x0000 + +/* indicate immediate constant argument in a given range */ +#define __constrange(a, b) const + +/* A few intrinsics accept traditional data types like ints or floats, but + * most operate on data types that are specific to SSE. + * If a vector type ends in d, it contains doubles, and if it does not have + * a suffix, it contains floats. An integer vector type can contain any type + * of integer, from chars to shorts to unsigned long longs. + */ +typedef int64x1_t __m64; +typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ +// On ARM 32-bit architecture, the float64x2_t is not supported. +// The data type __m128d should be represented in a different way for related +// intrinsic conversion. +#if defined(__aarch64__) || defined(_M_ARM64) +typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ +#else +typedef float32x4_t __m128d; +#endif +typedef int64x2_t __m128i; /* 128-bit vector containing integers */ + +// Some intrinsics operate on unaligned data types. 
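+// As an illustrative sketch (editor's example, not part of the upstream
+// header), the unaligned_int32_t typedef defined just below lets an
+// unaligned load be written as a plain dereference on GCC/clang:
+//   int32_t load_unaligned_s32(const void *p)
+//   {
+//       return *(const unaligned_int32_t *) p; /* no 4-byte alignment assumed */
+//   }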
+typedef int16_t ALIGN_STRUCT(1) unaligned_int16_t; +typedef int32_t ALIGN_STRUCT(1) unaligned_int32_t; +typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t; + +// __int64 is defined in the Intrinsics Guide which maps to different datatype +// in different data model +#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) +#if (defined(__x86_64__) || defined(__i386__)) +#define __int64 long long +#else +#define __int64 int64_t +#endif +#endif + +/* type-safe casting between types */ + +#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) +#define vreinterpretq_m128_f32(x) (x) +#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) + +#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) +#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) +#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) +#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) +#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) +#define vreinterpretq_f32_m128(x) (x) +#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) + +#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) +#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) +#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) +#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) +#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) +#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) +#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) +#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) +#define vreinterpretq_m128i_s64(x) (x) + +#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) +#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) +#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) +#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) + +#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) +#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) +#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) +#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) +#define vreinterpretq_s64_m128i(x) (x) + +#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) +#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) +#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) +#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) + +#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) +#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) +#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) +#define vreinterpret_m64_s64(x) (x) + +#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) +#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) +#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) +#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) + +#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) +#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) +#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) + +#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) +#define vreinterpret_u16_m64(x) 
vreinterpret_u16_s64(x) +#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) +#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) + +#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) +#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) +#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) +#define vreinterpret_s64_m64(x) (x) + +#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) + +#if defined(__aarch64__) || defined(_M_ARM64) +#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) + +#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) +#define vreinterpretq_m128d_f64(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) + +#define vreinterpretq_f64_m128d(x) (x) +#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) +#else +#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128d_f32(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_f32_m128d(x) (x) +#endif + +// A struct is defined in this header file called 'SIMDVec' which can be used +// by applications which attempt to access the contents of an __m128 struct +// directly. It is important to note that accessing the __m128 struct directly +// is bad coding practice by Microsoft: @see: +// https://learn.microsoft.com/en-us/cpp/cpp/m128 +// +// However, some legacy source code may try to access the contents of an __m128 +// struct directly so the developer can use the SIMDVec as an alias for it. Any +// casting must be done manually by the developer, as you cannot cast or +// otherwise alias the base NEON data type for intrinsic operations. +// +// union intended to allow direct access to an __m128 variable using the names +// that the MSVC compiler provides. This union should really only be used when +// trying to access the members of the vector as integer values. GCC/clang +// allow native access to the float members through a simple array access +// operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause +// a performance hit. If it really is needed however, the original __m128 +// variable can be aliased with a pointer to this union and used to access +// individual components. The use of this union should be hidden behind a macro +// that is used throughout the codebase to access the members instead of always +// declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. 
+} SIMDVec; + +// casting using SIMDVec +#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) +#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) +#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) + +/* SSE macros */ +#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode +#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode +#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode +#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode + +// Function declaration +// SSE +FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void); +FORCE_INLINE __m128 _mm_move_ss(__m128, __m128); +FORCE_INLINE __m128 _mm_or_ps(__m128, __m128); +FORCE_INLINE __m128 _mm_set_ps1(float); +FORCE_INLINE __m128 _mm_setzero_ps(void); +// SSE2 +FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i); +FORCE_INLINE __m128i _mm_castps_si128(__m128); +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i); +FORCE_INLINE __m128i _mm_cvtps_epi32(__m128); +FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d); +FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i); +FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int); +FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t); +FORCE_INLINE __m128d _mm_set_pd(double, double); +FORCE_INLINE __m128i _mm_set1_epi32(int); +FORCE_INLINE __m128i _mm_setzero_si128(void); +// SSE4.1 +FORCE_INLINE __m128d _mm_ceil_pd(__m128d); +FORCE_INLINE __m128 _mm_ceil_ps(__m128); +FORCE_INLINE __m128d _mm_floor_pd(__m128d); +FORCE_INLINE __m128 _mm_floor_ps(__m128); +FORCE_INLINE __m128d _mm_round_pd(__m128d, int); +FORCE_INLINE __m128 _mm_round_ps(__m128, int); +// SSE4.2 +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t); + +/* Backwards compatibility for compilers with lack of specific type support */ + +// Older gcc does not define vld1q_u8_x4 type +#if defined(__GNUC__) && !defined(__clang__) && \ + ((__GNUC__ <= 13 && defined(__arm__)) || \ + (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \ + (__GNUC__ <= 9 && defined(__aarch64__))) +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + uint8x16x4_t ret; + ret.val[0] = vld1q_u8(p + 0); + ret.val[1] = vld1q_u8(p + 16); + ret.val[2] = vld1q_u8(p + 32); + ret.val[3] = vld1q_u8(p + 48); + return ret; +} +#else +// Wraps vld1q_u8_x4 +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + return vld1q_u8_x4(p); +} +#endif + +#if !defined(__aarch64__) && !defined(_M_ARM64) +/* emulate vaddv u8 variant */ +FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8) +{ + const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8))); + return vget_lane_u8(vreinterpret_u8_u64(v1), 0); +} +#else +// Wraps vaddv_u8 +FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8) +{ + return vaddv_u8(v8); +} +#endif + +#if !defined(__aarch64__) && !defined(_M_ARM64) +/* emulate vaddvq u8 variant */ +FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a) +{ + uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a)); + uint8_t res = 0; + for (int i = 0; i < 8; ++i) + res += tmp[i]; + return res; +} +#else +// Wraps vaddvq_u8 +FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a) +{ + return vaddvq_u8(a); +} +#endif + +#if !defined(__aarch64__) && !defined(_M_ARM64) +/* emulate vaddvq u16 variant */ +FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a) +{ + uint32x4_t m = vpaddlq_u16(a); + uint64x2_t n = vpaddlq_u32(m); + uint64x1_t o = vget_low_u64(n) + vget_high_u64(n); + + return 
vget_lane_u32((uint32x2_t) o, 0);
+}
+#else
+// Wraps vaddvq_u16
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    return vaddvq_u16(a);
+}
+#endif
+
+/* Function Naming Conventions
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
+ * intrinsic function is given as follows:
+ *   _mm_<name>_<type>
+ *
+ * The parts of this format are given as follows:
+ * 1. <name> describes the operation performed by the intrinsic
+ * 2. <type> identifies the data type of the function's primary arguments
+ *
+ * This last part, <type>, is a little complicated. It identifies the
+ * content of the input values, and can be set to any of the following values:
+ * + ps - vectors contain floats (ps stands for packed single-precision)
+ * + pd - vectors contain doubles (pd stands for packed double-precision)
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   signed integers
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   unsigned integers
+ * + si128 - unspecified 128-bit integer vector
+ * + m128/m128i/m128d - identifies input vector types when they are different
+ *   than the type of the returned vector
+ *
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
+ * contain floats.
+ *
+ * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
+ *   // Set packed 16-bit integers. 128 bits, 8 shorts, 16 bits each
+ *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ *   // Set packed 8-bit integers
+ *   // 128 bits, 16 chars, 8 bits each
+ *   __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
+ *                                  4, 5, 12, 13, 6, 7, 14, 15);
+ *   // Shuffle packed 8-bit integers
+ *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
+ */
+
+/* Constants for use with _mm_prefetch. */
+enum _mm_hint {
+    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
+    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
+    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
+    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
+};
+
+// The bit field mapping to the FPCR (floating-point control register)
+typedef struct {
+    uint16_t res0;
+    uint8_t res1 : 6;
+    uint8_t bit22 : 1;
+    uint8_t bit23 : 1;
+    uint8_t bit24 : 1;
+    uint8_t res2 : 7;
+#if defined(__aarch64__) || defined(_M_ARM64)
+    uint32_t res3;
+#endif
+} fpcr_bitfield;
+
+// Takes the upper 64 bits of a and places them in the low end of the result;
+// takes the lower 64 bits of b and places them in the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
+}
+
+// Takes the lower two 32-bit values from a, swaps them, and places them in
+// the low end of the result; takes the higher two 32-bit values from b,
+// swaps them, and places them in the high end of the result.
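+// (Editor's note: each _mm_shuffle_ps_WXYZ helper in this file hard-codes one
+// shuffle immediate; _mm_shuffle_ps_2301, for instance, computes the result of
+// _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 3, 0, 1)).)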
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) +{ + float32x2_t a21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) +{ + float32x2_t a03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); +} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the +// high +FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +{ + float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +{ + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +{ + float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +{ + float32x2_t a33 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); + 
return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); +} + +// For MSVC, we check only if it is ARM64, as every single ARM64 processor +// supported by WoA has crypto extensions. If this changes in the future, +// this can be verified via the runtime-only method of: +// IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) +#if (defined(_M_ARM64) && !defined(__clang__)) || \ + (defined(__ARM_FEATURE_CRYPTO) && \ + (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))) +// Wraps vmull_p64 +FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); + poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); +#if defined(_MSC_VER) && !defined(__clang__) + __n64 a1 = {a}, b1 = {b}; + return vreinterpretq_u64_p128(vmull_p64(a1, b1)); +#else + return vreinterpretq_u64_p128(vmull_p64(a, b)); +#endif +} +#else // ARMv7 polyfill +// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. +// +// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a +// 64-bit->128-bit polynomial multiply. +// +// It needs some work and is somewhat slow, but it is still faster than all +// known scalar methods. 
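+//
+// (Editor's summary: the polyfill below assembles the 64x64 -> 128-bit
+// carry-less product from eight 8x8 vmull_p8 partial products, which are then
+// aligned with vext shifts and folded together with XORs.)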
+// +// Algorithm adapted to C from +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted +// from "Fast Software Polynomial Multiplication on ARM Processors Using the +// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab +// (https://hal.inria.fr/hal-01506572) +static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly8x8_t a = vreinterpret_p8_u64(_a); + poly8x8_t b = vreinterpret_p8_u64(_b); + + // Masks + uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), + vcreate_u8(0x00000000ffffffff)); + uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), + vcreate_u8(0x0000000000000000)); + + // Do the multiplies, rotating with vext to get all combinations + uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 + uint8x16_t e = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 + uint8x16_t f = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 + uint8x16_t g = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 + uint8x16_t h = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 + uint8x16_t i = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 + uint8x16_t j = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 + uint8x16_t k = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 + + // Add cross products + uint8x16_t l = veorq_u8(e, f); // L = E + F + uint8x16_t m = veorq_u8(g, h); // M = G + H + uint8x16_t n = veorq_u8(i, j); // N = I + J + + // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL + // instructions. +#if defined(__aarch64__) + uint8x16_t lm_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t lm_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t nk_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); + uint8x16_t nk_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); +#else + uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); + uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); + uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); + uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); +#endif + // t0 = (L) (P0 + P1) << 8 + // t1 = (M) (P2 + P3) << 16 + uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); + uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); + uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); + + // t2 = (N) (P4 + P5) << 24 + // t3 = (K) (P6 + P7) << 32 + uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); + uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); + uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); + + // De-interleave +#if defined(__aarch64__) + uint8x16_t t0 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t1 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t2 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); + uint8x16_t t3 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); +#else + uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); + uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); + uint8x16_t t3 = 
vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); + uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); +#endif + // Shift the cross products + uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8 + uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 + uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 + uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 + + // Accumulate the products + uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); + uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); + uint8x16_t mix = veorq_u8(d, cross1); + uint8x16_t r = veorq_u8(mix, cross2); + return vreinterpretq_u64_u8(r); +} +#endif // ARMv7 polyfill + +// C equivalent: +// __m128i _mm_shuffle_epi32_default(__m128i a, +// __constrange(0, 255) int imm) { +// __m128i ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; +// return ret; +// } +#define _mm_shuffle_epi32_default(a, imm) \ + vreinterpretq_m128i_s32(vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ + vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ + vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), \ + ((imm) >> 2) & 0x3), \ + vmovq_n_s32(vgetq_lane_s32( \ + vreinterpretq_s32_m128i(a), (imm) & (0x3))), \ + 1), \ + 2), \ + 3)) + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of a and places it into the high end of the result. +FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in low end +// of result takes the higher two 32 bit values from a and swaps them and places +// in high end of result. 
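+// (Editor's illustration: with lanes a = {0, 1, 2, 3}, the helper below
+// returns {1, 0, 3, 2}, the same selection made by
+// _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1)).)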
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); +} + +// rotates the least significant 32 bits into the most significant 32 bits, and +// shifts the rest down +FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); +} + +// rotates the most significant 32 bits into the least significant 32 bits, and +// shifts the rest up +FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); +} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of a and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) +{ + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the +// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the +// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and +// places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) +{ + int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) +{ + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); +} + +#if defined(__aarch64__) || defined(_M_ARM64) +#define _mm_shuffle_epi32_splat(a, imm) \ + vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))) +#else +#define _mm_shuffle_epi32_splat(a, imm) \ + vreinterpretq_m128i_s32( \ + vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))) +#endif + +// NEON does not support a general purpose permute intrinsic. +// Shuffle single-precision (32-bit) floating-point elements in a using the +// control in imm8, and store the results in dst. 
+//
+// C equivalent:
+//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
+//                                 __constrange(0, 255) int imm) {
+//       __m128 ret;
+//       ret[0] = a[imm & 0x3];         ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+//
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
+#define _mm_shuffle_ps_default(a, b, imm) \
+    vreinterpretq_m128_f32(vsetq_lane_f32( \
+        vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
+        vsetq_lane_f32( \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
+            vsetq_lane_f32( \
+                vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
+                vmovq_n_f32( \
+                    vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \
+                1), \
+            2), \
+        3))
+
+// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
+// Store the results in the low 64 bits of dst, with the high 64 bits being
+// copied from a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
+#define _mm_shufflelo_epi16_function(a, imm) \
+    _sse2neon_define1( \
+        __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \
+        int16x4_t lowBits = vget_low_s16(ret); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
+                             1); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
+                             2); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
+                             3); \
+        _sse2neon_return(vreinterpretq_m128i_s16(ret));)
+
+// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
+// Store the results in the high 64 bits of dst, with the low 64 bits being
+// copied from a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
+#define _mm_shufflehi_epi16_function(a, imm) \
+    _sse2neon_define1( \
+        __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \
+        int16x4_t highBits = vget_high_s16(ret); \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
+                             5); \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
+                             6); \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
+                             7); \
+        _sse2neon_return(vreinterpretq_m128i_s16(ret));)
+
+/* MMX */
+
+// _mm_empty is a no-op on ARM.
+FORCE_INLINE void _mm_empty(void) {}
+
+/* SSE */
+
+// Add packed single-precision (32-bit) floating-point elements in a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
+FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Add the lower single-precision (32-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper 3 packed
+// elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
+{
+    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+    // the upper values in the result must be the remnants of <a>.
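+    // (Editor's example: _mm_add_ss({1, 2, 3, 4}, {10, 20, 30, 40}) yields
+    // {11, 2, 3, 4}; only lane 0 is summed.)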
+ return vreinterpretq_m128_f32(vaddq_f32(a, value)); +} + +// Compute the bitwise AND of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise NOT of packed single-precision (32-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vbicq_s32(vreinterpretq_s32_m128(b), + vreinterpretq_s32_m128(a))); // *NOTE* argument swap +} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16 +FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16( + vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8 +FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss +FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps +FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss +FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpge_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. 
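+// (Editor's note: as with the other packed comparisons in this file, each
+// 32-bit lane of dst is set to 0xFFFFFFFF when the comparison holds and to
+// 0x00000000 otherwise.)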
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps +FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss +FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than-or-equal, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss +FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmple_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps +FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss +FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmplt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss +FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. 
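+// (Editor's note: the "not-*" comparisons are bitwise inverses of their
+// positive counterparts, so unordered (NaN) lanes compare as true here,
+// matching Intel's unordered NGE predicate.)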
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps +FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss +FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnge_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps +FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss +FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpngt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss +FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnle_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss +FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnlt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. 
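+// (A common use, sketched by the editor: _mm_cmpord_ps(v, v) yields all-ones
+// lanes exactly where v is not NaN, which _mm_and_ps can then use to zero out
+// the NaN lanes of v.)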
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps +// +// See also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) +{ + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + uint32x4_t ceqaa = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t ceqbb = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss +FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpord_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps +FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) +{ + uint32x4_t f32a = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t f32b = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss +FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss +FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) +{ + uint32x4_t a_eq_b = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_eq_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss +FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) +{ + uint32x4_t a_ge_b = + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_ge_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss +FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_gt_b = + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_gt_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss +FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) +{ + uint32x4_t a_le_b = + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_le_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss +FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_lt_b = + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_lt_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss +FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) +{ + return !_mm_comieq_ss(a, b); +} + +// Convert packed signed 32-bit integers in b to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, and copy the upper 2 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps +FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi +FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return vreinterpret_m64_s32( + vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))))); +#else + return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION))))); +#endif +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss +FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. 
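+// (Editor's note: unlike the truncating _mm_cvtt_ss2si further below, this
+// conversion honors the current rounding mode, round-to-nearest-even by
+// default.)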
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si +FORCE_INLINE int _mm_cvt_ss2si(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))), + 0); +#else + float32_t data = vgetq_lane_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); + return (int32_t) data; +#endif +} + +// Convert packed 16-bit integers in a to packed single-precision (32-bit) +// floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps +FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); +} + +// Convert packed 32-bit integers in b to packed single-precision (32-bit) +// floating-point elements, store the results in the lower 2 elements of dst, +// and copy the upper 2 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps +FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, then convert the packed signed 32-bit integers in b to +// single-precision (32-bit) floating-point element, and store the results in +// the upper 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps +FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); +} + +// Convert the lower packed 8-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps +FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 16-bit integers, and store the results in dst. Note: this intrinsic +// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and +// 0x7FFFFFFF. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16 +FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) +{ + return vreinterpret_m64_s16( + vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a)))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32 +#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 8-bit integers, and store the results in lower 4 elements of dst. +// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values +// between 0x7F and 0x7FFFFFFF. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8 +FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a) +{ + return vreinterpret_m64_s8(vqmovn_s16( + vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0)))); +} + +// Convert packed unsigned 16-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps +FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); +} + +// Convert the lower packed unsigned 8-bit integers in a to packed +// single-precision (32-bit) floating-point elements, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps +FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_u32( + vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss +#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) + +// Convert the signed 64-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss +FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Copy the lower single-precision (32-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32 +FORCE_INLINE float _mm_cvtss_f32(__m128 a) +{ + return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32 +#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64 +FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0); +#else + float32_t data = vgetq_lane_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); + return (int64_t) data; +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. 
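+// (Editor's note: NEON's float-to-signed conversion vcvtq_s32_f32 already
+// truncates toward zero, so the truncating variants below map onto it
+// directly.)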
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
+FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
+{
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
+FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
+{
+    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
+#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
+#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
+FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
+{
+    return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Divide packed single-precision (32-bit) floating-point elements in a by
+// packed elements in b, and store the results in dst.
+// Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement
+// division by multiplying a by b's reciprocal before using the Newton-Raphson
+// method to approximate the results.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
+FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128_f32(
+        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+    // Additional Newton-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
+#endif
+}
+
+// Divide the lower single-precision (32-bit) floating-point element in a by the
+// lower single-precision (32-bit) floating-point element in b, store the result
+// in the lower element of dst, and copy the upper 3 packed elements from a to
+// the upper elements of dst.
+// Warning: on ARMv7-A this emulated division may not produce the same result
+// as Intel hardware and is not IEEE-compliant.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
+FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
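+//
+// Illustrative example (not from the upstream sse2neon sources, and assuming
+// the _mm_set1_pi16 helper provided elsewhere in this header): the u16
+// reinterpretation below reproduces Intel's zero-extension of the selected
+// lane into the 32-bit result:
+//
+//   int r = _mm_extract_pi16(_mm_set1_pi16(-1), 0);  // 0xFFFF, not -1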
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
+#define _mm_extract_pi16(a, imm) \
+    (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
+
+// Free aligned memory that was allocated with _mm_malloc.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
+#if !defined(SSE2NEON_ALLOC_DEFINED)
+FORCE_INLINE void _mm_free(void *addr)
+{
+    free(addr);
+}
+#endif
+
+FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
+{
+    uint64_t value;
+#if defined(_MSC_VER) && !defined(__clang__)
+    value = _ReadStatusReg(ARM64_FPCR);
+#else
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */
+#endif
+    return value;
+}
+
+FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
+{
+#if defined(_MSC_VER) && !defined(__clang__)
+    _WriteStatusReg(ARM64_FPCR, value);
+#else
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
+#endif
+}
+
+// Macro: Get the flush zero bits from the MXCSR control and status register.
+// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
+// _MM_FLUSH_ZERO_OFF
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
+FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__) || defined(_M_ARM64)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+    r.value = _sse2neon_get_fpcr();
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
+}
+
+// Macro: Get the rounding mode bits from the MXCSR control and status register.
+// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
+// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
+{
+    switch (fegetround()) {
+    case FE_TONEAREST:
+        return _MM_ROUND_NEAREST;
+    case FE_DOWNWARD:
+        return _MM_ROUND_DOWN;
+    case FE_UPWARD:
+        return _MM_ROUND_UP;
+    case FE_TOWARDZERO:
+        return _MM_ROUND_TOWARD_ZERO;
+    default:
+        // fegetround() returns FE_TONEAREST, FE_DOWNWARD, FE_UPWARD or
+        // FE_TOWARDZERO on success; treat any other (error) return value as
+        // FE_TOWARDZERO (truncate).
+        return _MM_ROUND_TOWARD_ZERO;
+    }
+}
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
+#define _mm_insert_pi16(a, b, imm) \
+    vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm)))
+
+// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
+FORCE_INLINE __m128 _mm_load_ps(const float *p)
+{
+    return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Load a single-precision (32-bit) floating-point element from memory into all
+// elements of dst.
+// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[63:32] := MEM[mem_addr+31:mem_addr] +// dst[95:64] := MEM[mem_addr+31:mem_addr] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1 +#define _mm_load_ps1 _mm_load1_ps + +// Load a single-precision (32-bit) floating-point element from memory into the +// lower of dst, and zero the upper 3 elements. mem_addr does not need to be +// aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss +FORCE_INLINE __m128 _mm_load_ss(const float *p) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps +FORCE_INLINE __m128 _mm_load1_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_dup_f32(p)); +} + +// Load 2 single-precision (32-bit) floating-point elements from memory into the +// upper 2 elements of dst, and copy the lower 2 elements from a to dst. +// mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi +FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); +} + +// Load 2 single-precision (32-bit) floating-point elements from memory into the +// lower 2 elements of dst, and copy the upper 2 elements from a to dst. +// mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi +FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); +} + +// Load 4 single-precision (32-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps +FORCE_INLINE __m128 _mm_loadr_ps(const float *p) +{ + float32x4_t v = vrev64q_f32(vld1q_f32(p)); + return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); +} + +// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from memory into dst. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps +FORCE_INLINE __m128 _mm_loadu_ps(const float *p) +{ + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are + // equivalent for neon + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load unaligned 16-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16 +FORCE_INLINE __m128i _mm_loadu_si16(const void *p) +{ + return vreinterpretq_m128i_s16( + vsetq_lane_s16(*(const unaligned_int16_t *) p, vdupq_n_s16(0), 0)); +} + +// Load unaligned 64-bit integer from memory into the first element of dst. 
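+//
+// Illustrative note (assumption: the unaligned_int64_t typedef earlier in
+// this header carries 1-byte alignment, as in upstream sse2neon, so the
+// dereference below is a well-defined unaligned access):
+//
+//   char buf[9] = {0};
+//   __m128i v = _mm_loadu_si64(buf + 1);  // odd address is fine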
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64 +FORCE_INLINE __m128i _mm_loadu_si64(const void *p) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(*(const unaligned_int64_t *) p, vdupq_n_s64(0), 0)); +} + +// Allocate size bytes of memory, aligned to the alignment specified in align, +// and return a pointer to the allocated memory. _mm_free should be used to free +// memory that is allocated with _mm_malloc. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc +#if !defined(SSE2NEON_ALLOC_DEFINED) +FORCE_INLINE void *_mm_malloc(size_t size, size_t align) +{ + void *ptr; + if (align == 1) + return malloc(size); + if (align == 2 || (sizeof(void *) == 8 && align == 4)) + align = sizeof(void *); + if (!posix_memalign(&ptr, align, size)) + return ptr; + return NULL; +} +#endif + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64 +FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) +{ + int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7); + __m128 b = _mm_load_ps((const float *) mem_addr); + int8x8_t masked = + vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), + vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b)))); + vst1_s8((int8_t *) mem_addr, masked); +} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq +#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16 +FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed maximum values in dst. dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or +// signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps +FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b)); +#else + return vreinterpretq_m128_f32( + vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8 +FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16 +FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed minimum values in dst. dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or +// signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b)); +#else + return vreinterpretq_m128_f32( + vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8 +FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Move the lower single-precision (32-bit) floating-point element from b to the +// lower element of dst, and copy the upper 3 packed elements from a to the +// upper elements of dst. 
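+//
+// Illustrative example (not from the upstream sse2neon sources): this is the
+// merge primitive that many *_ss wrappers in this file reduce to, e.g.
+// _mm_mul_ss(a, b) below is _mm_move_ss(a, _mm_mul_ps(a, b)):
+//
+//   __m128 a = _mm_setr_ps(1, 2, 3, 4);
+//   __m128 b = _mm_setr_ps(10, 0, 0, 0);
+//   __m128 r = _mm_move_ss(a, b);  // {10, 2, 3, 4}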
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
+FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
+                       vreinterpretq_f32_m128(a), 0));
+}
+
+// Move the upper 2 single-precision (32-bit) floating-point elements from b to
+// the lower 2 elements of dst, and copy the upper 2 elements from a to the
+// upper 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
+FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128_u64(
+        vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
+#else
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
+#endif
+}
+
+// Move the lower 2 single-precision (32-bit) floating-point elements from b to
+// the upper 2 elements of dst, and copy the lower 2 elements from a to the
+// lower 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
+FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
+FORCE_INLINE int _mm_movemask_pi8(__m64 a)
+{
+    uint8x8_t input = vreinterpret_u8_m64(a);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint8x8_t tmp = vshr_n_u8(input, 7);
+    return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
+#else
+    // Refer to the implementation of `_mm_movemask_epi8`
+    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
+    uint32x2_t paired16 =
+        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
+    uint8x8_t paired32 =
+        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
+    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
+#endif
+}
+
+// Set each bit of mask dst based on the most significant bit of the
+// corresponding packed single-precision (32-bit) floating-point element in a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
+{
+    uint32x4_t input = vreinterpretq_u32_m128(a);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    static const int32_t shift[4] = {0, 1, 2, 3};
+    uint32x4_t tmp = vshrq_n_u32(input, 31);
+    return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
+#else
+    // Uses the exact same method as _mm_movemask_epi8, see that for details.
+    // Shift out everything but the sign bits with a 32-bit unsigned shift
+    // right.
+    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
+    // Merge the two pairs together with a 64-bit unsigned shift right + add.
+    uint8x16_t paired =
+        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+    // Extract the result.
+    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
+#endif
+}
+
+// Multiply packed single-precision (32-bit) floating-point elements in a and b,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Multiply the lower single-precision (32-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss +FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_mul_ps(a, b)); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16 +FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16(vshrn_n_u32( + vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); +} + +// Compute the bitwise OR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb +#define _m_pavgb(a, b) _mm_avg_pu8(a, b) + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw +#define _m_pavgw(a, b) _mm_avg_pu16(a, b) + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw +#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw +#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw +#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub +#define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. 
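+//
+// Illustrative example (not from the upstream sse2neon sources, and assuming
+// the _mm_setr_pi8 helper provided elsewhere in this header): this aliases
+// _mm_movemask_pi8 above, whose ARMv7 path gathers the sign bits with paired
+// shift-accumulate (vsra) steps:
+//
+//   __m64 v = _mm_setr_pi8(-1, 0, -1, 0, 0, 0, 0, -1);
+//   int m = _mm_movemask_pi8(v);  // 0x85 (bits 0, 2 and 7 set)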
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
+#define _m_pmovmskb(a) _mm_movemask_pi8(a)
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
+#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
+
+// Fetch the line of data from memory that contains address p to a location in
+// the cache hierarchy specified by the locality hint i.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
+FORCE_INLINE void _mm_prefetch(char const *p, int i)
+{
+    (void) i;
+#if defined(_MSC_VER) && !defined(__clang__)
+    switch (i) {
+    case _MM_HINT_NTA:
+        __prefetch2(p, 1);
+        break;
+    case _MM_HINT_T0:
+        __prefetch2(p, 0);
+        break;
+    case _MM_HINT_T1:
+        __prefetch2(p, 2);
+        break;
+    case _MM_HINT_T2:
+        __prefetch2(p, 4);
+        break;
+    }
+#else
+    switch (i) {
+    case _MM_HINT_NTA:
+        __builtin_prefetch(p, 0, 0);
+        break;
+    case _MM_HINT_T0:
+        __builtin_prefetch(p, 0, 3);
+        break;
+    case _MM_HINT_T1:
+        __builtin_prefetch(p, 0, 2);
+        break;
+    case _MM_HINT_T2:
+        __builtin_prefetch(p, 0, 1);
+        break;
+    }
+#endif
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
+#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
+
+// Shuffle 16-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
+#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
+
+// Compute the approximate reciprocal of packed single-precision (32-bit)
+// floating-point elements in a, and store the results in dst. The maximum
+// relative error for this approximation is less than 1.5*2^-12.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
+FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
+{
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Newton-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#endif
+    return vreinterpretq_m128_f32(recip);
+}
+
+// Compute the approximate reciprocal of the lower single-precision (32-bit)
+// floating-point element in a, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
+FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
+{
+    return _mm_move_ss(a, _mm_rcp_ps(a));
+}
+
+// Compute the approximate reciprocal square root of packed single-precision
+// (32-bit) floating-point elements in a, and store the results in dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
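+//
+// Illustrative note (not from the upstream sse2neon sources): vrsqrteq_f32
+// alone yields only a rough (~8-bit) estimate; each vrsqrtsq_f32 factor below
+// applies one Newton-Raphson step, x' = x * (3 - d*x*x) / 2, roughly doubling
+// the number of correct bits, while the explicit +/-0.0f handling preserves
+// the SSE behaviour of returning +/-infinity for signed-zero inputs.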
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
+FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
+{
+    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+
+    // Generate masks for detecting whether input has any 0.0f/-0.0f
+    // (which becomes positive/negative infinity by IEEE-754 arithmetic rules).
+    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
+    const uint32x4_t neg_inf = vdupq_n_u32(0xFF800000);
+    const uint32x4_t has_pos_zero =
+        vceqq_u32(pos_inf, vreinterpretq_u32_f32(out));
+    const uint32x4_t has_neg_zero =
+        vceqq_u32(neg_inf, vreinterpretq_u32_f32(out));
+
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+#if SSE2NEON_PRECISE_SQRT
+    // Additional Newton-Raphson iteration for accuracy
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+#endif
+
+    // Set output vector element to infinity/negative-infinity if
+    // the corresponding input vector element is 0.0f/-0.0f.
+    out = vbslq_f32(has_pos_zero, (float32x4_t) pos_inf, out);
+    out = vbslq_f32(has_neg_zero, (float32x4_t) neg_inf, out);
+
+    return vreinterpretq_m128_f32(out);
+}
+
+// Compute the approximate reciprocal square root of the lower single-precision
+// (32-bit) floating-point element in a, store the result in the lower element
+// of dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
+FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
+{
+    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
+FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
+{
+    uint64x1_t t = vpaddl_u32(vpaddl_u16(
+        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
+    return vreinterpret_m64_u16(
+        vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
+}
+
+// Macro: Set the flush zero bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The flush zero may contain any of the
+// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
+FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
+{
+    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
+    // regardless of the value of the FZ bit.
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__) || defined(_M_ARM64)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+    r.value = _sse2neon_get_fpcr();
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+    _sse2neon_set_fpcr(r.value);
+#else
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
+#endif
+}
+
+// Set packed single-precision (32-bit) floating-point elements in dst with the
+// supplied values.
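+//
+// Illustrative example (not from the upstream sse2neon sources): arguments
+// are listed from the highest lane down, so the last argument lands in lane
+// 0, matching x86:
+//
+//   __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
+//   float lane0 = _mm_cvtss_f32(v);  // 1.0f
+//   // _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f) builds the same vector.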
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
+FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
+{
+    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Broadcast single-precision (32-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
+FORCE_INLINE __m128 _mm_set_ps1(float _w)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Macro: Set the rounding mode bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The rounding mode may contain any of
+// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
+// _MM_ROUND_TOWARD_ZERO
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
+FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
+{
+    switch (rounding) {
+    case _MM_ROUND_NEAREST:
+        rounding = FE_TONEAREST;
+        break;
+    case _MM_ROUND_DOWN:
+        rounding = FE_DOWNWARD;
+        break;
+    case _MM_ROUND_UP:
+        rounding = FE_UPWARD;
+        break;
+    case _MM_ROUND_TOWARD_ZERO:
+        rounding = FE_TOWARDZERO;
+        break;
+    default:
+        // rounding must be one of _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+        // _MM_ROUND_UP or _MM_ROUND_TOWARD_ZERO; any other (invalid) value is
+        // treated as FE_TOWARDZERO (truncate).
+        rounding = FE_TOWARDZERO;
+    }
+    fesetround(rounding);
+}
+
+// Copy single-precision (32-bit) floating-point element a to the lower element
+// of dst, and zero the upper 3 elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
+FORCE_INLINE __m128 _mm_set_ss(float a)
+{
+    return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
+}
+
+// Broadcast single-precision (32-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
+FORCE_INLINE __m128 _mm_set1_ps(float _w)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Set the MXCSR control and status register with the value in unsigned 32-bit
+// integer a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
+// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
+FORCE_INLINE void _mm_setcsr(unsigned int a)
+{
+    _MM_SET_ROUNDING_MODE(a);
+}
+
+// Get the unsigned 32-bit value of the MXCSR control and status register.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
+// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
+FORCE_INLINE unsigned int _mm_getcsr(void)
+{
+    return _MM_GET_ROUNDING_MODE();
+}
+
+// Set packed single-precision (32-bit) floating-point elements in dst with the
+// supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
+FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
+{
+    float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Return vector of type __m128 with all elements set to zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
+FORCE_INLINE __m128 _mm_setzero_ps(void)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(0));
+}
+
+// Shuffle 16-bit integers in a using the control in imm8, and store the results
+// in dst.
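+//
+// Illustrative example (not from the upstream sse2neon sources, and assuming
+// the _mm_setr_pi16 helper provided elsewhere in this header): imm8 packs
+// four 2-bit source-lane indices, the least significant pair selecting result
+// lane 0:
+//
+//   __m64 v = _mm_setr_pi16(10, 11, 12, 13);
+//   __m64 r = _mm_shuffle_pi16(v, _MM_SHUFFLE(0, 1, 2, 3));
+//   // r = {13, 12, 11, 10}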
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16 +#ifdef _sse2neon_shuffle +#define _mm_shuffle_pi16(a, imm) \ + vreinterpret_m64_s16(vshuffle_s16( \ + vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \ + ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))) +#else +#define _mm_shuffle_pi16(a, imm) \ + _sse2neon_define1( \ + __m64, a, int16x4_t ret; \ + ret = vmov_n_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \ + 3); \ + _sse2neon_return(vreinterpret_m64_s16(ret));) +#endif + +// Perform a serializing operation on all store-to-memory instructions that were +// issued prior to this instruction. Guarantees that every store instruction +// that precedes, in program order, is globally visible before any store +// instruction which follows the fence in program order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence +FORCE_INLINE void _mm_sfence(void) +{ + _sse2neon_smp_mb(); +} + +// Perform a serializing operation on all load-from-memory and store-to-memory +// instructions that were issued prior to this instruction. Guarantees that +// every memory access that precedes, in program order, the memory fence +// instruction is globally visible before any memory instruction which follows +// the fence in program order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence +FORCE_INLINE void _mm_mfence(void) +{ + _sse2neon_smp_mb(); +} + +// Perform a serializing operation on all load-from-memory instructions that +// were issued prior to this instruction. Guarantees that every load instruction +// that precedes, in program order, is globally visible before any load +// instruction which follows the fence in program order. 
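+//
+// Illustrative note (not from the upstream sse2neon sources): ARM offers no
+// direct load-only or store-only equivalent of the x86 fences, so _mm_sfence,
+// _mm_mfence and _mm_lfence all lower to the same full barrier
+// (_sse2neon_smp_mb, typically a dmb ish), which is stronger than, and hence
+// a correct substitute for, each individual x86 fence.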
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence +FORCE_INLINE void _mm_lfence(void) +{ + _sse2neon_smp_mb(); +} + +// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) +// int imm) +#ifdef _sse2neon_shuffle +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + float32x4_t _input1 = vreinterpretq_f32_m128(a); \ + float32x4_t _input2 = vreinterpretq_f32_m128(b); \ + float32x4_t _shuf = \ + vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128_f32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_ps(a, b, imm) \ + _sse2neon_define2( \ + __m128, a, b, __m128 ret; switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_ps_1032(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_ps_2301(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_ps_0321(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_ps_2103(_a, _b); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_movelh_ps(_a, _b); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_ps_1001(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_ps_0101(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 1, 0): \ + ret = _mm_shuffle_ps_3210(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 0, 1, 1): \ + ret = _mm_shuffle_ps_0011(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 0, 2, 2): \ + ret = _mm_shuffle_ps_0022(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 2, 0, 0): \ + ret = _mm_shuffle_ps_2200(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 0, 2): \ + ret = _mm_shuffle_ps_3202(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 3, 2): \ + ret = _mm_movehl_ps(_b, _a); \ + break; \ + case _MM_SHUFFLE(1, 1, 3, 3): \ + ret = _mm_shuffle_ps_1133(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 1, 0): \ + ret = _mm_shuffle_ps_2010(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 0, 1): \ + ret = _mm_shuffle_ps_2001(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 3, 2): \ + ret = _mm_shuffle_ps_2032(_a, _b); \ + break; \ + default: \ + ret = _mm_shuffle_ps_default(_a, _b, (imm)); \ + break; \ + } _sse2neon_return(ret);) +#endif + +// Compute the square root of packed single-precision (32-bit) floating-point +// elements in a, and store the results in dst. +// Due to ARMv7-A NEON's lack of a precise square root intrinsic, we implement +// square root by multiplying input in with its reciprocal square root before +// using the Newton-Raphson method to approximate the results. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps +FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_SQRT + return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); +#else + float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + + // Test for vrsqrteq_f32(0) -> positive infinity case. + // Change to zero, so that s * 1/sqrt(s) result is zero too. 
+    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
+    const uint32x4_t div_by_zero =
+        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
+    recip = vreinterpretq_f32_u32(
+        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
+
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+    // Additional Newton-Raphson iteration for accuracy
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+
+    // sqrt(s) = s * 1/sqrt(s)
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
+#endif
+}
+
+// Compute the square root of the lower single-precision (32-bit) floating-point
+// element in a, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
+FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
+FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
+FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    vst1q_f32(p, vdupq_n_f32(a0));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// memory. mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
+FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
+{
+    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
+#define _mm_store1_ps _mm_store_ps1
+
+// Store the upper 2 single-precision (32-bit) floating-point elements from a
+// into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
+FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
+{
+    *p = vreinterpret_m64_f32(vget_high_f32(a));
+}
+
+// Store the lower 2 single-precision (32-bit) floating-point elements from a
+// into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
+FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
+{
+    *p = vreinterpret_m64_f32(vget_low_f32(a));
+}
+
+// Store 4 single-precision (32-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
+FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
+{
+    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
+    float32x4_t rev = vextq_f32(tmp, tmp, 2);
+    vst1q_f32(p, rev);
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from a into memory. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
+FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Stores 16-bits of integer data a at the address p.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
+FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
+{
+    vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
+}
+
+// Stores 64-bits of integer data a at the address p.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
+FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
+{
+    vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
+}
+
+// Store 64-bits of integer data from a into memory using a non-temporal memory
+// hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
+FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
+{
+    vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
+// point elements) from a into memory using a non-temporal memory hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
+FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, (float32x4_t *) p);
+#else
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+#endif
+}
+
+// Subtract packed single-precision (32-bit) floating-point elements in b from
+// packed single-precision (32-bit) floating-point elements in a, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
+FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Subtract the lower single-precision (32-bit) floating-point element in b from
+// the lower single-precision (32-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper 3 packed elements from
+// a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
+FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_sub_ps(a, b));
+}
+
+// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
+// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
+// transposed matrix in these vectors (row0 now contains column 0, etc.).
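+//
+// Illustrative example (not from the upstream sse2neon sources): the macro
+// builds the 4x4 transpose out of vtrnq_f32 2x2 transposes plus
+// vget/vcombine recombination:
+//
+//   __m128 r0 = _mm_setr_ps(1, 2, 3, 4), r1 = _mm_setr_ps(5, 6, 7, 8),
+//          r2 = _mm_setr_ps(9, 10, 11, 12), r3 = _mm_setr_ps(13, 14, 15, 16);
+//   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);  // r0 is now {1, 5, 9, 13}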
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ + float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ + row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ + vget_low_f32(ROW23.val[0])); \ + row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ + vget_low_f32(ROW23.val[1])); \ + row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ + vget_high_f32(ROW23.val[0])); \ + row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ + vget_high_f32(ROW23.val[1])); \ + } while (0) + +// according to the documentation, these intrinsics behave the same as the +// non-'u' versions. We'll just alias them here. +#define _mm_ucomieq_ss _mm_comieq_ss +#define _mm_ucomige_ss _mm_comige_ss +#define _mm_ucomigt_ss _mm_comigt_ss +#define _mm_ucomile_ss _mm_comile_ss +#define _mm_ucomilt_ss _mm_comilt_ss +#define _mm_ucomineq_ss _mm_comineq_ss + +// Return vector of type __m128i with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128 +FORCE_INLINE __m128i _mm_undefined_si128(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128i a; +#if defined(_MSC_VER) + a = _mm_setzero_si128(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Return vector of type __m128 with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps +FORCE_INLINE __m128 _mm_undefined_ps(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128 a; +#if defined(_MSC_VER) + a = _mm_setzero_ps(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the high half a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Compute the bitwise XOR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. 
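+//
+// Illustrative example (not from the upstream sse2neon sources): a common use
+// is flipping sign bits without touching the rest of the value:
+//
+//   __m128 v = _mm_setr_ps(1.0f, -2.0f, 3.0f, -4.0f);
+//   __m128 neg = _mm_xor_ps(v, _mm_set1_ps(-0.0f));  // {-1, 2, -3, 4}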
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +/* SSE2 */ + +// Add packed 16-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16 +FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed 32-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32 +FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Add packed 64-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64 +FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Add packed 8-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8 +FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Add packed double-precision (64-bit) floating-point elements in a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd +FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[2]; + c[0] = a0 + b0; + c[1] = a1 + b1; + return vld1q_f32((float32_t *) c); +#endif +} + +// Add the lower double-precision (64-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper element from +// a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd +FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_add_pd(a, b)); +#else + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2]; + c[0] = a0 + b0; + c[1] = a1; + return vld1q_f32((float32_t *) c); +#endif +} + +// Add 64-bit integers a and b, and store the result in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64 +FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// Add packed signed 16-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16 +FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed signed 8-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8 +FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Add packed unsigned 16-bit integers in a and b using saturation, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16 +FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Add packed unsigned 8-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8 +FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compute the bitwise AND of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd +FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128 +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compute the bitwise NOT of packed double-precision (64-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd +FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) +{ + // *NOTE* argument swap + return vreinterpretq_m128d_s64( + vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); +} + +// Compute the bitwise NOT of 128 bits (representing integer data) in a and then +// AND with b, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128 +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vbicq_s32(vreinterpretq_s32_m128i(b), + vreinterpretq_s32_m128i(a))); // *NOTE* argument swap +} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. 
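+//
+// Illustrative note (not from the upstream sse2neon sources): vrhaddq_* is
+// the rounding halving add, (a + b + 1) >> 1 computed without intermediate
+// overflow, which matches the x86 pavg semantics exactly:
+//
+//   _mm_avg_epu16(_mm_set1_epi16(1), _mm_set1_epi16(2));  // every lane = 2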
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16 +FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) +{ + return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), + vreinterpretq_u16_m128i(b)); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8 +FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128 +#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128 +#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm) + +// Cast vector of type __m128d to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps +FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) +{ + return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a)); +} + +// Cast vector of type __m128d to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128 +FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) +{ + return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a)); +} + +// Cast vector of type __m128 to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd +FORCE_INLINE __m128d _mm_castps_pd(__m128 a) +{ + return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); +} + +// Cast vector of type __m128 to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128 +FORCE_INLINE __m128i _mm_castps_si128(__m128 a) +{ + return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); +} + +// Cast vector of type __m128i to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd +FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a)); +#else + return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a)); +#endif +} + +// Cast vector of type __m128i to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. 
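+//
+// Illustrative example (not from the upstream sse2neon sources): every
+// _mm_cast* helper is a pure bit reinterpretation (vreinterpretq underneath),
+// so values round-trip exactly:
+//
+//   __m128i i = _mm_set1_epi32(0x3F800000);
+//   float f = _mm_cvtss_f32(_mm_castsi128_ps(i));  // 1.0f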
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+{
+    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+}
+
+// Invalidate and flush the cache line that contains p from all levels of the
+// cache hierarchy.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
+#if defined(__APPLE__)
+#include <libkern/OSCacheControl.h>
+#endif
+FORCE_INLINE void _mm_clflush(void const *p)
+{
+    (void) p;
+
+    /* sys_icache_invalidate is supported since macOS 10.5.
+     * However, it does not work on non-jailbroken iOS devices, although the
+     * compilation is successful.
+     */
+#if defined(__APPLE__)
+    sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
+#elif defined(__GNUC__) || defined(__clang__)
+    uintptr_t ptr = (uintptr_t) p;
+    __builtin___clear_cache((char *) ptr,
+                            (char *) ptr + SSE2NEON_CACHELINE_SIZE);
+#elif (_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H
+    FlushInstructionCache(GetCurrentProcess(), p, SSE2NEON_CACHELINE_SIZE);
+#endif
+}
+
+// Compare packed 16-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
+FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed 32-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed 8-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
+FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
+FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for equality, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
+FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for greater-than-or-equal, and store the results in dst.
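+// [Editor's note] Each comparison fills a 64-bit lane with all ones (true) or
+// all zeros (false), which composes with the bitwise ops above into branchless
+// selects (illustrative sketch, not upstream code):
+//     __m128d m = _mm_cmpge_pd(a, b);  // per-lane mask for a >= b
+//     __m128d r = _mm_or_pd(_mm_and_pd(m, x), _mm_andnot_pd(m, y));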
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd +FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64( + vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 >= b1 ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd +FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmpge_pd(a, b)); +#else + // expand "_mm_cmpge_pd()" to reduce unnecessary operations + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + uint64_t d[2]; + d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed signed 16-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16 +FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed signed 32-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32 +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8 +FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. 
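+// [Editor's note] Combined with _mm_movemask_pd (defined later in this file),
+// the lane masks collapse into an integer for scalar branching (illustrative):
+//     int bits = _mm_movemask_pd(_mm_cmpgt_pd(a, b)); // bit i set if a[i] > b[i]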
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
+FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    uint64_t d[2];
+    d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 > b1 ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
+FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
+#else
+    // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
+    double a0, b0;
+    a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    uint64_t d[2];
+    d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
+FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_u64(
+        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    uint64_t d[2];
+    d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 <= b1 ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
+FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return _mm_move_sd(a, _mm_cmple_pd(a, b));
+#else
+    // expand "_mm_cmple_pd()" to reduce unnecessary operations
+    double a0, b0;
+    a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    uint64_t d[2];
+    d[0] = a0 <= b0 ?
~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed signed 16-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16 +FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed signed 32-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32 +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8 +FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd +FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64( + vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 < b1 ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd +FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmplt_pd(a, b)); +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + uint64_t d[2]; + d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. 
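+// [Editor's note] NEQ is the unordered inverse of EQ: a lane holding NaN
+// always compares not-equal, so _mm_cmpneq_pd(x, x) flags exactly the NaN
+// lanes of x (illustrative observation, not upstream documentation).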
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
+FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
+#else
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-equal, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
+FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
+FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    uint64_t d[2];
+    d[0] = !(a0 >= b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 >= b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-greater-than-or-equal, store the result in the lower element of
+// dst, and copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
+FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd
+FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    uint64_t d[2];
+    d[0] = !(a0 > b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 > b1) ?
~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd +FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpngt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd +FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64(veorq_u64( + vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = !(a0 <= b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 <= b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd +FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnle_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd +FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64(veorq_u64( + vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = !(a0 < b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 < b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper element from a to the upper element of dst. 
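+// [Editor's note] The not-* predicates are not mere operand swaps once NaNs
+// appear: !(NaN < x) is true, so _mm_cmpnlt_pd sets a lane that _mm_cmpge_pd
+// would leave clear (illustrative observation).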
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd +FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnlt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd +FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + // Excluding NaNs, any two floating point numbers can be compared. + uint64x2_t not_nan_a = + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); + uint64x2_t not_nan_b = + vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b)); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (a1 == a1 && b1 == b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd +FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmpord_pd(a, b)); +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + uint64_t d[2]; + d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd +FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + // Two NaNs are not equal in comparison operation. + uint64x2_t not_nan_a = + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); + uint64x2_t not_nan_b = + vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_s32( + vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b)))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0); + d[1] = (a1 == a1 && b1 == b1) ? 
UINT64_C(0) : ~UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd +FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmpunord_pd(a, b)); +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + uint64_t d[2]; + d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd +FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return a0 >= b0; +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd +FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1; +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + + return a0 > b0; +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd +FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1; +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + + return a0 <= b0; +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). 
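+// [Editor's note] Unlike the _mm_cmp*_sd family, the _mm_comi*_sd functions
+// return a plain int, so they slot directly into scalar control flow
+// (illustrative sketch, not upstream code):
+//     if (_mm_comilt_sd(a, b)) { /* low lane of a is smaller */ }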
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd +FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1; +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + + return a0 < b0; +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd +FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1; +#else + uint32x4_t a_not_nan = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a)); + uint32x4_t b_not_nan = + vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_eq_b = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan), + vreinterpretq_u64_u32(a_eq_b)); + return vgetq_lane_u64(and_results, 0) & 0x1; +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd +FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b) +{ + return !_mm_comieq_sd(a, b); +} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd +FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))))); +#else + double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); + double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps +FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. 
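+// [Editor's note] Illustrative sketch (not upstream code): under the default
+// round-to-nearest-even mode, both halfway cases below round to 2:
+//     __m128i r = _mm_cvtpd_epi32(_mm_set_pd(2.5, 1.5)); // lanes {2, 2, 0, 0}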
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32 +FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) +{ +// vrnd32xq_f64 not supported on clang +#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__) + float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a)); + int64x2_t integers = vcvtq_s64_f64(rounded); + return vreinterpretq_m128i_s32( + vcombine_s32(vmovn_s64(integers), vdup_n_s32(0))); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); + return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0); +#endif +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32 +FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a) +{ + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1}; + return vreinterpret_m64_s32(vld1_s32(data)); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed single-precision (32-bit) floating-point elements, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps +FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); + return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); +#else + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_ps(0, 0, (float) a1, (float) a0); +#endif +} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd +FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a)))); +#else + double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0); + double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32 +// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A +// does not support! It is supported on ARMv8-A however. 
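+// [Editor's note] The ARMv7-A fallback below therefore emulates
+// round-half-to-even by hand: it rounds normally, truncates, and selects the
+// even candidate whenever the fractional part is exactly +/-0.5 (illustrative
+// summary of the code that follows).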
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) +{ +#if defined(__ARM_FEATURE_FRINT) + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a))); +#elif (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + switch (_MM_GET_ROUNDING_MODE()) { + case _MM_ROUND_NEAREST: + return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); + case _MM_ROUND_DOWN: + return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a)); + case _MM_ROUND_UP: + return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a)); + default: // _MM_ROUND_TOWARD_ZERO + return vreinterpretq_m128i_s32(vcvtq_s32_f32(a)); + } +#else + float *f = (float *) &a; + switch (_MM_GET_ROUNDING_MODE()) { + case _MM_ROUND_NEAREST: { + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = vcvtq_s32_f32( + vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = + vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128i_s32( + vbslq_s32(is_delta_half, r_even, r_normal)); + } + case _MM_ROUND_DOWN: + return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]), + floorf(f[0])); + case _MM_ROUND_UP: + return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), + ceilf(f[0])); + default: // _MM_ROUND_TOWARD_ZERO + return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1], + (int32_t) f[0]); + } +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed double-precision (64-bit) floating-point elements, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd +FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); +#else + double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Copy the lower double-precision (64-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64 +FORCE_INLINE double _mm_cvtsd_f64(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); +#else + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return _a; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32 +FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double ret = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + return (int32_t) ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64 +FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double ret = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + return (int64_t) ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x +#define _mm_cvtsd_si64x _mm_cvtsd_si64 + +// Convert the lower double-precision (64-bit) floating-point element in b to a +// single-precision (32-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper 3 packed elements from a to the +// upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss +FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32(vsetq_lane_f32( + vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0), + vreinterpretq_f32_m128(a), 0)); +#else + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b0, vreinterpretq_f32_m128(a), 0)); +#endif +} + +// Copy the lower 32-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32 +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) +{ + return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64 +FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) +{ + return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Convert the signed 32-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
+FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
+#else
+    int64_t _b = sse2neon_recast_f64_s64((double) b);
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Copy 32-bit integer a to the lower elements of dst, and zero the upper
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
+FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
+{
+    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
+}
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
+FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
+#else
+    int64_t _b = sse2neon_recast_f64_s64((double) b);
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
+FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
+{
+    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
+}
+
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
+#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
+#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
+
+// Convert the lower single-precision (32-bit) floating-point element in b to a
+// double-precision (64-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
+FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
+{
+    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
+#else
+    return vreinterpretq_m128d_s64(vsetq_lane_s64(
+        sse2neon_recast_f64_s64(d), vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
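+// [Editor's note] The cvtt* forms always truncate toward zero regardless of
+// the current rounding mode (illustrative sketch, not upstream code):
+//     _mm_cvttsd_si32(_mm_set_sd(-1.9)); // -1, whereas the rounding
+//                                        // _mm_cvtsd_si32 above yields -2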
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32 +FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) +{ + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 +FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) +{ + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1}; + return vreinterpret_m64_s32(vld1_s32(data)); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32 +FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) +{ + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32 +FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) +{ + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return (int32_t) _a; +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64 +FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); +#else + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return (int64_t) _a; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x +#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) + +// Divide packed double-precision (64-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. 
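+// [Editor's note] Division is element-wise (illustrative sketch, not upstream
+// code); note that _mm_set_pd takes the high element first:
+//     __m128d q = _mm_div_pd(_mm_set_pd(9.0, 4.0), _mm_set_pd(3.0, 2.0));
+//     // low lane 4.0 / 2.0 = 2.0, high lane 9.0 / 3.0 = 3.0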
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd +FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[2]; + c[0] = a0 / b0; + c[1] = a1 / b1; + return vld1q_f32((float32_t *) c); +#endif +} + +// Divide the lower double-precision (64-bit) floating-point element in a by the +// lower double-precision (64-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper element from a to the upper +// element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd +FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + float64x2_t tmp = + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_f64( + vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1)); +#else + return _mm_move_sd(a, _mm_div_pd(a, b)); +#endif +} + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16 +// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) +#define _mm_extract_epi16(a, imm) \ + vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16 +// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, +// __constrange(0,8) int imm) +#define _mm_insert_epi16(a, b, imm) \ + vreinterpretq_m128i_s16( \ + vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))) + +// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from memory into dst. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd +FORCE_INLINE __m128d _mm_load_pd(const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vld1q_f64(p)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1 +#define _mm_load_pd1 _mm_load1_pd + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower of dst, and zero the upper element. mem_addr does not need to be +// aligned on any particular boundary. 
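+// [Editor's note] Illustrative sketch (not upstream code):
+//     double x = 3.0;
+//     __m128d v = _mm_load_sd(&x); // v = {3.0, 0.0}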
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd +FORCE_INLINE __m128d _mm_load_sd(const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Load 128-bits of integer data from memory into dst. mem_addr must be aligned +// on a 16-byte boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128 +FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd +FORCE_INLINE __m128d _mm_load1_pd(const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p)); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// upper element of dst, and copy the lower element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd +FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); +#else + return vreinterpretq_m128d_f32(vcombine_f32( + vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); +#endif +} + +// Load 64-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64 +FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) +{ + /* Load the lower 64 bits of the value pointed to by p into the + * lower 64 bits of the result, zeroing the upper 64 bits of the result. + */ + return vreinterpretq_m128i_s32( + vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower element of dst, and copy the upper element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd +FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); +#else + return vreinterpretq_m128d_f32( + vcombine_f32(vld1_f32((const float *) p), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +#endif +} + +// Load 2 double-precision (64-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
+FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    float64x2_t v = vld1q_f64(p);
+    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
+#else
+    int64x2_t v = vld1q_s64((const int64_t *) p);
+    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
+#endif
+}
+
+// Load two double-precision (64-bit) floating-point values from unaligned
+// memory into dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
+FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
+{
+    return _mm_load_pd(p);
+}
+
+// Load 128-bits of integer data from memory into dst. mem_addr does not need to
+// be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const unaligned_int32_t *) p));
+}
+
+// Load unaligned 32-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
+FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
+{
+    return vreinterpretq_m128i_s32(
+        vsetq_lane_s32(*(const unaligned_int32_t *) p, vdupq_n_s32(0), 0));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
+// 32-bit integers, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+{
+    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                              vget_low_s16(vreinterpretq_s16_m128i(b)));
+#if defined(__aarch64__) || defined(_M_ARM64)
+    int32x4_t high =
+        vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
+
+    return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
+#else
+    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                               vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
+    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
+
+    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+#endif
+}
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint. mem_addr does not need to be aligned
+// on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
+FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
+{
+    int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
+    __m128 b = _mm_load_ps((const float *) mem_addr);
+    int8x16_t masked =
+        vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
+                 vreinterpretq_s8_m128(b));
+    vst1q_s8((int8_t *) mem_addr, masked);
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
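+// [Editor's note] Illustrative sketch (not upstream code): paired with
+// _mm_min_epi16 (defined just below), this gives a branchless per-lane clamp
+// to [lo, hi]:
+//     __m128i c = _mm_min_epi16(_mm_max_epi16(v, lo), hi);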
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16 +FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8 +FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed maximum values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd +FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_PRECISE_MINMAX + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b)); +#else + return vreinterpretq_m128d_f64( + vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#endif +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + int64_t d[2]; + d[0] = a0 > b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0); + d[1] = a1 > b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1); + + return vreinterpretq_m128d_s64(vld1q_s64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd +FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_max_pd(a, b)); +#else + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2] = {a0 > b0 ? a0 : b0, a1}; + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); +#endif +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16 +FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8 +FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed minimum values in dst. 
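+// [Editor's note] On x86, min/max return the second operand when the
+// comparison is unordered, e.g. _mm_min_pd(NaN, x) == x. The
+// SSE2NEON_PRECISE_MINMAX path below reproduces that with a compare-and-
+// select, whereas plain vminq_f64 would propagate the NaN (illustrative
+// observation).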
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd +FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_PRECISE_MINMAX + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b)); +#else + return vreinterpretq_m128d_f64( + vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#endif +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + int64_t d[2]; + d[0] = a0 < b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0); + d[1] = a1 < b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1); + return vreinterpretq_m128d_s64(vld1q_s64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd +FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_min_pd(a, b)); +#else + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2] = {a0 < b0 ? a0 : b0, a1}; + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); +#endif +} + +// Copy the lower 64-bit integer in a to the lower element of dst, and zero the +// upper element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64 +FORCE_INLINE __m128i _mm_move_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); +} + +// Move the lower double-precision (64-bit) floating-point element from b to the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd +FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f32( + vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +} + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8 +FORCE_INLINE int _mm_movemask_epi8(__m128i a) +{ + // Use increasingly wide shifts+adds to collect the sign bits + // together. + // Since the widening shifts would be rather confusing to follow in little + // endian, everything will be illustrated in big endian order instead. This + // has a different result - the bits would actually be reversed on a big + // endian machine. + + // Starting input (only half the elements are shown): + // 89 ff 1d c0 00 10 99 33 + uint8x16_t input = vreinterpretq_u8_m128i(a); + + // Shift out everything but the sign bits with an unsigned shift right. 
+    //
+    // Bytes of the vector:
+    // 89 ff 1d c0 00 10 99 33
+    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
+    //  |  |  |  |  |  |  |  |
+    // 01 01 00 01 00 00 01 00
+    //
+    // Bits of first important lane(s):
+    // 10001001 (89)
+    // \______
+    //        |
+    // 00000001 (01)
+    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
+
+    // Merge the even lanes together with a 16-bit unsigned shift right + add.
+    // 'xx' represents garbage data which will be ignored in the final result.
+    // In the important bytes, the add functions like a binary OR.
+    //
+    // 01 01 00 01 00 00 01 00
+    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
+    //    \|    \|    \|    \|
+    // xx 03 xx 01 xx 00 xx 02
+    //
+    // 00000001 00000001 (01 01)
+    //        \_______ |
+    //                \|
+    // xxxxxxxx xxxxxx11 (xx 03)
+    uint32x4_t paired16 =
+        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+
+    // Repeat with a wider 32-bit shift + add.
+    // xx 03 xx 01 xx 00 xx 02
+    //     \____ |     \____ |   paired32 = (uint64x1_t)(paired16 + (paired16 >> 14))
+    //          \|          \|
+    // xx xx xx 0d xx xx xx 02
+    //
+    // 00000011 00000001 (03 01)
+    //        \\_____ ||
+    //         '----.\||
+    // xxxxxxxx xxxx1101 (xx 0d)
+    uint64x2_t paired32 =
+        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+
+    // Last, an even wider 64-bit shift + add to get our result in the low 8
+    // bit lanes.
+    // xx xx xx 0d xx xx xx 02
+    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
+    //                      \|
+    // xx xx xx xx xx xx xx d2
+    //
+    // 00001101 00000010 (0d 02)
+    //     \   \___ |  |
+    //      '---.  \|  |
+    // xxxxxxxx 11010010 (xx d2)
+    uint8x16_t paired64 =
+        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+
+    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
+    // xx xx xx xx xx xx xx d2
+    //                      ||  return paired64[0]
+    //                      d2
+    // Note: Little endian would return the correct value 4b (01001011) instead.
+    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
+}
+
+// Set each bit of mask dst based on the most significant bit of the
+// corresponding packed double-precision (64-bit) floating-point element in a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
+FORCE_INLINE int _mm_movemask_pd(__m128d a)
+{
+    uint64x2_t input = vreinterpretq_u64_m128d(a);
+    uint64x2_t high_bits = vshrq_n_u64(input, 63);
+    return (int) (vgetq_lane_u64(high_bits, 0) |
+                  (vgetq_lane_u64(high_bits, 1) << 1));
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
+FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
+{
+    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
+}
+
+// Copy the 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
+FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
+}
+
+// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
+// a and b, and store the unsigned 64-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
+FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
+{
+    // vmull_u32 upcasts instead of masking, so we downcast.
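+    // (vmovn_u64 keeps only the low 32 bits of each 64-bit lane, which is
+    // exactly the masking behavior _mm_mul_epu32 requires.)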
+ uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); + uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); + return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); +} + +// Multiply packed double-precision (64-bit) floating-point elements in a and b, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd +FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[2]; + c[0] = a0 * b0; + c[1] = a1 * b1; + return vld1q_f32((float32_t *) c); +#endif +} + +// Multiply the lower double-precision (64-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper element +// from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd +FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_mul_pd(a, b)); +} + +// Multiply the low unsigned 32-bit integers from a and b, and store the +// unsigned 64-bit result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32 +FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) +{ + return vreinterpret_m64_u64(vget_low_u64( + vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); +} + +// Multiply the packed signed 16-bit integers in a and b, producing intermediate +// 32-bit integers, and store the high 16 bits of the intermediate integers in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16 +FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) +{ + /* FIXME: issue with large values because of result saturation */ + // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), + // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return + // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); + int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ + int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. 
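+// Illustrative example:
+//   __m128i r = _mm_mulhi_epu16(_mm_set1_epi16((short) 0xFFFF),
+//                               _mm_set1_epi16((short) 0xFFFF));
+//   // 0xFFFF * 0xFFFF = 0xFFFE0001, so every 16-bit lane of r is 0xFFFE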
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16 +FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) +{ + uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab3210 = vmull_u16(a3210, b3210); +#if defined(__aarch64__) || defined(_M_ARM64) + uint32x4_t ab7654 = + vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); + uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), + vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r); +#else + uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab7654 = vmull_u16(a7654, b7654); + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +#endif +} + +// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit +// integers, and store the low 16 bits of the intermediate integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16 +FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compute the bitwise OR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd +FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise OR of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128 +FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Convert packed signed 16-bit integers from a and b to packed 8-bit integers +// using signed saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16 +FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), + vqmovn_s16(vreinterpretq_s16_m128i(b)))); +} + +// Convert packed signed 32-bit integers from a and b to packed 16-bit integers +// using signed saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32 +FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), + vqmovn_s32(vreinterpretq_s32_m128i(b)))); +} + +// Convert packed signed 16-bit integers from a and b to packed 8-bit integers +// using unsigned saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16 +FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) +{ + return vreinterpretq_m128i_u8( + vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), + vqmovun_s16(vreinterpretq_s16_m128i(b)))); +} + +// Pause the processor. 
This is typically used in spin-wait loops and, depending
+// on the x86 processor, typical values are in the 40-100 cycle range. The
+// 'yield' instruction isn't a good fit because it's effectively a nop on most
+// Arm cores. Experience with several databases has shown that an 'isb' is a
+// reasonable approximation.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
+FORCE_INLINE void _mm_pause(void)
+{
+#if defined(_MSC_VER) && !defined(__clang__)
+    __isb(_ARM64_BARRIER_SY);
+#else
+    __asm__ __volatile__("isb\n");
+#endif
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce two
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of 64-bit elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
+FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
+{
+    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
+    return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
+}
+
+// Set packed 16-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
+FORCE_INLINE __m128i _mm_set_epi16(short i7,
+                                   short i6,
+                                   short i5,
+                                   short i4,
+                                   short i3,
+                                   short i2,
+                                   short i1,
+                                   short i0)
+{
+    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+    return vreinterpretq_m128i_s16(vld1q_s16(data));
+}
+
+// Set packed 32-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Set packed 64-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
+FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
+{
+    return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0));
+}
+
+// Set packed 64-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
+}
+
+// Set packed 8-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
+FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
+                                  signed char b14,
+                                  signed char b13,
+                                  signed char b12,
+                                  signed char b11,
+                                  signed char b10,
+                                  signed char b9,
+                                  signed char b8,
+                                  signed char b7,
+                                  signed char b6,
+                                  signed char b5,
+                                  signed char b4,
+                                  signed char b3,
+                                  signed char b2,
+                                  signed char b1,
+                                  signed char b0)
+{
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values.
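+// Illustrative example: _mm_set_pd(1.0, 2.0) places 2.0 in the low element
+// and 1.0 in the high element, i.e. arguments are given high-to-low.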
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd +FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) +{ + double ALIGN_STRUCT(16) data[2] = {e0, e1}; +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); +#else + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); +#endif +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1 +#define _mm_set_pd1 _mm_set1_pd + +// Copy double-precision (64-bit) floating-point element a to the lower element +// of dst, and zero the upper element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd +FORCE_INLINE __m128d _mm_set_sd(double a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0)); +#else + return _mm_set_pd(0, a); +#endif +} + +// Broadcast 16-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16 +FORCE_INLINE __m128i _mm_set1_epi16(short w) +{ + return vreinterpretq_m128i_s16(vdupq_n_s16(w)); +} + +// Broadcast 32-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32 +FORCE_INLINE __m128i _mm_set1_epi32(int _i) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); +} + +// Broadcast 64-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64 +FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) +{ + return vreinterpretq_m128i_s64(vdupq_lane_s64(_i, 0)); +} + +// Broadcast 64-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x +FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) +{ + return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); +} + +// Broadcast 8-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8 +FORCE_INLINE __m128i _mm_set1_epi8(signed char w) +{ + return vreinterpretq_m128i_s8(vdupq_n_s8(w)); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd +FORCE_INLINE __m128d _mm_set1_pd(double d) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vdupq_n_f64(d)); +#else + int64_t _d = sse2neon_recast_f64_s64(d); + return vreinterpretq_m128d_s64(vdupq_n_s64(_d)); +#endif +} + +// Set packed 16-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16 +FORCE_INLINE __m128i _mm_setr_epi16(short w0, + short w1, + short w2, + short w3, + short w4, + short w5, + short w6, + short w7) +{ + int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; + return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); +} + +// Set packed 32-bit integers in dst with the supplied values in reverse order. 
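+// Illustrative example: _mm_setr_epi32(0, 1, 2, 3) stores the lanes in memory
+// order {0, 1, 2, 3}, the reverse convention of _mm_set_epi32.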
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32 +FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Set packed 64-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64 +FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) +{ + return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); +} + +// Set packed 8-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8 +FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, + signed char b1, + signed char b2, + signed char b3, + signed char b4, + signed char b5, + signed char b6, + signed char b7, + signed char b8, + signed char b9, + signed char b10, + signed char b11, + signed char b12, + signed char b13, + signed char b14, + signed char b15) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd +FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) +{ + return _mm_set_pd(e0, e1); +} + +// Return vector of type __m128d with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd +FORCE_INLINE __m128d _mm_setzero_pd(void) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vdupq_n_f64(0)); +#else + return vreinterpretq_m128d_f32(vdupq_n_f32(0)); +#endif +} + +// Return vector of type __m128i with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128 +FORCE_INLINE __m128i _mm_setzero_si128(void) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(0)); +} + +// Shuffle 32-bit integers in a using the control in imm8, and store the results +// in dst. 
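+// Illustrative example:
+//   __m128i v = _mm_setr_epi32(10, 11, 12, 13);
+//   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
+//   // r = {13, 12, 11, 10}: lane i of r is lane ((imm >> 2*i) & 3) of v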
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32 +// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2neon_shuffle) +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + int32x4_t _input = vreinterpretq_s32_m128i(a); \ + int32x4_t _shuf = \ + vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ + vreinterpretq_m128i_s32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_epi32(a, imm) \ + _sse2neon_define1( \ + __m128i, a, __m128i ret; switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_epi_1032(_a); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_epi_2301(_a); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_epi_0321(_a); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_epi_2103(_a); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_shuffle_epi_1010(_a); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_epi_1001(_a); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_epi_0101(_a); \ + break; \ + case _MM_SHUFFLE(2, 2, 1, 1): \ + ret = _mm_shuffle_epi_2211(_a); \ + break; \ + case _MM_SHUFFLE(0, 1, 2, 2): \ + ret = _mm_shuffle_epi_0122(_a); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 2): \ + ret = _mm_shuffle_epi_3332(_a); \ + break; \ + case _MM_SHUFFLE(0, 0, 0, 0): \ + ret = _mm_shuffle_epi32_splat(_a, 0); \ + break; \ + case _MM_SHUFFLE(1, 1, 1, 1): \ + ret = _mm_shuffle_epi32_splat(_a, 1); \ + break; \ + case _MM_SHUFFLE(2, 2, 2, 2): \ + ret = _mm_shuffle_epi32_splat(_a, 2); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 3): \ + ret = _mm_shuffle_epi32_splat(_a, 3); \ + break; \ + default: \ + ret = _mm_shuffle_epi32_default(_a, (imm)); \ + break; \ + } _sse2neon_return(ret);) +#endif + +// Shuffle double-precision (64-bit) floating-point elements using the control +// in imm8, and store the results in dst. 
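+// Illustrative example: _mm_shuffle_pd(a, b, 1) returns {lo: a[1], hi: b[0]};
+// bit 0 of imm8 selects the low lane from a, bit 1 the high lane from b.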
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd +#ifdef _sse2neon_shuffle +#define _mm_shuffle_pd(a, b, imm8) \ + vreinterpretq_m128d_s64( \ + vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \ + imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2)) +#else +#define _mm_shuffle_pd(a, b, imm8) \ + _mm_castsi128_pd(_mm_set_epi64x( \ + vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ + vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) +#endif + +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2neon_shuffle) +#define _mm_shufflehi_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = \ + vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ + (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ + (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +#endif + +// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2neon_shuffle) +#define _mm_shufflelo_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = vshuffleq_s16( \ + _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ + (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) +#endif + +// Shift packed 16-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16 +FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16((int16_t) c); + return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); +} + +// Shift packed 32-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32 +FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32((int32_t) c); + return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); +} + +// Shift packed 64-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64 +FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64((int64_t) c); + return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); +} + +// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. 
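+// Illustrative example:
+//   __m128i r = _mm_slli_epi16(_mm_set1_epi16(0x0101), 4);
+//   // every 16-bit lane of r is 0x1010; counts outside 0..15 zero the vector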
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16 +FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~15)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s16( + vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm))); +} + +// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32 +FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~31)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s32( + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); +} + +// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64 +FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~63)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s64( + vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); +} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128 +#define _mm_slli_si128(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int8x16_t ret; \ + if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \ + else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ + else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \ + ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \ + _sse2neon_return(vreinterpretq_m128i_s8(ret));) + +// Compute the square root of packed double-precision (64-bit) floating-point +// elements in a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd +FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a))); +#else + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double _a0 = sqrt(a0); + double _a1 = sqrt(a1); + return _mm_set_pd(_a1, _a0); +#endif +} + +// Compute the square root of the lower double-precision (64-bit) floating-point +// element in b, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd +FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_sqrt_pd(b)); +#else + double _a, _b; + _a = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + _b = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return _mm_set_pd(_a, sqrt(_b)); +#endif +} + +// Shift packed 16-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. 
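+// Illustrative example: the shift is arithmetic, so the sign is preserved:
+//   __m128i r = _mm_sra_epi16(_mm_set1_epi16(-16), _mm_cvtsi32_si128(2));
+//   // every 16-bit lane of r is -4; counts above 15 replicate the sign bit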
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16 +FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) +{ + int64_t c = vgetq_lane_s64(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_cmplt_epi16(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s16( + vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c))); +} + +// Shift packed 32-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32 +FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) +{ + int64_t c = vgetq_lane_s64(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_cmplt_epi32(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s32( + vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c))); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in sign +// bits, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16 +FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) +{ + const int count = (imm & ~15) ? 15 : imm; + return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); +} + +// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32 +// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srai_epi32(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) { \ + ret = _a; \ + } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \ + ret = vreinterpretq_m128i_s32( \ + vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \ + } else { \ + ret = vreinterpretq_m128i_s32( \ + vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31)); \ + } _sse2neon_return(ret);) + +// Shift packed 16-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16 +FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16(-(int16_t) c); + return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); +} + +// Shift packed 32-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32 +FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32(-(int32_t) c); + return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); +} + +// Shift packed 64-bit integers in a right by count while shifting in zeros, and +// store the results in dst. 
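+// Illustrative example: the shift is logical, so zeros are shifted in:
+//   __m128i r = _mm_srl_epi64(_mm_set1_epi64x(-1), _mm_cvtsi32_si128(60));
+//   // both 64-bit lanes of r are 0xF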
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64 +FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64(-(int64_t) c); + return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16 +#define _mm_srli_epi16(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u16( \ + vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32 +// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srli_epi32(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u32( \ + vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64 +#define _mm_srli_epi64(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u64( \ + vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128 +#define _mm_srli_si128(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int8x16_t ret; \ + if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ + else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \ + (imm > 15 ? 0 : imm)); \ + _sse2neon_return(vreinterpretq_m128i_s8(ret));) + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd +FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); +#else + vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. 
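+// Illustrative example (note the required 16-byte alignment):
+//   double ALIGN_STRUCT(16) buf[2];
+//   _mm_store_pd1(buf, _mm_set_sd(3.5));  // buf[0] == buf[1] == 3.5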
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1 +FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); + vst1q_f64((float64_t *) mem_addr, + vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); +#else + float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); + vst1q_f32((float32_t *) mem_addr, + vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd +FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a))); +#endif +} + +// Store 128-bits of integer data from a into memory. mem_addr must be aligned +// on a 16-byte boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128 +FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd +#define _mm_store1_pd _mm_store_pd1 + +// Store the upper double-precision (64-bit) floating-point element from a into +// memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd +FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a))); +#endif +} + +// Store 64-bit integer from the first element of a into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64 +FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) +{ + vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b))); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd +FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a))); +#endif +} + +// Store 2 double-precision (64-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. 
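+// Illustrative example: after _mm_storer_pd(buf, _mm_set_pd(9.0, 1.0)), buf
+// holds {9.0, 1.0}: the high element is written first, then the low element.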
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd +FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) +{ + float32x4_t f = vreinterpretq_f32_m128d(a); + _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2))); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd +FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) +{ + _mm_store_pd(mem_addr, a); +} + +// Store 128-bits of integer data from a into memory. mem_addr does not need to +// be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128 +FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Store 32-bit integer from the first element of a into memory. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32 +FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a) +{ + vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory using a non-temporal memory hint. mem_addr must +// be aligned on a 16-byte boundary or a general-protection exception may be +// generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd +FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (__m128d *) p); +#elif defined(__aarch64__) || defined(_M_ARM64) + vst1q_f64(p, vreinterpretq_f64_m128d(a)); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a)); +#endif +} + +// Store 128-bits of integer data from a into memory using a non-temporal memory +// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection +// exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128 +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); +#endif +} + +// Store 32-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32 +FORCE_INLINE void _mm_stream_si32(int *p, int a) +{ + vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0); +} + +// Store 64-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64 +FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a) +{ + vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a)); +} + +// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and +// store the results in dst. 
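+// Illustrative example:
+//   __m128i r = _mm_sub_epi16(_mm_set1_epi16(10), _mm_set1_epi16(3));
+//   // every 16-bit lane of r is 7; overflow wraps around (no saturation)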
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16 +FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32 +FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64 +FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8 +FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract packed double-precision (64-bit) floating-point elements in b from +// packed double-precision (64-bit) floating-point elements in a, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd +FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[2]; + c[0] = a0 - b0; + c[1] = a1 - b1; + return vld1q_f32((float32_t *) c); +#endif +} + +// Subtract the lower double-precision (64-bit) floating-point element in b from +// the lower double-precision (64-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd +FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_sub_pd(a, b)); +} + +// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64 +FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a +// using saturation, and store the results in dst. 
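+// Illustrative example: _mm_subs_epi16(_mm_set1_epi16(-32768),
+// _mm_set1_epi16(1)) saturates every lane to -32768 instead of wrapping.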
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16 +FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a +// using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8 +FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit +// integers in a using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16 +FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit +// integers in a using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8 +FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +#define _mm_ucomieq_sd _mm_comieq_sd +#define _mm_ucomige_sd _mm_comige_sd +#define _mm_ucomigt_sd _mm_comigt_sd +#define _mm_ucomile_sd _mm_comile_sd +#define _mm_ucomilt_sd _mm_comilt_sd +#define _mm_ucomineq_sd _mm_comineq_sd + +// Return vector of type __m128d with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd +FORCE_INLINE __m128d _mm_undefined_pd(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128d a; +#if defined(_MSC_VER) && !defined(__clang__) + a = _mm_setzero_pd(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Unpack and interleave 16-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16 +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16( + vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave 32-bit integers from the high half of a and b, and +// store the results in dst. 
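+// Illustrative example:
+//   __m128i r = _mm_unpackhi_epi32(_mm_setr_epi32(0, 1, 2, 3),
+//                                  _mm_setr_epi32(4, 5, 6, 7));
+//   // r = {2, 6, 3, 7}: the upper halves of a and b, interleaved a-first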
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32 +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s32( + vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave 64-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64 +FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s64( + vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); +#endif +} + +// Unpack and interleave 8-bit integers from the high half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8 +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s8( + vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the high half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd +FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)), + vget_high_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Unpack and interleave 16-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16 +FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16( + vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave 32-bit integers from the low half of a and b, and store +// the results in dst. 
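+// Illustrative example: with the same inputs as the _mm_unpackhi_epi32
+// example above, _mm_unpacklo_epi32(a, b) yields {0, 4, 1, 5}.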
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32 +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s32( + vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave 64-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64 +FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s64( + vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); +#endif +} + +// Unpack and interleave 8-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8 +FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s8( + vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd +FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)), + vget_low_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Compute the bitwise XOR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd +FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise XOR of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128 +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +/* SSE3 */ + +// Alternatively add and subtract packed double-precision (64-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. 
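+// Illustrative example: _mm_addsub_pd(_mm_set_pd(1.0, 1.0),
+// _mm_set_pd(2.0, 2.0)) yields {lo: 1.0 - 2.0 = -1.0, hi: 1.0 + 2.0 = 3.0}.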
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
+FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
+{
+    _sse2neon_const __m128d mask = _mm_set_pd(1.0, -1.0);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
+                                             vreinterpretq_f64_m128d(b),
+                                             vreinterpretq_f64_m128d(mask)));
+#else
+    return _mm_add_pd(_mm_mul_pd(b, mask), a);
+#endif
+}
+
+// Alternatively add and subtract packed single-precision (32-bit)
+// floating-point elements in a to/from packed elements in b, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_ps
+FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
+{
+    _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
+    defined(__ARM_FEATURE_FMA) /* VFPv4+ */
+    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
+                                            vreinterpretq_f32_m128(mask),
+                                            vreinterpretq_f32_m128(b)));
+#else
+    return _mm_add_ps(_mm_mul_ps(b, mask), a);
+#endif
+}
+
+// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
+FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_f64(
+        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    double c[] = {a0 + a1, b0 + b1};
+    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128_f32(
+        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of double-precision (64-bit)
+// floating-point elements in a and b, and pack the results in dst.
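+
+// [Editor's illustration, not part of the upstream sse2neon header] A sketch
+// of the horizontal-add semantics above: adjacent pairs are summed within
+// each operand, with a's sums landing in the low half of dst:
+//
+//   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
+//   __m128 b = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
+//   __m128 h = _mm_hadd_ps(a, b);  // {3.0f, 7.0f, 11.0f, 15.0f}
+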
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd +FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64( + vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[] = {a0 - a1, b0 - b1}; + return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); +#endif +} + +// Horizontally subtract adjacent pairs of single-precision (32-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps +FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) +{ + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b))); +#else + float32x4x2_t c = vuzpq_f32(a, b); + return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1])); +#endif +} + +// Load 128-bits of integer data from unaligned memory into dst. This intrinsic +// may perform better than _mm_loadu_si128 when the data crosses a cache line +// boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128 +#define _mm_lddqu_si128 _mm_loadu_si128 + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd +#define _mm_loaddup_pd _mm_load1_pd + +// Duplicate the low double-precision (64-bit) floating-point element from a, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd +FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_u64( + vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0))); +#endif +} + +// Duplicate odd-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps +FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); +#elif defined(_sse2neon_shuffle) + return vreinterpretq_m128_f32(vshuffleq_s32( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); +#else + float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); + float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +// Duplicate even-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. 
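+
+// [Editor's illustration, not part of the upstream sse2neon header] The
+// duplication intrinsics above copy alternating lanes, a pattern Intel
+// introduced for complex-number arithmetic:
+//
+//   __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
+//   __m128 odd = _mm_movehdup_ps(v);   // {2.0f, 2.0f, 4.0f, 4.0f}
+//   __m128 even = _mm_moveldup_ps(v);  // {1.0f, 1.0f, 3.0f, 3.0f}
+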
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps +FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); +#elif defined(_sse2neon_shuffle) + return vreinterpretq_m128_f32(vshuffleq_s32( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); +#else + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); + float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +/* SSSE3 */ + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16 +FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) +{ + return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32 +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8 +FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) +{ + return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16 +FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) +{ + return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32 +FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) +{ + return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8 +FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) +{ + return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); +} + +// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift +// the result right by imm8 bytes, and store the low 16 bytes in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8 +#if defined(__GNUC__) && !defined(__clang__) +#define _mm_alignr_epi8(a, b, imm) \ + __extension__({ \ + uint8x16_t _a = vreinterpretq_u8_m128i(a); \ + uint8x16_t _b = vreinterpretq_u8_m128i(b); \ + __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) \ + ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if (imm >= 16) \ + ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \ + else \ + ret = \ + vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? 
imm : 0)); \ + ret; \ + }) + +#else +#define _mm_alignr_epi8(a, b, imm) \ + _sse2neon_define2( \ + __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \ + uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) ret = \ + vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if (imm >= 16) ret = \ + _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0); \ + else ret = \ + vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0)); \ + _sse2neon_return(ret);) + +#endif + +// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift +// the result right by imm8 bytes, and store the low 8 bytes in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8 +#define _mm_alignr_pi8(a, b, imm) \ + _sse2neon_define2( \ + __m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) { \ + ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ + } else { \ + uint8x8_t tmp_low; \ + uint8x8_t tmp_high; \ + if ((imm) >= 8) { \ + const int idx = (imm) -8; \ + tmp_low = vreinterpret_u8_m64(_a); \ + tmp_high = vdup_n_u8(0); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } else { \ + const int idx = (imm); \ + tmp_low = vreinterpret_u8_m64(_b); \ + tmp_high = vreinterpret_u8_m64(_a); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } \ + } _sse2neon_return(ret);) + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16 +FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); +#else + return vreinterpretq_m128i_s16( + vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)), + vpadd_s16(vget_low_s16(b), vget_high_s16(b)))); +#endif +} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32 +FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s32(vpaddq_s32(a, b)); +#else + return vreinterpretq_m128i_s32( + vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)), + vpadd_s32(vget_low_s32(b), vget_high_s32(b)))); +#endif +} + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16 +FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32 +FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) +{ + return vreinterpret_m64_s32( + vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))); +} + +// Horizontally add adjacent pairs of signed 16-bit integers in a and b using +// saturation, and pack the signed 16-bit results in dst. 
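+
+// [Editor's illustration, not part of the upstream sse2neon header]
+// _mm_alignr_epi8 concatenates a (high) with b (low) and extracts 16 bytes
+// starting at a byte offset:
+//
+//   __m128i a = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23,
+//                             24, 25, 26, 27, 28, 29, 30, 31);
+//   __m128i b = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
+//                             8, 9, 10, 11, 12, 13, 14, 15);
+//   __m128i r = _mm_alignr_epi8(a, b, 4);  // bytes {4, 5, ..., 18, 19}
+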
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16 +FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_s64_s16( + vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357)); +#endif +} + +// Horizontally add adjacent pairs of signed 16-bit integers in a and b using +// saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16 +FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +#else + int16x4x2_t res = vuzp_s16(a, b); + return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack +// the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16 +FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16( + vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int16x8x2_t c = vuzpq_s16(a, b); + return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack +// the signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32 +FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s32( + vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b))); +#else + int32x4x2_t c = vuzpq_s32(a, b); + return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack +// the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16 +FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +#else + int16x4x2_t c = vuzp_s16(a, b); + return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack +// the signed 32-bit results in dst. 
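+
+// [Editor's illustration, not part of the upstream sse2neon header] The
+// saturating horizontal adds above clamp instead of wrapping; compare:
+//
+//   __m128i v = _mm_set1_epi16(0x7FFF);
+//   __m128i wrap = _mm_hadd_epi16(v, v);  // 0x7FFF + 0x7FFF wraps to -2
+//   __m128i sat = _mm_hadds_epi16(v, v);  // saturates to 0x7FFF
+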
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32 +FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b) +{ + int32x2_t a = vreinterpret_s32_m64(_a); + int32x2_t b = vreinterpret_s32_m64(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b))); +#else + int32x2x2_t c = vuzp_s32(a, b); + return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16 +FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16( + vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int16x8x2_t c = vuzpq_s16(a, b); + return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16 +FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +#else + int16x4x2_t c = vuzp_s16(a, b); + return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1])); +#endif +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, +// and pack the saturated results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16 +FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + uint8x16_t a = vreinterpretq_u8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vmovl_s8(vget_low_s8(b))); + int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), + vmovl_s8(vget_high_s8(b))); + return vreinterpretq_m128i_s16( + vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); +#else + // This would be much simpler if x86 would choose to zero extend OR sign + // extend, not both. This could probably be optimized better. + uint16x8_t a = vreinterpretq_u16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // Zero extend a + int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8)); + int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00))); + + // Sign extend by shifting left then shifting right. + int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8); + int16x8_t b_odd = vshrq_n_s16(b, 8); + + // multiply + int16x8_t prod1 = vmulq_s16(a_even, b_even); + int16x8_t prod2 = vmulq_s16(a_odd, b_odd); + + // saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2)); +#endif +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. 
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and +// pack the saturated results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16 +FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b) +{ + uint16x4_t a = vreinterpret_u16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + + // Zero extend a + int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8)); + int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff))); + + // Sign extend by shifting left then shifting right. + int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8); + int16x4_t b_odd = vshr_n_s16(b, 8); + + // multiply + int16x4_t prod1 = vmul_s16(a_even, b_even); + int16x4_t prod2 = vmul_s16(a_odd, b_odd); + + // saturated add + return vreinterpret_m64_s16(vqadd_s16(prod1, prod2)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Shift right by 15 bits while rounding up, and store +// the packed 16-bit integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16 +FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) +{ + // Has issues due to saturation + // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); + + // Multiply + int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + // Rounding narrowing shift right + // narrow = (int16_t)((mul + 16384) >> 15); + int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); + int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); + + // Join together + return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Truncate each intermediate integer to the 18 most +// significant bits, round by adding 1, and store bits [16:1] to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16 +FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b) +{ + int32x4_t mul_extend = + vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b))); + + // Rounding narrowing shift right + return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15)); +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8 +FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) +{ + int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a + uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b + uint8x16_t idx_masked = + vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); +#elif defined(__GNUC__) + int8x16_t ret; + // %e and %f represent the even and odd D registers + // respectively. 
+    __asm__ __volatile__(
+        "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
+        "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
+        : [ret] "=&w"(ret)
+        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
+    return vreinterpretq_m128i_s8(ret);
+#else
+    // Fallback when GNU-style inline assembly is unavailable: split the
+    // table into two 64-bit halves and combine two vtbl2 lookups.
+    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
+                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
+#endif
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
+FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
+{
+    const int8x8_t controlMask =
+        vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
+    int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
+    return vreinterpret_m64_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed
+// 16-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
+FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__) || defined(_M_ARM64)
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
+#else
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s16(a) equals negative
+    // 'a') based on ltMask
+    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x8_t res = vbicq_s16(masked, zeroMask);
+    return vreinterpretq_m128i_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed
+// 32-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
+FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__) || defined(_M_ARM64)
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
+#else
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s32(a) equals negative
+    // 'a') based on ltMask
+    int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x4_t res = vbicq_s32(masked, zeroMask);
+    return vreinterpretq_m128i_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed
+// 8-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
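+
+// [Editor's illustration, not part of the upstream sse2neon header]
+// _mm_shuffle_epi8 (implemented above) is a byte-wise table lookup; indices
+// with bit 7 set zero the lane, which the 0x8F masking preserves. Reversing
+// the byte order of a vector v:
+//
+//   const __m128i rev = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
+//                                     7, 6, 5, 4, 3, 2, 1, 0);
+//   __m128i reversed = _mm_shuffle_epi8(v, rev);
+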
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
+FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
+{
+    int8x16_t a = vreinterpretq_s8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__) || defined(_M_ARM64)
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
+#else
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative
+    // 'a') based on ltMask
+    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x16_t res = vbicq_s8(masked, zeroMask);
+
+    return vreinterpretq_m128i_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative, and store the results in dst. Elements in dst are
+// zeroed out when the corresponding element in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
+FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
+
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__) || defined(_M_ARM64)
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
+#else
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s16(a) returns negative
+    // 'a') based on ltMask
+    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x4_t res = vbic_s16(masked, zeroMask);
+
+    return vreinterpret_m64_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed 32-bit
+// integer in b is negative, and store the results in dst. Elements in dst are
+// zeroed out when the corresponding element in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
+FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
+{
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__) || defined(_M_ARM64)
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
+#else
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s32(a) returns negative
+    // 'a') based on ltMask
+    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x2_t res = vbic_s32(masked, zeroMask);
+
+    return vreinterpret_m64_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
+// in b is negative, and store the results in dst. Elements in dst are zeroed
+// out when the corresponding element in b is zero.
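+
+// [Editor's illustration, not part of the upstream sse2neon header] The sign
+// family copies, negates, or zeroes each lane of a according to the sign of
+// the matching lane of b:
+//
+//   __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+//   __m128i b = _mm_setr_epi16(-1, 0, 2, -3, 0, 1, -1, 5);
+//   __m128i r = _mm_sign_epi16(a, b);  // {-1, 0, 3, -4, 0, 6, -7, 8}
+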
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
+FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
+{
+    int8x8_t a = vreinterpret_s8_m64(_a);
+    int8x8_t b = vreinterpret_s8_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__) || defined(_M_ARM64)
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
+#else
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s8(a) returns negative
+    // 'a') based on ltMask
+    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x8_t res = vbic_s8(masked, zeroMask);
+
+    return vreinterpret_m64_s8(res);
+}
+
+/* SSE4.1 */
+
+// Blend packed 16-bit integers from a and b using control mask imm8, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
+// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
+//                                      __constrange(0,255) int imm)
+#define _mm_blend_epi16(a, b, imm)                                     \
+    _sse2neon_define2(                                                 \
+        __m128i, a, b,                                                 \
+        const uint16_t _mask[8] =                                      \
+            _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0,   \
+                           ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0,   \
+                           ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0,   \
+                           ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0,   \
+                           ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0,   \
+                           ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0,   \
+                           ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0,   \
+                           ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0);  \
+        uint16x8_t _mask_vec = vld1q_u16(_mask);                       \
+        uint16x8_t __a = vreinterpretq_u16_m128i(_a);                  \
+        uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
+            vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));)
+
+// Blend packed double-precision (64-bit) floating-point elements from a and b
+// using control mask imm8, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
+#define _mm_blend_pd(a, b, imm)                                              \
+    _sse2neon_define2(                                                       \
+        __m128d, a, b,                                                       \
+        const uint64_t _mask[2] =                                            \
+            _sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0),  \
+                           ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \
+        uint64x2_t _mask_vec = vld1q_u64(_mask);                             \
+        uint64x2_t __a = vreinterpretq_u64_m128d(_a);                        \
+        uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return(      \
+            vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));)
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
+FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
+{
+    const uint32_t ALIGN_STRUCT(16)
+        data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
+    uint32x4_t mask = vld1q_u32(data);
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+}
+
+// Blend packed 8-bit integers from a and b using mask, and store the results in
+// dst.
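+
+// [Editor's illustration, not part of the upstream sse2neon header] In the
+// immediate blends above, bit i of imm8 selects lane i from b when set, and
+// from a when clear:
+//
+//   __m128i a = _mm_set1_epi16(0);
+//   __m128i b = _mm_set1_epi16(-1);
+//   __m128i r = _mm_blend_epi16(a, b, 0x0F);  // low 4 lanes -1, high 4 lanes 0
+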
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8 +FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint8x16_t mask = + vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); + uint8x16_t a = vreinterpretq_u8_m128i(_a); + uint8x16_t b = vreinterpretq_u8_m128i(_b); + return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); +} + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd +FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) +{ + uint64x2_t mask = + vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); +#if defined(__aarch64__) || defined(_M_ARM64) + float64x2_t a = vreinterpretq_f64_m128d(_a); + float64x2_t b = vreinterpretq_f64_m128d(_b); + return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); +#else + uint64x2_t a = vreinterpretq_u64_m128d(_a); + uint64x2_t b = vreinterpretq_u64_m128d(_b); + return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a)); +#endif +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps +FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint32x4_t mask = + vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Round the packed double-precision (64-bit) floating-point elements in a up +// to an integer value, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd +FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); +#else + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_pd(ceil(a1), ceil(a0)); +#endif +} + +// Round the packed single-precision (32-bit) floating-point elements in a up to +// an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps +FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); +#else + float *f = (float *) &a; + return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0])); +#endif +} + +// Round the lower double-precision (64-bit) floating-point element in b up to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. 
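+
+// [Editor's illustration, not part of the upstream sse2neon header] The
+// variable blends above key off each mask lane's sign bit, so an all-ones
+// comparison result works directly as the selector; e.g. a branch-free
+// per-lane absolute value of x (the _mm_andnot_ps sign-mask idiom is more
+// common, but this shows the selection):
+//
+//   __m128 neg = _mm_cmplt_ps(x, _mm_setzero_ps());  // x < 0 ? ~0 : 0
+//   __m128 ax = _mm_blendv_ps(x, _mm_sub_ps(_mm_setzero_ps(), x), neg);
+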
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
+FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_ceil_pd(b));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b up to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
+FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_ceil_ps(b));
+}
+
+// Compare packed 64-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64
+FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128i_u64(
+        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
+#else
+    // ARMv7 lacks vceqq_u64
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
+FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
+}
+
+// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
+FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
+{
+    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
+FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
+}
+
+// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
+FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_s16(s16x8);
+}
+
+// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32 +FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_s32(s32x4); +} + +// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit +// integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64 +FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32 +FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) +{ + return vreinterpretq_m128i_u32( + vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); +} + +// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64 +FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) +{ + uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64 +FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) +{ + return vreinterpretq_m128i_u64( + vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); +} + +// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16 +FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */ + return vreinterpretq_m128i_u16(u16x8); +} + +// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32 +FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_u32(u32x4); +} + +// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed +// 64-bit integers, and store the results in dst. 
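+
+// [Editor's illustration, not part of the upstream sse2neon header] The
+// cvtep* family widens from the low lanes only; signedness decides how the
+// upper bits are filled:
+//
+//   __m128i v = _mm_set1_epi8(-1);
+//   __m128i s = _mm_cvtepi8_epi32(v);  // sign-extends: four lanes of -1
+//   __m128i u = _mm_cvtepu8_epi32(v);  // zero-extends: four lanes of 255
+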
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
+FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Conditionally multiply the packed double-precision (64-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the two products, and
+// conditionally store the sum in dst using the low 4 bits of imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
+FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
+{
+    // Generate mask value from constant immediate bit value
+    const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
+    const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
+#if !SSE2NEON_PRECISE_DP
+    const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
+    const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
+#endif
+    // Conditional multiplication
+#if !SSE2NEON_PRECISE_DP
+    __m128d mul = _mm_mul_pd(a, b);
+    const __m128d mulMask =
+        _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
+    __m128d tmp = _mm_and_pd(mul, mulMask);
+#else
+#if defined(__aarch64__) || defined(_M_ARM64)
+    double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
+                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
+                             : 0;
+    double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
+                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
+                             : 0;
+#else
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    double d0 = (imm & 0x10) ? a0 * b0 : 0;
+    double d1 = (imm & 0x20) ? a1 * b1 : 0;
+#endif
+    __m128d tmp = _mm_set_pd(d1, d0);
+#endif
+    // Sum the products
+#if defined(__aarch64__) || defined(_M_ARM64)
+    double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
+#else
+    double _tmp0 = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0));
+    double _tmp1 = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1));
+    double sum = _tmp0 + _tmp1;
+#endif
+    // Conditionally store the sum
+    const __m128d sumMask =
+        _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
+    __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
+    return res;
+}
+
+// Conditionally multiply the packed single-precision (32-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the four products,
+// and conditionally store the sum in dst using the low 4 bits of imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
+FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
+{
+    float32x4_t elementwise_prod = _mm_mul_ps(a, b);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+    /* shortcuts */
+    if (imm == 0xFF) {
+        return _mm_set1_ps(vaddvq_f32(elementwise_prod));
+    }
+
+    if ((imm & 0x0F) == 0x0F) {
+        if (!(imm & (1 << 4)))
+            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0);
+        if (!(imm & (1 << 5)))
+            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1);
+        if (!(imm & (1 << 6)))
+            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2);
+        if (!(imm & (1 << 7)))
+            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3);
+
+        return _mm_set1_ps(vaddvq_f32(elementwise_prod));
+    }
+#endif
+
+    float s = 0.0f;
+
+    if (imm & (1 << 4))
+        s += vgetq_lane_f32(elementwise_prod, 0);
+    if (imm & (1 << 5))
+        s += vgetq_lane_f32(elementwise_prod, 1);
+    if (imm & (1 << 6))
+        s += vgetq_lane_f32(elementwise_prod, 2);
+    if (imm & (1 << 7))
+        s += vgetq_lane_f32(elementwise_prod, 3);
+
+    const float32_t res[4] = {
+        (imm & 0x1) ? s : 0.0f,
+        (imm & 0x2) ? s : 0.0f,
+        (imm & 0x4) ? s : 0.0f,
+        (imm & 0x8) ? s : 0.0f,
+    };
+    return vreinterpretq_m128_f32(vld1q_f32(res));
+}
+
+// Extract a 32-bit integer from a, selected with imm8, and store the result in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
+// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
+#define _mm_extract_epi32(a, imm) \
+    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
+
+// Extract a 64-bit integer from a, selected with imm8, and store the result in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
+// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
+#define _mm_extract_epi64(a, imm) \
+    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
+
+// Extract an 8-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
+// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
+#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
+
+// Extract the selected single-precision (32-bit) floating-point element from
+// a, returning its bit pattern as a 32-bit integer.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_ps
+// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
+#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
+
+// Round the packed double-precision (64-bit) floating-point elements in a down
+// to an integer value, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
+FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
+#else
+    double a0, a1;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    return _mm_set_pd(floor(a1), floor(a0));
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a down
+// to an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
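+
+// [Editor's illustration, not part of the upstream sse2neon header] In
+// _mm_dp_ps (implemented above), the high nibble of imm8 selects which
+// products enter the sum and the low nibble selects which dst lanes
+// receive it:
+//
+//   __m128 d = _mm_dp_ps(a, b, 0xFF);  // 4-lane dot product in every lane
+//   __m128 e = _mm_dp_ps(a, b, 0x71);  // 3-lane dot product in lane 0 only
+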
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps +FORCE_INLINE __m128 _mm_floor_ps(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); +#else + float *f = (float *) &a; + return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0])); +#endif +} + +// Round the lower double-precision (64-bit) floating-point element in b down to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd +FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_floor_pd(b)); +} + +// Round the lower single-precision (32-bit) floating-point element in b down to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss +FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_floor_ps(b)); +} + +// Copy a to dst, and insert the 32-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32 +// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, +// __constrange(0,4) int imm) +#define _mm_insert_epi32(a, b, imm) \ + vreinterpretq_m128i_s32( \ + vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))) + +// Copy a to dst, and insert the 64-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64 +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, +// __constrange(0,2) int imm) +#define _mm_insert_epi64(a, b, imm) \ + vreinterpretq_m128i_s64( \ + vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))) + +// Copy a to dst, and insert the lower 8-bit integer from i into dst at the +// location specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8 +// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, +// __constrange(0,16) int imm) +#define _mm_insert_epi8(a, b, imm) \ + vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))) + +// Copy a to tmp, then insert a single-precision (32-bit) floating-point +// element from b into tmp using the control in imm8. Store tmp to dst using +// the mask in imm8 (elements are zeroed out when the corresponding bit is set). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps +#define _mm_insert_ps(a, b, imm8) \ + _sse2neon_define2( \ + __m128, a, b, \ + float32x4_t tmp1 = \ + vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3), \ + vreinterpretq_f32_m128(_a), 0); \ + float32x4_t tmp2 = \ + vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \ + vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \ + const uint32_t data[4] = \ + _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 3)) ? 
UINT32_MAX : 0);                           \
+        uint32x4_t mask = vld1q_u32(data);                                 \
+        float32x4_t all_zeros = vdupq_n_f32(0);                            \
+                                                                           \
+        _sse2neon_return(vreinterpretq_m128_f32(                           \
+            vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));)
+
+// Compare packed signed 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
+FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
+FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Compare packed signed 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
+FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
+FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32
+FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
+// in a, store the minimum and index in dst, and zero the remaining bits in dst.
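+
+// [Editor's illustration, not part of the upstream sse2neon header]
+// _mm_minpos_epu16 returns the minimum in lane 0 and its (lowest) index in
+// lane 1, with the remaining lanes zeroed:
+//
+//   __m128i v = _mm_setr_epi16(9, 4, 7, 4, 20, 11, 13, 8);
+//   __m128i r = _mm_minpos_epu16(v);  // lane 0 = 4, lane 1 = 1, rest zero
+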
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
+FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
+{
+    __m128i dst;
+    uint16_t min, idx = 0;
+#if defined(__aarch64__) || defined(_M_ARM64)
+    // Find the minimum value
+    min = vminvq_u16(vreinterpretq_u16_m128i(a));
+
+    // Get the index of the minimum value
+    static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint16x8_t minv = vdupq_n_u16(min);
+    uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
+    idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
+#else
+    // Find the minimum value
+    __m64 tmp;
+    tmp = vreinterpret_m64_u16(
+        vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
+                 vget_high_u16(vreinterpretq_u16_m128i(a))));
+    tmp = vreinterpret_m64_u16(
+        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+    tmp = vreinterpret_m64_u16(
+        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
+    // Get the index of the minimum value
+    int i;
+    for (i = 0; i < 8; i++) {
+        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
+            idx = (uint16_t) i;
+            break;
+        }
+        a = _mm_srli_si128(a, 2);
+    }
+#endif
+    // Generate result
+    dst = _mm_setzero_si128();
+    dst = vreinterpretq_m128i_u16(
+        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
+    dst = vreinterpretq_m128i_u16(
+        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
+    return dst;
+}
+
+// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
+// 8-bit integers in a compared to those in b, and store the 16-bit results in
+// dst. Eight SADs are performed using one quadruplet from b and eight
+// quadruplets from a. One quadruplet is selected from b starting at the offset
+// specified in imm8. Eight quadruplets are formed from sequential 8-bit
+// integers selected from a starting at the offset specified in imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8 +FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) +{ + uint8x16_t _a, _b; + + switch (imm & 0x4) { + case 0: + // do nothing + _a = vreinterpretq_u8_m128i(a); + break; + case 4: + _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a), + vreinterpretq_u32_m128i(a), 1)); + break; + default: +#if defined(__GNUC__) || defined(__clang__) + __builtin_unreachable(); +#elif defined(_MSC_VER) + __assume(0); +#endif + break; + } + + switch (imm & 0x3) { + case 0: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0))); + break; + case 1: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1))); + break; + case 2: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2))); + break; + case 3: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3))); + break; + default: +#if defined(__GNUC__) || defined(__clang__) + __builtin_unreachable(); +#elif defined(_MSC_VER) + __assume(0); +#endif + break; + } + + int16x8_t c04, c15, c26, c37; + uint8x8_t low_b = vget_low_u8(_b); + c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b)); + uint8x16_t _a_1 = vextq_u8(_a, _a, 1); + c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b)); + uint8x16_t _a_2 = vextq_u8(_a, _a, 2); + c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b)); + uint8x16_t _a_3 = vextq_u8(_a, _a, 3); + c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b)); +#if defined(__aarch64__) || defined(_M_ARM64) + // |0|4|2|6| + c04 = vpaddq_s16(c04, c26); + // |1|5|3|7| + c15 = vpaddq_s16(c15, c37); + + int32x4_t trn1_c = + vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); + int32x4_t trn2_c = + vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); + return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c), + vreinterpretq_s16_s32(trn2_c))); +#else + int16x4_t c01, c23, c45, c67; + c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15)); + c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37)); + c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15)); + c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37)); + + return vreinterpretq_m128i_s16( + vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67))); +#endif +} + +// Multiply the low signed 32-bit integers from each packed 64-bit element in +// a and b, and store the signed 64-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32 +FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) +{ + // vmull_s32 upcasts instead of masking, so we downcast. + int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); + int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); +} + +// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit +// integers, and store the low 32 bits of the intermediate integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32 +FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Convert packed signed 32-bit integers from a and b to packed 16-bit integers +// using unsigned saturation, and store the results in dst. 
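+// Illustrative example (not part of the original header):
+//   _mm_packus_epi32(_mm_setr_epi32(-5, 0, 70000, 65535),
+//                    _mm_setr_epi32(1, 2, 3, 4))
+// saturates each signed 32-bit value into [0, 65535], yielding the u16
+// elements {0, 0, 65535, 65535, 1, 2, 3, 4}.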
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32 +FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), + vqmovun_s32(vreinterpretq_s32_m128i(b)))); +} + +// Round the packed double-precision (64-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd +FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return _mm_floor_pd(a); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return _mm_ceil_pd(a); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a))); + } +#else + double *v_double = (double *) &a; + + if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { + double res[2], tmp; + for (int i = 0; i < 2; i++) { + tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i]; + double roundDown = floor(tmp); // Round down value + double roundUp = ceil(tmp); // Round up value + double diffDown = tmp - roundDown; + double diffUp = roundUp - tmp; + if (diffDown < diffUp) { + /* If it's closer to the round down value, then use it */ + res[i] = roundDown; + } else if (diffDown > diffUp) { + /* If it's closer to the round up value, then use it */ + res[i] = roundUp; + } else { + /* If it's equidistant between round up and round down value, + * pick the one which is an even number */ + double half = roundDown / 2; + if (half != floor(half)) { + /* If the round down value is odd, return the round up value + */ + res[i] = roundUp; + } else { + /* If the round up value is odd, return the round down value + */ + res[i] = roundDown; + } + } + res[i] = (v_double[i] < 0) ? -res[i] : res[i]; + } + return _mm_set_pd(res[1], res[0]); + } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { + return _mm_floor_pd(a); + } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { + return _mm_ceil_pd(a); + } + return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]), + v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0])); +#endif +} + +// Round the packed single-precision (32-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed single-precision +// floating-point elements in dst. 
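+// Illustrative example (not part of the original header): with
+// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ties round to even, so
+//   _mm_round_ps(_mm_setr_ps(2.5f, -3.5f, 1.3f, -1.3f), ...)
+// yields {2.0f, -4.0f, 1.0f, -1.0f}; _mm_round_pd above implements the same
+// convention for doubles.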
+// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps +FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return _mm_floor_ps(a); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return _mm_ceil_ps(a); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); + } +#else + float *v_float = (float *) &a; + + if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = vcvtq_s32_f32( + vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = + vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal))); + } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { + return _mm_floor_ps(a); + } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { + return _mm_ceil_ps(a); + } + return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]), + v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]), + v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]), + v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0])); +#endif +} + +// Round the lower double-precision (64-bit) floating-point element in b using +// the rounding parameter, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd +FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding) +{ + return _mm_move_sd(a, _mm_round_pd(b, rounding)); +} + +// Round the lower single-precision (32-bit) floating-point element in b using +// the rounding parameter, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. 
Rounding is done according to the +// rounding[3:0] parameter, which can be one of: +// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and +// suppress exceptions +// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and +// suppress exceptions +// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress +// exceptions +// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress +// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see +// _MM_SET_ROUNDING_MODE +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss +FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding) +{ + return _mm_move_ss(a, _mm_round_ps(b, rounding)); +} + +// Load 128-bits of integer data from memory into dst using a non-temporal +// memory hint. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128 +FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) +{ +#if __has_builtin(__builtin_nontemporal_store) + return __builtin_nontemporal_load(p); +#else + return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); +#endif +} + +// Compute the bitwise NOT of a and then AND with a 128-bit vector containing +// all 1's, and return 1 if the result is zero, otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones +FORCE_INLINE int _mm_test_all_ones(__m128i a) +{ + return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == + ~(uint64_t) 0; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and return 1 if the result is zero, otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros +FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) +{ + int64x2_t a_and_mask = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); + return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute +// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is +// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero +// Note: Argument names may be wrong in the Intel intrinsics guide. +FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) +{ + uint64x2_t v = vreinterpretq_u64_m128i(a); + uint64x2_t m = vreinterpretq_u64_m128i(mask); + + // find ones (set-bits) and zeros (clear-bits) under clip mask + uint64x2_t ones = vandq_u64(m, v); + uint64x2_t zeros = vbicq_u64(m, v); + + // If both 128-bit variables are populated (non-zero) then return 1. + // For comparison purposes, first compact each var down to 32-bits. + uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros)); + + // if folding minimum is non-zero then both vars must be non-zero + return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. 
Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the CF value. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128 +FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128 +#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b) + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the ZF value. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128 +FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +/* SSE4.2 */ + +static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, +}; +static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, +}; + +/* specify the source data format */ +#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */ +#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */ +#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */ +#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */ + +/* specify the comparison operation */ +#define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */ +#define _SIDD_CMP_RANGES 0x04 /* compare ranges */ +#define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */ +#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */ + +/* specify the polarity */ +#define _SIDD_POSITIVE_POLARITY 0x00 +#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 +#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */ +#define _SIDD_MASKED_NEGATIVE_POLARITY \ + 0x30 /* negate results only before end of string */ + +/* specify the output selection in _mm_cmpXstri */ +#define _SIDD_LEAST_SIGNIFICANT 0x00 +#define _SIDD_MOST_SIGNIFICANT 0x40 + +/* specify the output selection in _mm_cmpXstrm */ +#define _SIDD_BIT_MASK 0x00 +#define _SIDD_UNIT_MASK 0x40 + +/* Pattern Matching for C macros. + * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms + */ + +/* catenate */ +#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__ +#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b) + +#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c) +/* run the 2nd parameter */ +#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__ +/* run the 1st parameter */ +#define SSE2NEON_IIF_1(t, ...) 
t + +#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b) +#define SSE2NEON_COMPL_0 1 +#define SSE2NEON_COMPL_1 0 + +#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x) +#define SSE2NEON_DEC_1 0 +#define SSE2NEON_DEC_2 1 +#define SSE2NEON_DEC_3 2 +#define SSE2NEON_DEC_4 3 +#define SSE2NEON_DEC_5 4 +#define SSE2NEON_DEC_6 5 +#define SSE2NEON_DEC_7 6 +#define SSE2NEON_DEC_8 7 +#define SSE2NEON_DEC_9 8 +#define SSE2NEON_DEC_10 9 +#define SSE2NEON_DEC_11 10 +#define SSE2NEON_DEC_12 11 +#define SSE2NEON_DEC_13 12 +#define SSE2NEON_DEC_14 13 +#define SSE2NEON_DEC_15 14 +#define SSE2NEON_DEC_16 15 + +/* detection */ +#define SSE2NEON_CHECK_N(x, n, ...) n +#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, ) +#define SSE2NEON_PROBE(x) x, 1, + +#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x)) +#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~) + +#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x)) +#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c)) + +#define SSE2NEON_EAT(...) +#define SSE2NEON_EXPAND(...) __VA_ARGS__ +#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT) + +/* recursion */ +/* deferred expression */ +#define SSE2NEON_EMPTY() +#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY() +#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)() +#define SSE2NEON_EXPAND(...) __VA_ARGS__ + +#define SSE2NEON_EVAL(...) \ + SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__))) +#define SSE2NEON_EVAL1(...) \ + SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__))) +#define SSE2NEON_EVAL2(...) \ + SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__))) +#define SSE2NEON_EVAL3(...) __VA_ARGS__ + +#define SSE2NEON_REPEAT(count, macro, ...) \ + SSE2NEON_WHEN(count) \ + (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \ + SSE2NEON_DEC(count), macro, \ + __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \ + __VA_ARGS__)) +#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT + +#define SSE2NEON_SIZE_OF_byte 8 +#define SSE2NEON_NUMBER_OF_LANES_byte 16 +#define SSE2NEON_SIZE_OF_word 16 +#define SSE2NEON_NUMBER_OF_LANES_word 8 + +#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \ + mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \ + vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \ + vreinterpretq_##type##_m128i(a))); + +#define SSE2NEON_FILL_LANE(i, type) \ + vec_b[i] = \ + vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)); + +#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \ + number_of_lanes, byte_or_word) \ + do { \ + SSE2NEON_CAT( \ + data_type_prefix, \ + SSE2NEON_CAT(size, \ + SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \ + vec_b[number_of_lanes]; \ + __m128i mask = SSE2NEON_IIF(byte_or_word)( \ + vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \ + vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \ + SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \ + SSE2NEON_CAT(type_prefix, size))) \ + for (int i = 0; i < number_of_lanes; i++) { \ + mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \ + size)(SSE2NEON_CAT(vbslq_u, size)( \ + SSE2NEON_CAT(vreinterpretq_u, \ + SSE2NEON_CAT(size, _m128i))(mask), \ + SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \ + vec_b[i], \ + SSE2NEON_CAT( \ + vreinterpretq_, \ + SSE2NEON_CAT(type_prefix, \ + SSE2NEON_CAT(size, _m128i(a))))), \ + SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \ + vec_b[i], \ + SSE2NEON_CAT( \ + vreinterpretq_, \ + 
SSE2NEON_CAT(type_prefix, \ + SSE2NEON_CAT(size, _m128i(a))))))); \ + } \ + } while (0) + +#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \ + do { \ + SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \ + SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \ + SSE2NEON_CAT(u, size))) \ + } while (0) + +#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \ + static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \ + int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_equal_any_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ + } + +#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \ + static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \ + int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_RANGES( \ + a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_ranges_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ + } + +#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \ + static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \ + __m128i b, int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_equal_ordered_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \ + } + +static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); + uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); + uint8x16_t vec = vcombine_u8(t_lo, t_hi); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u8( + vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u8( + vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); + int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0; + res |= (tmp << j); + } + return res; +} + +static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint16x8_t vec = + vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u16( + vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u16( + vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); + int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 
1 : 0;
+        res |= (tmp << j);
+    }
+    return res;
+}
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
+    prefix##IMPL(byte) \
+    prefix##IMPL(word)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
+
+static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint16x8_t vec =
+        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u16(
+            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u16(
+            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
+        __m128i tmp = vreinterpretq_m128i_u32(
+            vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
+        uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
+                                       vreinterpretq_u32_m128i(tmp));
+#if defined(__aarch64__) || defined(_M_ARM64)
+        int t = vaddvq_u32(vec_res) ? 1 : 0;
+#else
+        uint64x2_t sumh = vpaddlq_u32(vec_res);
+        // Normalize to 0/1 like the aarch64 path above: more than one lane of
+        // vec_res may be set, and a raw lane sum would shift into the wrong
+        // bit of res.
+        int t = (vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1)) ? 1 : 0;
+#endif
+        res |= (t << j);
+    }
+    return res;
+}
+
+static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
+    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u8(
+            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u8(
+            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
+        __m128i tmp = vreinterpretq_m128i_u16(
+            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
+        uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
+                                       vreinterpretq_u16_m128i(tmp));
+        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
+        res |= (t << j);
+    }
+    return res;
+}
+
+#define SSE2NEON_CMP_RANGES_IS_BYTE 1
+#define SSE2NEON_CMP_RANGES_IS_WORD 0
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \
+    prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \
+    prefix##IMPL(byte, int, s, prefix##IS_BYTE) \
+    prefix##IMPL(word, uint, u, prefix##IS_WORD) \
+    prefix##IMPL(word, int, s, prefix##IS_WORD)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
+
+#undef SSE2NEON_CMP_RANGES_IS_BYTE
+#undef SSE2NEON_CMP_RANGES_IS_WORD
+
+static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
+{
+    uint8x16_t mtx =
+        vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
+    int m0 = (la < lb) ?
0 : ((1 << la) - (1 << lb)); + int m1 = 0x10000 - (1 << la); + int tb = 0x10000 - (1 << lb); + uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi; + uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi; + vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask); + vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask); + vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask); + vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask); + tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask); + tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask); + + res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx)); + res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx)); + res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo); + res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi); + res_lo = vand_u8(res_lo, vec_mask); + res_hi = vand_u8(res_hi, vec_mask); + + int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8); + return res; +} + +static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb) +{ + uint16x8_t mtx = + vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); + int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb)); + int m1 = 0x100 - (1 << la); + int tb = 0x100 - (1 << lb); + uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b); + uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask); + uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask); + uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask); + mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx); + mtx = vbslq_u16(vec1, tmp, mtx); + mtx = vandq_u16(mtx, vec_mask); + return _sse2neon_vaddvq_u16(mtx); +} + +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1 +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0 + +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \ + static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \ + int bound, int la, int lb, __m128i mtx[16]) \ + { \ + int res = 0; \ + int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \ + uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \ + vld1_u##size(_sse2neon_cmpestr_mask##size##b), \ + vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \ + uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \ + vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \ + vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \ + vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \ + uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \ + uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \ + for (int j = 0; j < lb; j++) { \ + mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \ + vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \ + } \ + for (int j = lb; j < bound; j++) { \ + mtx[j] = vreinterpretq_m128i_u##size( \ + vbslq_u##size(vec1, vec_minusone, vec_zero)); \ + } \ + unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \ + (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \ + for (int i = 0; i < bound; i++) { \ + int val = 1; \ + for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \ + val &= ptr[k * bound + j]; \ + res += val << i; \ + } \ + return res; \ + } + +/* clang-format off */ +#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \ + prefix##IMPL(8, 16, prefix##IS_UBYTE) \ + prefix##IMPL(16, 8, prefix##IS_UWORD) +/* clang-format on */ + +SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_) + +#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE +#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD + +/* clang-format off */ +#define 
SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
+    prefix##IMPL(byte) \
+    prefix##IMPL(word)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
+
+#define SSE2NEON_CMPESTR_LIST \
+    _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \
+    _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \
+    _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \
+    _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \
+    _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \
+    _(CMP_UWORD_RANGES, cmp_uword_ranges) \
+    _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \
+    _(CMP_SWORD_RANGES, cmp_sword_ranges) \
+    _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \
+    _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \
+    _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \
+    _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \
+    _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
+    _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
+    _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
+    _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
+
+enum {
+#define _(name, func_suffix) name,
+    SSE2NEON_CMPESTR_LIST
+#undef _
+};
+typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
+static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
+#define _(name, func_suffix) _sse2neon_##func_suffix,
+    SSE2NEON_CMPESTR_LIST
+#undef _
+};
+
+FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
+{
+    switch (imm8 & 0x30) {
+    case _SIDD_NEGATIVE_POLARITY:
+        res ^= 0xffffffff;
+        break;
+    case _SIDD_MASKED_NEGATIVE_POLARITY:
+        res ^= (1 << lb) - 1;
+        break;
+    default:
+        break;
+    }
+
+    return res & ((bound == 8) ? 0xFF : 0xFFFF);
+}
+
+FORCE_INLINE int _sse2neon_clz(unsigned int x)
+{
+#if defined(_MSC_VER) && !defined(__clang__)
+    unsigned long cnt = 0;
+    if (_BitScanReverse(&cnt, x))
+        return 31 - cnt;
+    return 32;
+#else
+    return x != 0 ? __builtin_clz(x) : 32;
+#endif
+}
+
+FORCE_INLINE int _sse2neon_ctz(unsigned int x)
+{
+#if defined(_MSC_VER) && !defined(__clang__)
+    unsigned long cnt = 0;
+    if (_BitScanForward(&cnt, x))
+        return cnt;
+    return 32;
+#else
+    return x != 0 ? __builtin_ctz(x) : 32;
+#endif
+}
+
+FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
+{
+#ifdef _MSC_VER
+    unsigned long cnt;
+#if defined(SSE2NEON_HAS_BITSCAN64)
+    if (_BitScanForward64(&cnt, x))
+        return (int) (cnt);
+#else
+    if (_BitScanForward(&cnt, (unsigned long) (x)))
+        return (int) cnt;
+    if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
+        return (int) (cnt + 32);
+#endif /* SSE2NEON_HAS_BITSCAN64 */
+    return 64;
+#else /* assume GNU compatible compilers */
+    return x != 0 ? __builtin_ctzll(x) : 64;
+#endif
+}
+
+#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
+
+#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
+    const int var = (imm & 0x01) ? 8 : 16
+
+#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
+    int tmp1 = la ^ (la >> 31); \
+    la = tmp1 - (la >> 31); \
+    int tmp2 = lb ^ (lb >> 31); \
+    lb = tmp2 - (lb >> 31); \
+    la = SSE2NEON_MIN(la, bound); \
+    lb = SSE2NEON_MIN(lb, bound)
+
+// Compare all pairs of characters in strings a and b,
+// then aggregate the result.
+// As the only difference between PCMPESTR* and PCMPISTR* is the way the
+// string lengths are calculated, we use SSE2NEON_CMP{E,I}STRX_LEN_PAIR to get
+// the lengths of strings a and b.
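+// For example (illustrative): imm8 = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH
+// has low nibble 0x08, which indexes CMP_UBYTE_EQUAL_EACH in
+// _sse2neon_cmpfunc_table and so dispatches to _sse2neon_cmp_byte_equal_each
+// (a strcmp-style comparison).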
+#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \ + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \ + SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \ + int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \ + r2 = _sse2neon_sido_negative(r2, lb, imm8, bound) + +#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \ + return (r2 == 0) ? bound \ + : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \ + : _sse2neon_ctz(r2)) + +#define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \ + __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + if (imm8 & 0x40) { \ + if (bound == 8) { \ + uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \ + vld1q_u16(_sse2neon_cmpestr_mask16b)); \ + dst = vreinterpretq_m128i_u16(vbslq_u16( \ + tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \ + } else { \ + uint8x16_t vec_r2 = \ + vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \ + uint8x16_t tmp = \ + vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \ + dst = vreinterpretq_m128i_u8( \ + vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \ + } \ + } else { \ + if (bound == 16) { \ + dst = vreinterpretq_m128i_u16( \ + vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \ + } else { \ + dst = vreinterpretq_m128i_u8( \ + vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \ + } \ + } \ + return dst + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and returns 1 if b did not contain a null character and the +// resulting mask was zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra +FORCE_INLINE int _mm_cmpestra(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + int lb_cpy = lb; + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return !r2 & (lb_cpy > bound); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc +FORCE_INLINE int _mm_cmpestrc(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return r2 != 0; +} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and store the generated index in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri +FORCE_INLINE int _mm_cmpestri(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8); +} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and store the generated mask in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm +FORCE_INLINE __m128i +_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + SSE2NEON_CMPSTR_GENERATE_MASK(dst); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns bit 0 of the resulting bit mask. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro +FORCE_INLINE int _mm_cmpestro(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return r2 & 1; +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if any character in a was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs +FORCE_INLINE int _mm_cmpestrs(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + (void) a; + (void) b; + (void) lb; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + return la <= (bound - 1); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if any character in b was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz +FORCE_INLINE int _mm_cmpestrz(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + (void) a; + (void) b; + (void) la; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + return lb <= (bound - 1); +} + +#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \ + do { \ + if (imm8 & 0x01) { \ + uint16x8_t equal_mask_##str = \ + vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \ + uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ + uint64_t matches_##str = \ + vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ + len = _sse2neon_ctzll(matches_##str) >> 3; \ + } else { \ + uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \ + vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \ + uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ + uint64_t matches_##str = \ + vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ + len = _sse2neon_ctzll(matches_##str) >> 2; \ + } \ + } while (0) + +#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \ + int la, lb; \ + do { \ + SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \ + SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \ + } while (0) + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if b did not contain a null character and the resulting +// mask was zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra +FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return !r2 & (lb >= bound); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc +FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return r2 != 0; +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and store the generated index in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri +FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and store the generated mask in dst. 
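+// Illustrative example (not part of the original header): a byte mask of
+// delimiter positions within a 16-byte chunk of text can be built with
+//   __m128i delims = _mm_setr_epi8(' ', ',', ';', 0, 0, 0, 0, 0,
+//                                  0, 0, 0, 0, 0, 0, 0, 0);
+//   __m128i m = _mm_cmpistrm(delims, chunk,
+//                            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
+//                                _SIDD_UNIT_MASK);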
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm +FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + SSE2NEON_CMPSTR_GENERATE_MASK(dst); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns bit 0 of the resulting bit mask. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro +FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return r2 & 1; +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if any character in a was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs +FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8) +{ + (void) b; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + int la; + SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); + return la <= (bound - 1); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if any character in b was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz +FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8) +{ + (void) a; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + int lb; + SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); + return lb <= (bound - 1); +} + +// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers +// in b for greater than. +FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_u64( + vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + return vreinterpretq_m128i_s64(vshrq_n_s64( + vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)), + 63)); +#endif +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 16-bit integer v, and stores the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16 +FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ + (defined(_M_ARM64) && !defined(__clang__)) + crc = __crc32ch(crc, v); +#else + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 32-bit integer v, and stores the result in dst. 
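+// Note (illustrative, not part of the original header): these intrinsics
+// implement CRC-32C (Castagnoli polynomial 0x1EDC6F41), not the zlib CRC-32.
+// A typical checksum loop is
+//   uint32_t crc = 0xFFFFFFFF;
+//   for (size_t i = 0; i < len; i++)
+//       crc = _mm_crc32_u8(crc, buf[i]);
+//   crc = ~crc;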
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
+FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
+    (defined(_M_ARM64) && !defined(__clang__))
+    crc = __crc32cw(crc, v);
+#else
+    crc = _mm_crc32_u16(crc, v & 0xffff);
+    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 64-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
+FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif (defined(_M_ARM64) && !defined(__clang__))
+    crc = __crc32cd((uint32_t) crc, v);
+#else
+    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
+    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 8-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
+    (defined(_M_ARM64) && !defined(__clang__))
+    crc = __crc32cb(crc, v);
+#else
+    crc ^= v;
+#if defined(__ARM_FEATURE_CRYPTO)
+    // Adapted from: https://mary.rs/lab/crc32/
+    // Barrett reduction
+    uint64x2_t orig =
+        vcombine_u64(vcreate_u64((uint64_t) (crc) << 24), vcreate_u64(0x0));
+    uint64x2_t tmp = orig;
+
+    // Polynomial P(x) of CRC32C
+    uint64_t p = 0x105EC76F1;
+    // Barrett Reduction (in bit-reflected form) constant mu_{64} = \lfloor
+    // 2^{64} / P(x) \rfloor = 0x11f91caf6
+    uint64_t mu = 0x1dea713f1;
+
+    // Multiply by mu_{64}
+    tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(mu));
+    // Divide by 2^{64} (mask away the unnecessary bits)
+    tmp =
+        vandq_u64(tmp, vcombine_u64(vcreate_u64(0xFFFFFFFF), vcreate_u64(0x0)));
+    // Multiply by P(x) (shifted left by 1 for alignment reasons)
+    tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(p));
+    // Subtract original from result
+    tmp = veorq_u64(tmp, orig);
+
+    // Extract the 'lower' (in bit-reflected sense) 32 bits
+    crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1);
+#else // Fall back to the generic table lookup approach
+    // Adapted from: https://create.stephan-brumme.com/crc32/
+    // Apply the half-byte algorithm for the best ratio between
+    // performance and lookup-table size.
+
+    // The lookup table just needs to store every 16th entry
+    // of the standard look-up table.
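+    // In other words, each byte is consumed in two 4-bit steps of the form
+    //   crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
+    // trading the usual 1 KiB byte-wise table for a 64-byte one at the cost
+    // of one extra lookup per byte.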
+ static const uint32_t crc32_half_byte_tbl[] = { + 0x00000000, 0x105ec76f, 0x20bd8ede, 0x30e349b1, 0x417b1dbc, 0x5125dad3, + 0x61c69362, 0x7198540d, 0x82f63b78, 0x92a8fc17, 0xa24bb5a6, 0xb21572c9, + 0xc38d26c4, 0xd3d3e1ab, 0xe330a81a, 0xf36e6f75, + }; + + crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F]; + crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F]; +#endif +#endif + return crc; +} + +/* AES */ + +#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__)) +/* clang-format off */ +#define SSE2NEON_AES_SBOX(w) \ + { \ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ + w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ + w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ + w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ + w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ + w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ + w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ + w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ + w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ + w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ + w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ + w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ + w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ + w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ + w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ + w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ + w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ + w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ + w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ + w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ + w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ + w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ + w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ + w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ + w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ + w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ + w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ + w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ + w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ + w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ + w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ + w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ + w(0xb0), w(0x54), w(0xbb), w(0x16) \ + } +#define SSE2NEON_AES_RSBOX(w) \ + { \ + w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \ + w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \ + w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \ + w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \ + w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \ + w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \ + w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \ + w(0x2e), w(0xa1), w(0x66), 
w(0x28), w(0xd9), w(0x24), w(0xb2), \
+            w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
+            w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
+            w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
+            w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
+            w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
+            w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
+            w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
+            w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
+            w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
+            w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
+            w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
+            w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
+            w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
+            w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
+            w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
+            w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
+            w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
+            w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
+            w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
+            w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
+            w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
+            w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
+            w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
+            w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
+            w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
+            w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
+            w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
+            w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
+            w(0x55), w(0x21), w(0x0c), w(0x7d) \
+    }
+/* clang-format on */
+
+/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
+#define SSE2NEON_AES_H0(x) (x)
+static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
+static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
+#undef SSE2NEON_AES_H0
+
+/* x_time function and matrix multiply function */
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
+#define SSE2NEON_MULTIPLY(x, y) \
+    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \
+     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \
+     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
+     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
+#endif
+
+// In the absence of crypto extensions, implement aesenc using regular NEON
+// intrinsics instead. See:
+// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/
+// for more information.
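+//
+// Illustrative use (not part of the original header): one full AES-128 block
+// encryption with pre-expanded round keys rk[0..10] looks like
+//   __m128i s = _mm_xor_si128(block, rk[0]);
+//   for (int r = 1; r < 10; r++)
+//       s = _mm_aesenc_si128(s, rk[r]);
+//   s = _mm_aesenclast_si128(s, rk[10]);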
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    static const uint8_t shift_rows[] = {
+        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
+    };
+    static const uint8_t ror32by8[] = {
+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    /* shift rows */
+    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+    /* sub bytes */
+    // Here, we split the whole 256-byte table into four 64-byte tables and
+    // look each of them up in turn. Indices that fall into a later table are
+    // larger than that table's base offset, so the index passed to
+    // `vqtbx4q_u8()` has the table's offset (0x40, 0x80, 0xc0) subtracted
+    // before the lookup.
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
+    // 'w-0x40' is equivalent to 'vsubq_u8(w, vdupq_n_u8(0x40))'
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
+
+    /* mix columns */
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+    /* add round key */
+    return vreinterpretq_m128i_u8(w) ^ RoundKey;
+
+#else /* ARMv7-A implementation for a table-based AES */
+#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
+    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
+     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
+// multiplying 'x' by 2 in GF(2^8)
+#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
+// multiplying 'x' by 3 in GF(2^8)
+#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
+#define SSE2NEON_AES_U0(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
+#define SSE2NEON_AES_U1(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
+#define SSE2NEON_AES_U2(p) \
+    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
+#define SSE2NEON_AES_U3(p) \
+    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
+
+    // This generates a table containing every possible permutation of
+    // shift_rows() and sub_bytes() with mix_columns().
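+    // (A classic "T-table" AES: aes_table[k][x] is MixColumns applied to
+    // sub_bytes(x) placed in byte position k, so each output word below
+    // reduces to four table lookups XORed together.)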
+ static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = { + SSE2NEON_AES_SBOX(SSE2NEON_AES_U0), + SSE2NEON_AES_SBOX(SSE2NEON_AES_U1), + SSE2NEON_AES_SBOX(SSE2NEON_AES_U2), + SSE2NEON_AES_SBOX(SSE2NEON_AES_U3), + }; +#undef SSE2NEON_AES_B2W +#undef SSE2NEON_AES_F2 +#undef SSE2NEON_AES_F3 +#undef SSE2NEON_AES_U0 +#undef SSE2NEON_AES_U1 +#undef SSE2NEON_AES_U2 +#undef SSE2NEON_AES_U3 + + uint32_t x0 = _mm_cvtsi128_si32(a); // get a[31:0] + uint32_t x1 = + _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); // get a[63:32] + uint32_t x2 = + _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA)); // get a[95:64] + uint32_t x3 = + _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); // get a[127:96] + + // finish the modulo addition step in mix_columns() + __m128i out = _mm_set_epi32( + (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^ + aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]), + (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^ + aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]), + (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^ + aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]), + (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^ + aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24])); + + return _mm_xor_si128(out, RoundKey); +#endif +} + +// Perform one round of an AES decryption flow on data (state) in a using the +// round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 +FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t inv_shift_rows[] = { + 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb, + 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3, + }; + static const uint8_t ror32by8[] = { + 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // inverse shift rows + w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); + + // inverse sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); + + // inverse mix columns + // multiplying 'v' by 4 in GF(2^8) + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); + w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b); + v ^= w; + v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w); + + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & + 0x1b); // multiplying 'v' by 2 in GF(2^8) + w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + + // add round key + return vreinterpretq_m128i_u8(w) ^ RoundKey; + +#else /* ARMv7-A NEON implementation */ + /* FIXME: optimized for NEON */ + uint8_t i, e, f, g, h, v[4][4]; + uint8_t *_a = (uint8_t *) &a; + for (i = 0; i < 16; ++i) { + v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]]; + } + + // inverse mix columns + for (i = 0; i < 4; ++i) { + e = v[i][0]; + f = v[i][1]; + g = v[i][2]; + h = v[i][3]; + + v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^ + SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09); + v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^ + SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d); + v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 
0x09) ^ + SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b); + v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^ + SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e); + } + + return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey; +#endif +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t shift_rows[] = { + 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, + 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // shift rows + w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + + // sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0); + + // add round key + return vreinterpretq_m128i_u8(v) ^ RoundKey; + +#else /* ARMv7-A implementation */ + uint8_t v[16] = { + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)], + }; + + return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey; +#endif +} + +// Perform the last round of an AES decryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 +FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t inv_shift_rows[] = { + 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb, + 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // inverse shift rows + w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); + + // inverse sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); + + // add round key + return vreinterpretq_m128i_u8(v) ^ RoundKey; + +#else /* ARMv7-A NEON implementation */ + /* FIXME: optimized for NEON */ + uint8_t v[4][4]; + uint8_t *_a = (uint8_t *) &a; + for (int i = 0; i < 16; ++i) { + v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]]; + } + + return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey; +#endif +} + +// Perform the InvMixColumns transformation on a and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 +FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) +{ +#if defined(__aarch64__) + static const uint8_t ror32by8[] = { + 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, + }; + uint8x16_t v = vreinterpretq_u8_m128i(a); + uint8x16_t w; + + // multiplying 'v' by 4 in GF(2^8) + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); + w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b); + v ^= w; + v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w); + + // multiplying 'v' by 2 in GF(2^8) + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); + w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + return vreinterpretq_m128i_u8(w); + +#else /* ARMv7-A NEON implementation */ + uint8_t i, e, f, g, h, v[4][4]; + vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a)); + for (i = 0; i < 4; ++i) { + e = v[i][0]; + f = v[i][1]; + g = v[i][2]; + h = v[i][3]; + + v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^ + SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09); + v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^ + SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d); + v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^ + SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b); + v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^ + SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e); + } + + return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)); +#endif +} + +// Assist in expanding the AES cipher key by computing steps towards generating +// a round key for encryption cipher using data from a and an 8-bit round +// constant specified in imm8, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 +// +// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. +// This instruction generates a round key for AES encryption. See +// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ +// for details. 
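+//
+// A minimal usage sketch (illustrative only, not part of this header): one
+// step of AES-128 key expansion is typically written as
+//   __m128i assist = _mm_aeskeygenassist_si128(prev_round_key, rcon);
+//   assist = _mm_shuffle_epi32(assist, 0xFF); // broadcast rot(sub(X3))^rcon
+// after which 'assist' is XOR-folded with shifted copies of prev_round_key
+// to form the next round key.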
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ +#if defined(__aarch64__) + uint8x16_t _a = vreinterpretq_u8_m128i(a); + uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0); + + uint32x4_t v_u32 = vreinterpretq_u32_u8(v); + uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24)); + uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon)); + + return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v)); + +#else /* ARMv7-A NEON implementation */ + uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); + uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); + for (int i = 0; i < 4; ++i) { + ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]]; + ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]]; + } + return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, + ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); +#endif +} +#undef SSE2NEON_AES_SBOX +#undef SSE2NEON_AES_RSBOX + +#if defined(__aarch64__) +#undef SSE2NEON_XT +#undef SSE2NEON_MULTIPLY +#endif + +#else /* __ARM_FEATURE_CRYPTO */ +// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and +// AESMC and then manually applying the real key as an xor operation. This +// unfortunately means an additional xor op; the compiler should be able to +// optimize this away for repeated calls however. See +// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a +// for more details. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8(veorq_u8( + vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + vreinterpretq_u8_m128i(b))); +} + +// Perform one round of an AES decryption flow on data (state) in a using the +// round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 +FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) +{ + return vreinterpretq_m128i_u8(veorq_u8( + vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + vreinterpretq_u8_m128i(RoundKey))); +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( + vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + RoundKey); +} + +// Perform the last round of an AES decryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 +FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) +{ + return vreinterpretq_m128i_u8( + veorq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)), + vreinterpretq_u8_m128i(RoundKey))); +} + +// Perform the InvMixColumns transformation on a and store the result in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
+{
+    return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
+}
+
+// Assist in expanding the AES cipher key by computing steps towards generating
+// a round key for encryption cipher using data from a and an 8-bit round
+// constant specified in imm8, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
+{
+    // AESE does ShiftRows and SubBytes on A
+    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
+
+#if !defined(_MSC_VER) || defined(__clang__)
+    uint8x16_t dest = {
+        // Undo ShiftRows step from AESE and extract X1 and X3
+        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
+        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
+        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
+        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
+    };
+    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
+    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
+#else
+    // We have to do this hack because MSVC is strictly adhering to the CPP
+    // standard, in particular C++03 8.5.1 sub-section 15, which states that
+    // unions must be initialized by their first member type.
+
+    // As per the Windows ARM64 ABI, it is always little endian, so this works
+    __n128 dest{
+        ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) |
+            ((uint64_t) u8.n128_u8[0xE] << 16) |
+            ((uint64_t) u8.n128_u8[0xB] << 24) |
+            ((uint64_t) u8.n128_u8[0x1] << 32) |
+            ((uint64_t) u8.n128_u8[0xE] << 40) |
+            ((uint64_t) u8.n128_u8[0xB] << 48) |
+            ((uint64_t) u8.n128_u8[0x4] << 56),
+        ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) |
+            ((uint64_t) u8.n128_u8[0x6] << 16) |
+            ((uint64_t) u8.n128_u8[0x3] << 24) |
+            ((uint64_t) u8.n128_u8[0x9] << 32) |
+            ((uint64_t) u8.n128_u8[0x6] << 40) |
+            ((uint64_t) u8.n128_u8[0x3] << 48) |
+            ((uint64_t) u8.n128_u8[0xC] << 56)};
+
+    dest.n128_u32[1] = dest.n128_u32[1] ^ rcon;
+    dest.n128_u32[3] = dest.n128_u32[3] ^ rcon;
+
+    return dest;
+#endif
+}
+#endif
+
+/* Others */
+
+// Perform a carry-less multiplication of two 64-bit integers, selected from a
+// and b according to imm8, and store the results in dst.
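+// (Bit 0 of imm8 selects the low or high 64 bits of a; bit 4 selects the
+// low or high 64 bits of b, matching the 'imm & 0x11' switch below.)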
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128 +FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) +{ + uint64x2_t a = vreinterpretq_u64_m128i(_a); + uint64x2_t b = vreinterpretq_u64_m128i(_b); + switch (imm & 0x11) { + case 0x00: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); + case 0x01: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); + case 0x10: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); + case 0x11: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); + default: + abort(); + } +} + +FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void) +{ + union { + fpcr_bitfield field; +#if defined(__aarch64__) || defined(_M_ARM64) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) || defined(_M_ARM64) + r.value = _sse2neon_get_fpcr(); +#else + __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF; +} + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) +#if __has_builtin(__builtin_popcount) + return __builtin_popcount(a); +#elif defined(_MSC_VER) + return _CountOneBits(a); +#else + return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); +#endif +#else + uint32_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + + vst1_u32(&count, count32x2_val); + return count; +#endif +} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64 +FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) +#if __has_builtin(__builtin_popcountll) + return __builtin_popcountll(a); +#elif defined(_MSC_VER) + return _CountOneBits64(a); +#else + return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); +#endif +#else + uint64_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + uint64x1_t count64x1_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + count64x1_val = vpaddl_u32(count32x2_val); + vst1_u64(&count, count64x1_val); + return count; +#endif +} + +FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) +{ + // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, + // regardless of the value of the FZ bit. 
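+    // On AArch64 this maps to FPCR.FZ (bit 24): e.g. calling
+    // _sse2neon_mm_set_denormals_zero_mode(_MM_DENORMALS_ZERO_ON) makes
+    // SIMD arithmetic treat subnormal operands and results as zero.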
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__) || defined(_M_ARM64)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+    r.value = _sse2neon_get_fpcr();
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+    _sse2neon_set_fpcr(r.value);
+#else
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
+#endif
+}
+
+// Return the current 64-bit value of the processor's time-stamp counter.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
+FORCE_INLINE uint64_t _rdtsc(void)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    uint64_t val;
+
+    /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
+     * system counter is at least 56 bits wide; from Armv8.6, the counter
+     * must be 64 bits wide. So the system counter may be narrower than 64
+     * bits, in which case the kernel reports it by setting the
+     * 'cap_user_time_short' flag.
+     */
+#if defined(_MSC_VER) && !defined(__clang__)
+    val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2));
+#else
+    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
+#endif
+
+    return val;
+#else
+    uint32_t pmccntr, pmuseren, pmcntenset;
+    // Read the user mode Performance Monitoring Unit (PMU)
+    // User Enable Register (PMUSERENR) access permissions.
+    __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
+    if (pmuseren & 1) {  // Allows reading PMUSERENR for user mode code.
+        __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
+        if (pmcntenset & 0x80000000UL) {  // Is it counting?
+            __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
+            // The counter is set up to count every 64th cycle
+            return (uint64_t) (pmccntr) << 6;
+        }
+    }
+
+    // Fall back to gettimeofday() as we can't enable PMUSERENR in user mode.
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
+#endif
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma pop_macro("ALIGN_STRUCT")
+#pragma pop_macro("FORCE_INLINE")
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC pop_options
+#endif
+
+#endif
diff --git a/gdal.cmake b/gdal.cmake
index 6af60831fb01..1799ea709019 100644
--- a/gdal.cmake
+++ b/gdal.cmake
@@ -40,6 +40,13 @@ option(GDAL_OBJECT_LIBRARIES_POSITION_INDEPENDENT_CODE "Set ON to produce -fPIC
 # Option to set preferred C# compiler
 option(CSHARP_MONO "Whether to force the C# compiler to be Mono" OFF)
 
+if (SSE2NEON_COMPILES)
+  option(GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS "Set ON to use ARM Neon FPU optimizations" ON)
+  if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS)
+    message(STATUS "Using ARM Neon optimizations")
+  endif()
+endif()
+
 # This line must be kept early in the CMake instructions. At time of writing,
 # this file is populated only by scripts/install_bash_completions.cmake.in
 install(CODE "file(REMOVE \"${PROJECT_BINARY_DIR}/install_manifest_extra.txt\")")
diff --git a/ogr/ogr_geometry.h b/ogr/ogr_geometry.h
index 7589a90864d4..2e07be78d05e 100644
--- a/ogr/ogr_geometry.h
+++ b/ogr/ogr_geometry.h
@@ -408,9 +408,11 @@ class CPL_DLL OGRGeometry
 
     OGRGeometry();
     OGRGeometry(const OGRGeometry &other);
+    OGRGeometry(OGRGeometry &&other);
     virtual ~OGRGeometry();
 
     OGRGeometry &operator=(const OGRGeometry &other);
+    OGRGeometry &operator=(OGRGeometry &&other);
 
     /** Returns if two geometries are equal.
*/ bool operator==(const OGRGeometry &other) const @@ -1145,9 +1147,13 @@ class CPL_DLL OGRPoint : public OGRGeometry OGRPoint(double x, double y, double z); OGRPoint(double x, double y, double z, double m); OGRPoint(const OGRPoint &other); + /** Move constructor */ + OGRPoint(OGRPoint &&other) = default; static OGRPoint *createXYM(double x, double y, double m); OGRPoint &operator=(const OGRPoint &other); + /** Move assignment operator */ + OGRPoint &operator=(OGRPoint &&other) = default; // IWks Interface size_t WkbSize() const override; @@ -1318,6 +1324,7 @@ class CPL_DLL OGRCurve : public OGRGeometry //! @cond Doxygen_Suppress OGRCurve() = default; OGRCurve(const OGRCurve &other) = default; + OGRCurve(OGRCurve &&other) = default; virtual OGRCurveCasterToLineString GetCasterToLineString() const = 0; virtual OGRCurveCasterToLinearRing GetCasterToLinearRing() const = 0; @@ -1351,6 +1358,7 @@ class CPL_DLL OGRCurve : public OGRGeometry public: //! @cond Doxygen_Suppress OGRCurve &operator=(const OGRCurve &other); + OGRCurve &operator=(OGRCurve &&other) = default; //! @endcond /** Type of child elements. */ @@ -1533,6 +1541,8 @@ class CPL_DLL OGRSimpleCurve : public OGRCurve OGRSimpleCurve(const OGRSimpleCurve &other); + OGRSimpleCurve(OGRSimpleCurve &&other); + private: class CPL_DLL Iterator { @@ -1577,6 +1587,8 @@ class CPL_DLL OGRSimpleCurve : public OGRCurve OGRSimpleCurve &operator=(const OGRSimpleCurve &other); + OGRSimpleCurve &operator=(OGRSimpleCurve &&other); + /** Type of child elements. */ typedef OGRPoint ChildType; @@ -1777,8 +1789,10 @@ class CPL_DLL OGRLineString : public OGRSimpleCurve /** Create an empty line string. */ OGRLineString() = default; OGRLineString(const OGRLineString &other); + OGRLineString(OGRLineString &&other); OGRLineString &operator=(const OGRLineString &other); + OGRLineString &operator=(OGRLineString &&other); virtual OGRLineString *clone() const override; virtual OGRLineString * @@ -1883,9 +1897,13 @@ class CPL_DLL OGRLinearRing : public OGRLineString /** Constructor */ OGRLinearRing() = default; OGRLinearRing(const OGRLinearRing &other); + /** Move constructor*/ + OGRLinearRing(OGRLinearRing &&other) = default; explicit OGRLinearRing(const OGRLinearRing *); OGRLinearRing &operator=(const OGRLinearRing &other); + /** Move assignment operator */ + OGRLinearRing &operator=(OGRLinearRing &&other) = default; // Non standard. virtual const char *getGeometryName() const override; @@ -1966,8 +1984,12 @@ class CPL_DLL OGRCircularString : public OGRSimpleCurve OGRCircularString() = default; OGRCircularString(const OGRCircularString &other); + /** Move constructor */ + OGRCircularString(OGRCircularString &&other) = default; OGRCircularString &operator=(const OGRCircularString &other); + /** Move assignment operator */ + OGRCircularString &operator=(OGRCircularString &&other) = default; // IWks Interface. virtual OGRErr importFromWkb(const unsigned char *, size_t, OGRwkbVariant, @@ -2075,9 +2097,11 @@ class CPL_DLL OGRCurveCollection public: OGRCurveCollection() = default; OGRCurveCollection(const OGRCurveCollection &other); + OGRCurveCollection(OGRCurveCollection &&other); ~OGRCurveCollection(); OGRCurveCollection &operator=(const OGRCurveCollection &other); + OGRCurveCollection &operator=(OGRCurveCollection &&other); /** Type of child elements. 
*/ typedef OGRCurve ChildType; @@ -2208,8 +2232,12 @@ class CPL_DLL OGRCompoundCurve : public OGRCurve OGRCompoundCurve() = default; OGRCompoundCurve(const OGRCompoundCurve &other); + /** Move constructor */ + OGRCompoundCurve(OGRCompoundCurve &&other) = default; OGRCompoundCurve &operator=(const OGRCompoundCurve &other); + /** Move assignment operator */ + OGRCompoundCurve &operator=(OGRCompoundCurve &&other) = default; /** Type of child elements. */ typedef OGRCurve ChildType; @@ -2477,8 +2505,12 @@ class CPL_DLL OGRCurvePolygon : public OGRSurface OGRCurvePolygon() = default; OGRCurvePolygon(const OGRCurvePolygon &); + /** Move constructor */ + OGRCurvePolygon(OGRCurvePolygon &&) = default; OGRCurvePolygon &operator=(const OGRCurvePolygon &other); + /** Move assignment operator */ + OGRCurvePolygon &operator=(OGRCurvePolygon &&other) = default; /** Type of child elements. */ typedef OGRCurve ChildType; @@ -2686,8 +2718,12 @@ class CPL_DLL OGRPolygon : public OGRCurvePolygon OGRPolygon() = default; OGRPolygon(const OGRPolygon &other); + /** Move constructor */ + OGRPolygon(OGRPolygon &&other) = default; OGRPolygon &operator=(const OGRPolygon &other); + /** Move assignment operator */ + OGRPolygon &operator=(OGRPolygon &&other) = default; /** Type of child elements. */ typedef OGRLinearRing ChildType; @@ -2857,8 +2893,12 @@ class CPL_DLL OGRTriangle : public OGRPolygon OGRTriangle() = default; OGRTriangle(const OGRPoint &p, const OGRPoint &q, const OGRPoint &r); OGRTriangle(const OGRTriangle &other); + /** Move constructor */ + OGRTriangle(OGRTriangle &&other) = default; OGRTriangle(const OGRPolygon &other, OGRErr &eErr); OGRTriangle &operator=(const OGRTriangle &other); + /** Move assignment operator */ + OGRTriangle &operator=(OGRTriangle &&other) = default; virtual const char *getGeometryName() const override; virtual OGRwkbGeometryType getGeometryType() const override; @@ -2939,9 +2979,11 @@ class CPL_DLL OGRGeometryCollection : public OGRGeometry OGRGeometryCollection() = default; OGRGeometryCollection(const OGRGeometryCollection &other); + OGRGeometryCollection(OGRGeometryCollection &&other); ~OGRGeometryCollection() override; OGRGeometryCollection &operator=(const OGRGeometryCollection &other); + OGRGeometryCollection &operator=(OGRGeometryCollection &&other); /** Type of child elements. */ typedef OGRGeometry ChildType; @@ -3123,8 +3165,12 @@ class CPL_DLL OGRMultiSurface : public OGRGeometryCollection OGRMultiSurface() = default; OGRMultiSurface(const OGRMultiSurface &other); + /** Move constructor */ + OGRMultiSurface(OGRMultiSurface &&other) = default; OGRMultiSurface &operator=(const OGRMultiSurface &other); + /** Move assignment operator */ + OGRMultiSurface &operator=(OGRMultiSurface &&other) = default; /** Type of child elements. */ typedef OGRSurface ChildType; @@ -3291,8 +3337,12 @@ class CPL_DLL OGRMultiPolygon : public OGRMultiSurface OGRMultiPolygon() = default; OGRMultiPolygon(const OGRMultiPolygon &other); + /** Move constructor */ + OGRMultiPolygon(OGRMultiPolygon &&other) = default; OGRMultiPolygon &operator=(const OGRMultiPolygon &other); + /** Move assignment operator */ + OGRMultiPolygon &operator=(OGRMultiPolygon &&other) = default; /** Type of child elements. 
*/ typedef OGRPolygon ChildType; @@ -3453,9 +3503,13 @@ class CPL_DLL OGRPolyhedralSurface : public OGRSurface /** Create an empty PolyhedralSurface */ OGRPolyhedralSurface() = default; - OGRPolyhedralSurface(const OGRPolyhedralSurface &poGeom); + OGRPolyhedralSurface(const OGRPolyhedralSurface &other); + /** Move constructor */ + OGRPolyhedralSurface(OGRPolyhedralSurface &&other) = default; OGRPolyhedralSurface &operator=(const OGRPolyhedralSurface &other); + /** Move assignment operator */ + OGRPolyhedralSurface &operator=(OGRPolyhedralSurface &&other) = default; /** Type of child elements. */ typedef OGRPolygon ChildType; @@ -3630,6 +3684,12 @@ class CPL_DLL OGRTriangulatedSurface : public OGRPolyhedralSurface OGRTriangulatedSurface() = default; OGRTriangulatedSurface(const OGRTriangulatedSurface &other); + /** Move constructor */ + OGRTriangulatedSurface(OGRTriangulatedSurface &&other) = default; + + OGRTriangulatedSurface &operator=(const OGRTriangulatedSurface &other); + /** Move assignment operator */ + OGRTriangulatedSurface &operator=(OGRTriangulatedSurface &&other) = default; /** Type of child elements. */ typedef OGRTriangle ChildType; @@ -3662,7 +3722,6 @@ class CPL_DLL OGRTriangulatedSurface : public OGRPolyhedralSurface return reinterpret_cast(oMP.end()); } - OGRTriangulatedSurface &operator=(const OGRTriangulatedSurface &other); virtual const char *getGeometryName() const override; virtual OGRwkbGeometryType getGeometryType() const override; virtual OGRTriangulatedSurface *clone() const override; @@ -3765,8 +3824,12 @@ class CPL_DLL OGRMultiPoint : public OGRGeometryCollection OGRMultiPoint() = default; OGRMultiPoint(const OGRMultiPoint &other); + /** Move constructor */ + OGRMultiPoint(OGRMultiPoint &&other) = default; OGRMultiPoint &operator=(const OGRMultiPoint &other); + /** Move assignment operator */ + OGRMultiPoint &operator=(OGRMultiPoint &&other) = default; /** Type of child elements. */ typedef OGRPoint ChildType; @@ -3923,8 +3986,12 @@ class CPL_DLL OGRMultiCurve : public OGRGeometryCollection OGRMultiCurve() = default; OGRMultiCurve(const OGRMultiCurve &other); + /** Move constructor */ + OGRMultiCurve(OGRMultiCurve &&other) = default; OGRMultiCurve &operator=(const OGRMultiCurve &other); + /** Move assignment operator */ + OGRMultiCurve &operator=(OGRMultiCurve &&other) = default; /** Type of child elements. */ typedef OGRCurve ChildType; @@ -4076,8 +4143,12 @@ class CPL_DLL OGRMultiLineString : public OGRMultiCurve OGRMultiLineString() = default; OGRMultiLineString(const OGRMultiLineString &other); + /** Move constructor */ + OGRMultiLineString(OGRMultiLineString &&other) = default; OGRMultiLineString &operator=(const OGRMultiLineString &other); + /** Move assignment operator */ + OGRMultiLineString &operator=(OGRMultiLineString &&other) = default; /** Type of child elements. */ typedef OGRLineString ChildType; diff --git a/ogr/ogr_recordbatch.h b/ogr/ogr_recordbatch.h index 6fcde1405b65..48d1a8d7c874 100644 --- a/ogr/ogr_recordbatch.h +++ b/ogr/ogr_recordbatch.h @@ -19,7 +19,8 @@ // https://github.com/apache/arrow/blob/main/cpp/src/arrow/c/abi.h WARNING: DO // NOT MODIFY the content as it would break interoperability ! -#pragma once +#ifndef OGR_RECORDBATCH_H_INCLUDED +#define OGR_RECORDBATCH_H_INCLUDED /*! @cond Doxygen_Suppress */ @@ -123,3 +124,5 @@ extern "C" #endif /*! 
@endcond */
+
+#endif  // OGR_RECORDBATCH_H_INCLUDED
diff --git a/ogr/ogrcurve.cpp b/ogr/ogrcurve.cpp
index 4e8b7d77f255..260e875571d1 100644
--- a/ogr/ogrcurve.cpp
+++ b/ogr/ogrcurve.cpp
@@ -756,7 +756,7 @@ int OGRCurve::isClockwise() const
     for (int i = 1; i < nPointCount - 1; i++)
     {
         ++oIter;
-        OGRPoint oPointCur = *oIter;
+        const OGRPoint oPointCur = *oIter;
         if (bNextPointIsNextSel)
         {
             oPointNextSel = oPointCur;
diff --git a/ogr/ogrcurvecollection.cpp b/ogr/ogrcurvecollection.cpp
index 83f3ae2e063a..38132dca41e7 100644
--- a/ogr/ogrcurvecollection.cpp
+++ b/ogr/ogrcurvecollection.cpp
@@ -59,6 +59,23 @@ OGRCurveCollection::OGRCurveCollection(const OGRCurveCollection &other)
     }
 }
 
+/************************************************************************/
+/*             OGRCurveCollection( OGRCurveCollection&& )               */
+/************************************************************************/
+
+/**
+ * \brief Move constructor.
+ *
+ * @since GDAL 3.11
+ */
+
+OGRCurveCollection::OGRCurveCollection(OGRCurveCollection &&other)
+    : nCurveCount(other.nCurveCount), papoCurves(other.papoCurves)
+{
+    other.nCurveCount = 0;
+    other.papoCurves = nullptr;
+}
+
 /************************************************************************/
 /*                       ~OGRCurveCollection()                          */
 /************************************************************************/
@@ -107,6 +124,27 @@ OGRCurveCollection::operator=(const OGRCurveCollection &other)
     return *this;
 }
 
+/************************************************************************/
+/*                 operator=( OGRCurveCollection&& )                    */
+/************************************************************************/
+
+/**
+ * \brief Move assignment operator.
+ *
+ * @since GDAL 3.11
+ */
+
+OGRCurveCollection &OGRCurveCollection::operator=(OGRCurveCollection &&other)
+{
+    if (this != &other)
+    {
+        empty(nullptr);
+        std::swap(nCurveCount, other.nCurveCount);
+        std::swap(papoCurves, other.papoCurves);
+    }
+    return *this;
+}
+
 /************************************************************************/
 /*                              WkbSize()                               */
 /************************************************************************/
diff --git a/ogr/ogrgeometry.cpp b/ogr/ogrgeometry.cpp
index e18e1b467526..e7a1cb4c5025 100644
--- a/ogr/ogrgeometry.cpp
+++ b/ogr/ogrgeometry.cpp
@@ -109,6 +109,22 @@ OGRGeometry::OGRGeometry(const OGRGeometry &other)
         const_cast<OGRSpatialReference *>(poSRS)->Reference();
 }
 
+/************************************************************************/
+/*                    OGRGeometry( OGRGeometry&& )                      */
+/************************************************************************/
+
+/**
+ * \brief Move constructor.
+ *
+ * @since GDAL 3.11
+ */
+
+OGRGeometry::OGRGeometry(OGRGeometry &&other)
+    : poSRS(other.poSRS), flags(other.flags)
+{
+    other.poSRS = nullptr;
+}
+
 /************************************************************************/
 /*                           ~OGRGeometry()                             */
 /************************************************************************/
@@ -144,6 +160,27 @@ OGRGeometry &OGRGeometry::operator=(const OGRGeometry &other)
     return *this;
 }
 
+/************************************************************************/
+/*                     operator=( OGRGeometry&& )                       */
+/************************************************************************/
+
+/**
+ * \brief Move assignment operator.
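+ *
+ * Ownership of the spatial reference of the moved-from geometry is
+ * transferred; any spatial reference previously held by this geometry is
+ * released.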
+ *
+ * @since GDAL 3.11
+ */
+
+OGRGeometry &OGRGeometry::operator=(OGRGeometry &&other)
+{
+    if (this != &other)
+    {
+        if (poSRS)
+            const_cast<OGRSpatialReference *>(poSRS)->Release();
+        poSRS = other.poSRS;
+        other.poSRS = nullptr;
+        flags = other.flags;
+    }
+    return *this;
+}
+
 /************************************************************************/
 /*                           dumpReadable()                             */
 /************************************************************************/
diff --git a/ogr/ogrgeometrycollection.cpp b/ogr/ogrgeometrycollection.cpp
index 915dc7f0ca86..f1beb953ad96 100644
--- a/ogr/ogrgeometrycollection.cpp
+++ b/ogr/ogrgeometrycollection.cpp
@@ -57,6 +57,27 @@ OGRGeometryCollection::OGRGeometryCollection(const OGRGeometryCollection &other)
     }
 }
 
+/************************************************************************/
+/*          OGRGeometryCollection( OGRGeometryCollection&& )            */
+/************************************************************************/
+
+/**
+ * \brief Move constructor.
+ *
+ * @since GDAL 3.11
+ */
+
+// cppcheck-suppress-begin accessMoved
+OGRGeometryCollection::OGRGeometryCollection(OGRGeometryCollection &&other)
+    : OGRGeometry(std::move(other)), nGeomCount(other.nGeomCount),
+      papoGeoms(other.papoGeoms)
+{
+    other.nGeomCount = 0;
+    other.papoGeoms = nullptr;
+}
+
+// cppcheck-suppress-end accessMoved
+
 /************************************************************************/
 /*                    ~OGRGeometryCollection()                          */
 /************************************************************************/
@@ -112,6 +133,30 @@ OGRGeometryCollection::operator=(const OGRGeometryCollection &other)
     return *this;
 }
 
+/************************************************************************/
+/*               operator=( OGRGeometryCollection&& )                   */
+/************************************************************************/
+
+/**
+ * \brief Move assignment operator.
+ *
+ * @since GDAL 3.11
+ */
+
+OGRGeometryCollection &
+OGRGeometryCollection::operator=(OGRGeometryCollection &&other)
+{
+    if (this != &other)
+    {
+        empty();
+
+        OGRGeometry::operator=(std::move(other));
+        std::swap(nGeomCount, other.nGeomCount);
+        std::swap(papoGeoms, other.papoGeoms);
+    }
+    return *this;
+}
+
 /************************************************************************/
 /*                              empty()                                 */
 /************************************************************************/
diff --git a/ogr/ogrgeometryfactory.cpp b/ogr/ogrgeometryfactory.cpp
index e68303667da9..dd4d5833d05d 100644
--- a/ogr/ogrgeometryfactory.cpp
+++ b/ogr/ogrgeometryfactory.cpp
@@ -2010,7 +2010,7 @@ OGRGeometry *OGRGeometryFactory::organizePolygons(OGRGeometry **papoPolygons,
                 // If it is outside, then i cannot be inside j.
                 break;
             }
-            previousPoint = point;
+            previousPoint = std::move(point);
         }
         if (!b_i_inside_j && k == nPoints && nPoints > 2)
         {
diff --git a/ogr/ogrlinestring.cpp b/ogr/ogrlinestring.cpp
index 80445fea3b6d..929b570901e0 100644
--- a/ogr/ogrlinestring.cpp
+++ b/ogr/ogrlinestring.cpp
@@ -60,6 +60,31 @@ OGRSimpleCurve::OGRSimpleCurve(const OGRSimpleCurve &other)
     setPoints(other.nPointCount, other.paoPoints, other.padfZ, other.padfM);
 }
 
+/************************************************************************/
+/*                OGRSimpleCurve( OGRSimpleCurve&& )                    */
+/************************************************************************/
+
+/**
+ * \brief Move constructor.
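+ *
+ * The point, Z and M arrays are taken over from \a other, which is left as
+ * an empty curve.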
+ * + * @since GDAL 3.11 + */ + +// cppcheck-suppress-begin accessMoved +OGRSimpleCurve::OGRSimpleCurve(OGRSimpleCurve &&other) + : OGRCurve(std::move(other)), nPointCount(other.nPointCount), + m_nPointCapacity(other.m_nPointCapacity), paoPoints(other.paoPoints), + padfZ(other.padfZ), padfM(other.padfM) +{ + other.nPointCount = 0; + other.m_nPointCapacity = 0; + other.paoPoints = nullptr; + other.padfZ = nullptr; + other.padfM = nullptr; +} + +// cppcheck-suppress-end accessMoved + /************************************************************************/ /* ~OGRSimpleCurve() */ /************************************************************************/ @@ -73,7 +98,7 @@ OGRSimpleCurve::~OGRSimpleCurve() } /************************************************************************/ -/* operator=( const OGRPoint& ) */ +/* operator=(const OGRSimpleCurve &other) */ /************************************************************************/ /** @@ -98,6 +123,43 @@ OGRSimpleCurve &OGRSimpleCurve::operator=(const OGRSimpleCurve &other) return *this; } +/************************************************************************/ +/* operator=(OGRSimpleCurve &&other) */ +/************************************************************************/ + +/** + * \brief Move assignment operator. + * + * @since GDAL 3.11 + */ + +OGRSimpleCurve &OGRSimpleCurve::operator=(OGRSimpleCurve &&other) +{ + if (this != &other) + { + // cppcheck-suppress-begin accessMoved + OGRCurve::operator=(std::move(other)); + + nPointCount = other.nPointCount; + m_nPointCapacity = other.m_nPointCapacity; + CPLFree(paoPoints); + paoPoints = other.paoPoints; + CPLFree(padfZ); + padfZ = other.padfZ; + CPLFree(padfM); + padfM = other.padfM; + flags = other.flags; + other.nPointCount = 0; + other.m_nPointCapacity = 0; + other.paoPoints = nullptr; + other.padfZ = nullptr; + other.padfM = nullptr; + // cppcheck-suppress-end accessMoved + } + + return *this; +} + /************************************************************************/ /* flattenTo2D() */ /************************************************************************/ @@ -2806,6 +2868,18 @@ OGRPointIterator *OGRSimpleCurve::getPointIterator() const OGRLineString::OGRLineString(const OGRLineString &) = default; +/************************************************************************/ +/* OGRLineString( OGRLineString&& ) */ +/************************************************************************/ + +/** + * \brief Move constructor. + * + * @since GDAL 3.11 + */ + +OGRLineString::OGRLineString(OGRLineString &&) = default; + /************************************************************************/ /* operator=( const OGRLineString& ) */ /************************************************************************/ @@ -2828,6 +2902,25 @@ OGRLineString &OGRLineString::operator=(const OGRLineString &other) return *this; } +/************************************************************************/ +/* operator=( OGRLineString&& ) */ +/************************************************************************/ + +/** + * \brief Move assignment operator. 
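+ *
+ * Delegates to OGRSimpleCurve's move assignment; \a other is left as an
+ * empty line string.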
+ *
+ * @since GDAL 3.11
+ */
+
+OGRLineString &OGRLineString::operator=(OGRLineString &&other)
+{
+    if (this != &other)
+    {
+        OGRSimpleCurve::operator=(std::move(other));
+    }
+    return *this;
+}
+
 /************************************************************************/
 /*                          getGeometryType()                           */
 /************************************************************************/
diff --git a/ogr/ogrsf_frmts/CMakeLists.txt b/ogr/ogrsf_frmts/CMakeLists.txt
index d9337012cae0..15fc491e7665 100644
--- a/ogr/ogrsf_frmts/CMakeLists.txt
+++ b/ogr/ogrsf_frmts/CMakeLists.txt
@@ -90,6 +90,8 @@ if( NOT WORDS_BIGENDIAN )
   ogr_optional_driver(miramon "MiraMonVector")
 endif()
 
+ogr_optional_driver(aivector AIVector)
+
 # ######################################################################################################################
 #
 if (NOT OGR_ENABLE_DRIVER_GEOJSON_PLUGIN)
diff --git a/ogr/ogrsf_frmts/aivector/CMakeLists.txt b/ogr/ogrsf_frmts/aivector/CMakeLists.txt
new file mode 100644
index 000000000000..8b0b2ce48dac
--- /dev/null
+++ b/ogr/ogrsf_frmts/aivector/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_gdal_driver(
+  TARGET ogr_AIVector
+  SOURCES
+  ograivectordriver.cpp
+  PLUGIN_CAPABLE
+  NO_DEPS
+  STRONG_CXX_WFLAGS)
+
+gdal_standard_includes(ogr_AIVector)
diff --git a/ogr/ogrsf_frmts/aivector/ograivectordriver.cpp b/ogr/ogrsf_frmts/aivector/ograivectordriver.cpp
new file mode 100644
index 000000000000..6606311fffbc
--- /dev/null
+++ b/ogr/ogrsf_frmts/aivector/ograivectordriver.cpp
@@ -0,0 +1,146 @@
+/******************************************************************************
+ *
+ * Project:  GDAL
+ * Purpose:  Artificial Intelligence powered driver
+ * Author:   Even Rouault,
+ *
+ ******************************************************************************
+ * Copyright (c) 2024, Even Rouault
+ *
+ * SPDX-License-Identifier: MIT
+ ****************************************************************************/
+
+#include "ogrsf_frmts.h"
+
+/************************************************************************/
+/*                        OGRAIVectorIdentify()                         */
+/************************************************************************/
+
+static int OGRAIVectorIdentify(GDALOpenInfo *poOpenInfo)
+{
+    return STARTS_WITH_CI(poOpenInfo->pszFilename, "AIVector:") ||
+           poOpenInfo->IsSingleAllowedDriver("AIVector");
+}
+
+/************************************************************************/
+/*                          OGRAIVectorOpen()                           */
+/************************************************************************/
+
+static GDALDataset *OGRAIVectorOpen(GDALOpenInfo *poOpenInfo)
+{
+    if (!OGRAIVectorIdentify(poOpenInfo))
+        return nullptr;
+
+    class MyLayer final : public OGRLayer,
+                          public OGRGetNextFeatureThroughRaw<MyLayer>
+    {
+        OGRFeatureDefn *m_poLayerDefn = nullptr;
+        bool m_bReturnedFeature = false;
+
+        CPL_DISALLOW_COPY_ASSIGN(MyLayer)
+
+      public:
+        MyLayer()
+        {
+            m_poLayerDefn = new OGRFeatureDefn("result");
+            SetDescription(m_poLayerDefn->GetName());
+            m_poLayerDefn->Reference();
+            OGRFieldDefn oFieldDefn("name", OFTString);
+            m_poLayerDefn->AddFieldDefn(&oFieldDefn);
+            OGRSpatialReference *poSRS = new OGRSpatialReference(
+                "GEOGCS[\"I don't know\",\n"
+                "    DATUM[\"I don't care\",\n"
+                "        SPHEROID[\"GRS 1980\",6378137,298.257222101,\n"
+                "            AUTHORITY[\"EPSG\",\"7019\"]]],\n"
+                "    PRIMEM[\"Greenwich\",0,\n"
+                "        AUTHORITY[\"EPSG\",\"8901\"]],\n"
+                "    UNIT[\"degree\",0.0174532925199433,\n"
+                "        AUTHORITY[\"EPSG\",\"9122\"]],\n"
+                "    AUTHORITY[\"AI\",\"TOTALLY_MADE_UP\"]]");
+            m_poLayerDefn->GetGeomFieldDefn(0)->SetSpatialRef(poSRS);
+            poSRS->Release();
+        }
+
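+        // Note: datasets are opened through any connection string starting
+        // with "AIVector:" (or by explicitly restricting the allowed
+        // drivers to AIVector); the layer then exposes a single hard-coded
+        // feature at Null Island, as returned by GetNextRawFeature() below.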
+        ~MyLayer() override
+        {
+            m_poLayerDefn->Release();
+        }
+
+        void ResetReading() override
+        {
+            m_bReturnedFeature = false;
+        }
+
+        OGRFeatureDefn *GetLayerDefn() override
+        {
+            return m_poLayerDefn;
+        }
+
+        DEFINE_GET_NEXT_FEATURE_THROUGH_RAW(MyLayer)
+
+        OGRFeature *GetNextRawFeature()
+        {
+            if (m_bReturnedFeature)
+                return nullptr;
+            m_bReturnedFeature = true;
+            OGRFeature *poFeature = new OGRFeature(m_poLayerDefn);
+            poFeature->SetFID(0);
+            poFeature->SetField(0, "Null Island: the place to be");
+            OGRPoint *poPoint = new OGRPoint(0, 0);
+            poPoint->assignSpatialReference(GetSpatialRef());
+            poFeature->SetGeometryDirectly(poPoint);
+            return poFeature;
+        }
+
+        int TestCapability(const char *) override
+        {
+            return false;
+        }
+    };
+
+    class MyDataset final : public GDALDataset
+    {
+        MyLayer m_oLayer{};
+
+      public:
+        MyDataset() = default;
+
+        int GetLayerCount() override
+        {
+            return 1;
+        }
+
+        OGRLayer *GetLayer(int idx) override
+        {
+            return idx == 0 ? &m_oLayer : nullptr;
+        }
+    };
+
+    return new MyDataset();
+}
+
+/************************************************************************/
+/*                        RegisterOGRAIVector()                         */
+/************************************************************************/
+
+void RegisterOGRAIVector()
+{
+    if (!GDAL_CHECK_VERSION("AIVector"))
+        return;
+
+    if (GDALGetDriverByName("AIVector") != nullptr)
+        return;
+
+    GDALDriver *poDriver = new GDALDriver();
+    poDriver->SetDescription("AIVector");
+    poDriver->SetMetadataItem(GDAL_DCAP_VECTOR, "YES");
+    poDriver->SetMetadataItem(GDAL_DMD_LONGNAME,
+                              "Artificial Intelligence powered vector driver");
+    poDriver->SetMetadataItem(GDAL_DMD_HELPTOPIC,
+                              "drivers/vector/aivector.html");
+
+    poDriver->SetMetadataItem(GDAL_DMD_CONNECTION_PREFIX, "AIVector:");
+
+    poDriver->pfnOpen = OGRAIVectorOpen;
+    poDriver->pfnIdentify = OGRAIVectorIdentify;
+    GetGDALDriverManager()->RegisterDriver(poDriver);
+}
diff --git a/ogr/ogrsf_frmts/dxf/ogrdxf_leader.cpp b/ogr/ogrsf_frmts/dxf/ogrdxf_leader.cpp
index 4f5ae6ba6f4d..24ca0f768cac 100644
--- a/ogr/ogrsf_frmts/dxf/ogrdxf_leader.cpp
+++ b/ogr/ogrsf_frmts/dxf/ogrdxf_leader.cpp
@@ -1369,7 +1369,7 @@ static void InterpolateSpline(OGRLineString *const poLine,
         aoDataPoints.push_back(
             DXFTriple(oPoint.getX(), oPoint.getY(), oPoint.getZ()));
-        oPrevPoint = oPoint;
+        oPrevPoint = std::move(oPoint);
     }
     nDataPoints = static_cast<int>(aoDataPoints.size());
     if (nDataPoints < 2)
diff --git a/ogr/ogrsf_frmts/generic/ogrregisterall.cpp b/ogr/ogrsf_frmts/generic/ogrregisterall.cpp
index b7d6744daea6..843197c6dac5 100644
--- a/ogr/ogrsf_frmts/generic/ogrregisterall.cpp
+++ b/ogr/ogrsf_frmts/generic/ogrregisterall.cpp
@@ -273,4 +273,9 @@ void OGRRegisterAllInternal()
     RegisterOGRAVCE00();
 #endif
 
+    // Last but not least
+#ifdef AIVECTOR_ENABLED
+    RegisterOGRAIVector();
+#endif
+
 } /* OGRRegisterAll */
diff --git a/ogr/ogrsf_frmts/lvbag/ogrlvbaglayer.cpp b/ogr/ogrsf_frmts/lvbag/ogrlvbaglayer.cpp
index 796174cce9d1..b5550ee3f3f9 100644
--- a/ogr/ogrsf_frmts/lvbag/ogrlvbaglayer.cpp
+++ b/ogr/ogrsf_frmts/lvbag/ogrlvbaglayer.cpp
@@ -638,7 +638,7 @@ void OGRLVBAGLayer::EndElementCbk(const char *pszName)
             poGeom->flattenTo2D();
 #ifdef HAVE_GEOS
-            if (!poGeom->IsValid() && bFixInvalidData)
+            if (bFixInvalidData && !poGeom->IsValid())
             {
                 std::unique_ptr<OGRGeometry> poSubGeom =
                     std::unique_ptr<OGRGeometry>{poGeom->MakeValid()};
diff --git a/ogr/ogrsf_frmts/ogrsf_frmts.h b/ogr/ogrsf_frmts/ogrsf_frmts.h
index 994db366a89c..e7f28b1fb155 100644
--- a/ogr/ogrsf_frmts/ogrsf_frmts.h
+++ b/ogr/ogrsf_frmts/ogrsf_frmts.h
@@ -741,6 +741,7 @@ void CPL_DLL
 RegisterOGRXODR();
 void DeclareDeferredOGRXODRPlugin();
 void CPL_DLL RegisterOGRADBC();
 void DeclareDeferredOGRADBCPlugin();
+void CPL_DLL RegisterOGRAIVector();
 // @endcond
 
 CPL_C_END
diff --git a/ogr/ogrsf_frmts/openfilegdb/ogr_openfilegdb.h b/ogr/ogrsf_frmts/openfilegdb/ogr_openfilegdb.h
index ae2a5f3e3d0f..36c29b23f3a9 100644
--- a/ogr/ogrsf_frmts/openfilegdb/ogr_openfilegdb.h
+++ b/ogr/ogrsf_frmts/openfilegdb/ogr_openfilegdb.h
@@ -28,7 +28,7 @@
 
 using namespace OpenFileGDB;
 
-std::string OFGDBGenerateUUID();
+std::string OFGDBGenerateUUID(bool bInit = false);
 
 int OGROpenFileGDBIsComparisonOp(int op);
diff --git a/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdb_generate_uuid.cpp b/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdb_generate_uuid.cpp
index 4198263416ac..a8aa87b1341e 100644
--- a/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdb_generate_uuid.cpp
+++ b/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdb_generate_uuid.cpp
@@ -54,7 +54,7 @@ static int CPLGettimeofday(struct CPLTimeVal *tp, void * /* timezonep*/)
 // Probably not the best UUID generator ever. One issue is that mt19937
 // uses only a 32-bit seed.
 CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW
-std::string OFGDBGenerateUUID()
+std::string OFGDBGenerateUUID(bool bInit)
 {
     struct CPLTimeVal tv;
     memset(&tv, 0, sizeof(tv));
@@ -62,57 +62,82 @@ std::string OFGDBGenerateUUID()
     const bool bReproducibleUUID =
         CPLTestBool(CPLGetConfigOption("OPENFILEGDB_REPRODUCIBLE_UUID", "NO"));
 
+    if (bInit)
+    {
+        if (bReproducibleUUID)
+            nCounter = 0;
+        return std::string();
+    }
+
+    uint32_t nCounterLocal = nCounter;
+    // Constants from the example rand() implementation of POSIX.1-2001,
+    // used here for reproducible output.
+    // We have to use that rather than relying on std::mt19937 +
+    // std::uniform_int_distribution since they don't give the same output
+    // from the same seed on all platforms.
+    const auto reproducibleRand = [&nCounterLocal]()
+    {
+        nCounterLocal = nCounterLocal * 1103515245U + 12345U;
+        return (nCounterLocal / 65536U) % 32768U;
+    };
+
     std::stringstream ss;
     {
         if (!bReproducibleUUID)
+        {
             CPLGettimeofday(&tv, nullptr);
-        std::mt19937 gen(++nCounter +
-                         (bReproducibleUUID
-                              ? 0
-                              : static_cast<unsigned>(tv.tv_sec ^ tv.tv_usec)));
+            ++nCounter;
+        }
+        std::mt19937 gen(nCounter +
+                         static_cast<unsigned>(tv.tv_sec ^ tv.tv_usec));
         std::uniform_int_distribution<> dis(0, 15);
 
         ss << "{";
         ss << std::hex;
         for (int i = 0; i < 8; i++)
         {
-            ss << dis(gen);
+            ss << (bReproducibleUUID ? (reproducibleRand() % 16) : dis(gen));
        }
         ss << "-";
         for (int i = 0; i < 4; i++)
         {
-            ss << dis(gen);
+            ss << (bReproducibleUUID ? (reproducibleRand() % 16) : dis(gen));
         }
         ss << "-4";
         for (int i = 0; i < 3; i++)
         {
-            ss << dis(gen);
+            ss << (bReproducibleUUID ? (reproducibleRand() % 16) : dis(gen));
         }
     }
     {
         if (!bReproducibleUUID)
+        {
             CPLGettimeofday(&tv, nullptr);
-        std::mt19937 gen(++nCounter +
-                         (bReproducibleUUID
-                              ? 0
-                              : static_cast<unsigned>(tv.tv_sec ^ tv.tv_usec)));
+            ++nCounter;
+        }
+        std::mt19937 gen(nCounter +
+                         static_cast<unsigned>(tv.tv_sec ^ tv.tv_usec));
         std::uniform_int_distribution<> dis(0, 15);
         std::uniform_int_distribution<> dis2(8, 11);
 
         ss << "-";
-        ss << dis2(gen);
+        ss << (bReproducibleUUID ? 8 : dis2(gen));
         for (int i = 0; i < 3; i++)
         {
-            ss << dis(gen);
+            ss << (bReproducibleUUID ? (reproducibleRand() % 16) : dis(gen));
         }
         ss << "-";
         for (int i = 0; i < 12; i++)
        {
-            ss << dis(gen);
+            ss << (bReproducibleUUID ? (reproducibleRand() % 16) : dis(gen));
         };
         ss << "}";
-        return ss.str();
     }
+
+    if (bReproducibleUUID)
+        nCounter = nCounterLocal;
+
+    return ss.str();
 }
diff --git a/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdbdatasource_write.cpp b/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdbdatasource_write.cpp
index f6e54d500998..27ba3a2dd9f4 100644
--- a/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdbdatasource_write.cpp
+++ b/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdbdatasource_write.cpp
@@ -1299,6 +1299,8 @@ bool OGROpenFileGDBDataSource::Create(const char *pszName)
         return false;
     }
 
+    CPL_IGNORE_RET_VAL(OFGDBGenerateUUID(/* bInit = */ true));
+
     m_osDirName = pszName;
     eAccess = GA_Update;
diff --git a/perftests/CMakeLists.txt b/perftests/CMakeLists.txt
index 4f365f34a326..a103013d3c9b 100644
--- a/perftests/CMakeLists.txt
+++ b/perftests/CMakeLists.txt
@@ -1,5 +1,18 @@
 include(GdalTestTarget)
 
+include(GdalSetRuntimeEnv)
+gdal_set_runtime_env(TEST_ENV)
+
+if (MINGW)
+  list(APPEND TEST_ENV SKIP_MEM_INTENSIVE_TEST=YES)
+endif ()
+
+if (WIN32)
+  # When running GDAL as a CustomBuild command of MSBuild, "ERROR bla:" is
+  # considered as failing the job. This is rarely the intended behavior.
+  list(APPEND TEST_ENV "CPL_ERROR_SEPARATOR=\\;")
+endif ()
+
 gdal_test_target(testperfcopywords testperfcopywords.cpp)
 
 gdal_test_target(testperfdeinterleave testperfdeinterleave.cpp)
@@ -10,3 +23,10 @@ target_link_libraries(bench_ogr_batch PRIVATE $)
+
+gdal_test_target(testperf_gdal_minmax_element testperf_gdal_minmax_element.cpp)
+if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS)
+  target_compile_definitions(testperf_gdal_minmax_element PRIVATE -DUSE_NEON_OPTIMIZATIONS)
+endif()
+add_test(NAME testperf_gdal_minmax_element COMMAND testperf_gdal_minmax_element)
+set_property(TEST testperf_gdal_minmax_element PROPERTY ENVIRONMENT "${TEST_ENV}")
diff --git a/perftests/testperf_gdal_minmax_element.cpp b/perftests/testperf_gdal_minmax_element.cpp
new file mode 100644
index 000000000000..2e2a0d2f408f
--- /dev/null
+++ b/perftests/testperf_gdal_minmax_element.cpp
@@ -0,0 +1,332 @@
+/******************************************************************************
+ * Project:  GDAL Core
+ * Purpose:  Test performance of gdal_minmax_element.hpp
+ * Author:   Even Rouault,
+ *
+ ******************************************************************************
+ * Copyright (c) 2024, Even Rouault
+ *
+ * SPDX-License-Identifier: MIT
+ ****************************************************************************/
+
+#include "gdal_minmax_element.hpp"
+
+#include <chrono>
+#include <random>
+
+template <class T> void randomFill(T *v, size_t size, bool withNaN = true)
+{
+    std::random_device rd;
+    std::mt19937 gen{rd()};
+    std::normal_distribution<> dist{127, 30};
+    for (size_t i = 0; i < size; i++)
+    {
+        v[i] = static_cast<T>(dist(gen));
+        if constexpr (std::is_same<T, float>::value ||
+                      std::is_same<T, double>::value)
+        {
+            if (withNaN && (i == 0 || (i > 10 && ((i + 1) % 1024) <= 4)))
+                v[i] = std::numeric_limits<T>::quiet_NaN();
+        }
+    }
+}
+
+constexpr size_t SIZE = 10 * 1000 * 1000 + 1;
+constexpr int N_ITERS = 1;
+
+template <class T>
+#if defined(__GNUC__)
+__attribute__((noinline))
+#endif
+static void
+benchIntegers(GDALDataType eDT, T noData)
+{
+    std::vector<T> x;
+    x.resize(SIZE);
+    randomFill(x.data(), x.size());
+    {
+        auto start = std::chrono::steady_clock::now();
+        int idx = 0;
+        for (int i = 0; i < N_ITERS; ++i)
+        {
+            idx += static_cast<int>(
+                gdal::min_element(x.data(), x.size(), eDT, false, 0));
+        }
+        idx /= N_ITERS;
+        printf("min at idx %d (optimized)\n", idx);
+        auto end = std::chrono::steady_clock::now();
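+        // Note: (end - start).count() below is expressed in raw
+        // steady_clock ticks (typically nanoseconds with libstdc++/libc++).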
printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + std::distance(x.begin(), std::min_element(x.begin(), x.end()))); + } + idx /= N_ITERS; + printf("min at idx %d (using std::min_element)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, true, noData)); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [noData](T a, T b) { + return b == noData ? true + : a == noData ? false + : a < b; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, using std::min_element with " + "nodata aware comparison)\n", + idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } +} + +template +#if defined(__GNUC__) +__attribute__((noinline)) +#endif +static void +benchFloatingPointsWithNaN(GDALDataType eDT, T noData) +{ + std::vector x; + x.resize(SIZE); + randomFill(x.data(), x.size()); + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, false, 0)); + } + idx /= N_ITERS; + printf("min at idx %d (optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [](T a, T b) { + return std::isnan(b) ? true + : std::isnan(a) ? false + : a < b; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (using std::min_element with NaN aware " + "comparison)\n", + idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, true, noData)); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [noData](T a, T b) + { + return std::isnan(b) ? true + : std::isnan(a) ? false + : b == noData ? true + : a == noData ? 
+                                                       : a < b;
+                                            })));
+        }
+        idx /= N_ITERS;
+        printf("min at idx %d (nodata case, using std::min_element with "
+               "nodata aware and NaN aware comparison)\n",
+               idx);
+        auto end = std::chrono::steady_clock::now();
+        printf("-> elapsed=%d\n", static_cast<int>((end - start).count()));
+    }
+}
+
+template <class T>
+#if defined(__GNUC__)
+__attribute__((noinline))
+#endif
+static void
+benchFloatingPointsWithoutNaN(GDALDataType eDT, T noData)
+{
+    std::vector<T> x;
+    x.resize(SIZE);
+    randomFill(x.data(), x.size(), false);
+    {
+        auto start = std::chrono::steady_clock::now();
+        int idx = 0;
+        for (int i = 0; i < N_ITERS; ++i)
+        {
+            idx += static_cast<int>(
+                gdal::min_element(x.data(), x.size(), eDT, false, 0));
+        }
+        idx /= N_ITERS;
+        printf("min at idx %d (optimized)\n", idx);
+        auto end = std::chrono::steady_clock::now();
+        printf("-> elapsed=%d\n", static_cast<int>((end - start).count()));
+    }
+    {
+        auto start = std::chrono::steady_clock::now();
+        int idx = 0;
+        for (int i = 0; i < N_ITERS; ++i)
+        {
+            idx += static_cast<int>(
+                std::distance(x.begin(), std::min_element(x.begin(), x.end())));
+        }
+        idx /= N_ITERS;
+        printf("min at idx %d (using std::min_element)\n", idx);
+        auto end = std::chrono::steady_clock::now();
+        printf("-> elapsed=%d\n", static_cast<int>((end - start).count()));
+    }
+    {
+        auto start = std::chrono::steady_clock::now();
+        int idx = 0;
+        for (int i = 0; i < N_ITERS; ++i)
+        {
+            idx += static_cast<int>(
+                gdal::min_element(x.data(), x.size(), eDT, true, noData));
+        }
+        idx /= N_ITERS;
+        printf("min at idx %d (nodata case, optimized)\n", idx);
+        auto end = std::chrono::steady_clock::now();
+        printf("-> elapsed=%d\n", static_cast<int>((end - start).count()));
+    }
+    {
+        auto start = std::chrono::steady_clock::now();
+        int idx = 0;
+        for (int i = 0; i < N_ITERS; ++i)
+        {
+            idx += static_cast<int>(std::distance(
+                x.begin(), std::min_element(x.begin(), x.end(),
+                                            [noData](T a, T b) {
+                                                return b == noData ? true
+                                                       : a == noData ? false
false + : a < b; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, using std::min_element with " + "nodata aware comparison)\n", + idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } +} + +int main(int /* argc */, char * /* argv */[]) +{ + { + using T = uint8_t; + constexpr GDALDataType eDT = GDT_Byte; + printf("uint8:\n"); + benchIntegers(eDT, 0); + } + printf("--------------------\n"); + { + using T = int8_t; + constexpr GDALDataType eDT = GDT_Int8; + printf("int8:\n"); + benchIntegers(eDT, 0); + } + printf("--------------------\n"); + { + using T = uint16_t; + constexpr GDALDataType eDT = GDT_UInt16; + printf("uint16:\n"); + benchIntegers(eDT, 0); + } + printf("--------------------\n"); + { + using T = int16_t; + constexpr GDALDataType eDT = GDT_Int16; + printf("int16:\n"); + benchIntegers(eDT, 0); + } + printf("--------------------\n"); + { + using T = uint32_t; + constexpr GDALDataType eDT = GDT_UInt32; + printf("uint32:\n"); + benchIntegers(eDT, 0); + } + printf("--------------------\n"); + { + using T = int32_t; + constexpr GDALDataType eDT = GDT_Int32; + printf("int32:\n"); + benchIntegers(eDT, 0); + } + printf("--------------------\n"); + { + using T = float; + constexpr GDALDataType eDT = GDT_Float32; + printf("float (*with* NaN):\n"); + benchFloatingPointsWithNaN(eDT, 0); + } + printf("--------------------\n"); + { + using T = float; + constexpr GDALDataType eDT = GDT_Float32; + printf("float (without NaN):\n"); + benchFloatingPointsWithoutNaN(eDT, 0); + } + printf("--------------------\n"); + { + using T = double; + constexpr GDALDataType eDT = GDT_Float64; + printf("double (*with* NaN):\n"); + benchFloatingPointsWithNaN(eDT, 0); + } + printf("--------------------\n"); + { + using T = double; + constexpr GDALDataType eDT = GDT_Float64; + printf("double (without NaN):\n"); + benchFloatingPointsWithoutNaN(eDT, 0); + } + return 0; +} diff --git a/port/cpl_cpu_features.h b/port/cpl_cpu_features.h index 9106ed5c39e5..10d3daaf9f1b 100644 --- a/port/cpl_cpu_features.h +++ b/port/cpl_cpu_features.h @@ -31,7 +31,12 @@ bool CPLHaveRuntimeSSE(); #endif #endif -#ifdef HAVE_SSSE3_AT_COMPILE_TIME +#ifdef USE_NEON_OPTIMIZATIONS +static bool inline CPLHaveRuntimeSSSE3() +{ + return true; +} +#elif defined(HAVE_SSSE3_AT_COMPILE_TIME) #if __SSSE3__ #define HAVE_INLINE_SSSE3 diff --git a/port/cpl_error.cpp b/port/cpl_error.cpp index f751837377b0..b25c6511d4aa 100644 --- a/port/cpl_error.cpp +++ b/port/cpl_error.cpp @@ -590,11 +590,18 @@ static void CPLvDebug(const char *pszCategory, /* -------------------------------------------------------------------- */ /* Does this message pass our current criteria? 
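For reference, the call pattern exercised by the benchmark above looks like this when used directly. This is a minimal sketch: the header name gdal_minmax_element.hpp and the size_t return type are assumptions inferred from the calls in the benchmark, not confirmed by this patch.

    // Sketch of using the optimized helper benchmarked above (assumed to be
    // declared in gcore/gdal_minmax_element.hpp, returning the index of the
    // smallest element).
    #include "gdal.h"
    #include "gdal_minmax_element.hpp"

    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<float> v{3.5f, -1.25f, 7.0f, 2.0f};
        // Last two arguments: has-nodata flag and the nodata value to ignore.
        size_t idx = gdal::min_element(v.data(), v.size(), GDT_Float32,
                                       /* bHasNoData = */ false,
                                       /* dfNoData = */ 0);
        printf("min %f at index %u\n", double(v[idx]), unsigned(idx));
        return 0;
    }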
diff --git a/port/cpl_cpu_features.h b/port/cpl_cpu_features.h
index 9106ed5c39e5..10d3daaf9f1b 100644
--- a/port/cpl_cpu_features.h
+++ b/port/cpl_cpu_features.h
@@ -31,7 +31,12 @@ bool CPLHaveRuntimeSSE();
 #endif
 #endif

-#ifdef HAVE_SSSE3_AT_COMPILE_TIME
+#ifdef USE_NEON_OPTIMIZATIONS
+static bool inline CPLHaveRuntimeSSSE3()
+{
+    return true;
+}
+#elif defined(HAVE_SSSE3_AT_COMPILE_TIME)

 #if __SSSE3__
 #define HAVE_INLINE_SSSE3
diff --git a/port/cpl_error.cpp b/port/cpl_error.cpp
index f751837377b0..b25c6511d4aa 100644
--- a/port/cpl_error.cpp
+++ b/port/cpl_error.cpp
@@ -590,11 +590,18 @@ static void CPLvDebug(const char *pszCategory,
     /* -------------------------------------------------------------------- */
     /*      Does this message pass our current criteria?                     */
     /* -------------------------------------------------------------------- */
-    if (pszDebug == nullptr)
+    if (pszDebug == nullptr || EQUAL(pszDebug, "NO") ||
+        EQUAL(pszDebug, "OFF") || EQUAL(pszDebug, "FALSE") ||
+        EQUAL(pszDebug, "0"))
+    {
         return;
+    }

-    if (!EQUAL(pszDebug, "ON") && !EQUAL(pszDebug, ""))
+    if (!EQUAL(pszDebug, "ON") && !EQUAL(pszDebug, "YES") &&
+        !EQUAL(pszDebug, "TRUE") && !EQUAL(pszDebug, "1") &&
+        !EQUAL(pszDebug, ""))
     {
+        // check if value of CPL_DEBUG contains the category
         const size_t nLen = strlen(pszCategory);

         size_t i = 0;
@@ -623,7 +630,7 @@ static void CPLvDebug(const char *pszCategory,
     pszMessage[0] = '\0';
 #ifdef TIMESTAMP_DEBUG
-    if (CPLGetConfigOption("CPL_TIMESTAMP", nullptr) != nullptr)
+    if (CPLTestBool(CPLGetConfigOption("CPL_TIMESTAMP", "NO")))
     {
         static struct CPLTimeVal tvStart;
         static const auto unused = CPLGettimeofday(&tvStart, nullptr);
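The CPLvDebug() change above makes CPL_DEBUG accept the usual boolean spellings in addition to the historical "ON"/empty-string values; any other string is still interpreted as a list of categories to match. A small sketch of the caller-visible behaviour, using only public CPL functions:

    #include "cpl_conv.h"
    #include "cpl_error.h"

    void demo()
    {
        CPLSetConfigOption("CPL_DEBUG", "YES");  // previously only "ON" or ""
        CPLDebug("GDAL", "emitted: all categories pass");

        CPLSetConfigOption("CPL_DEBUG", "OFF");  // previously any non-"ON"
                                                 // value was treated as a
                                                 // category list
        CPLDebug("GDAL", "suppressed");

        CPLSetConfigOption("CPL_DEBUG", "HTTP"); // category filtering intact
        CPLDebug("HTTP", "emitted: category matches");
    }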
diff --git a/port/cpl_worker_thread_pool.cpp b/port/cpl_worker_thread_pool.cpp
index c2134f0b7a30..387ad7bbba43 100644
--- a/port/cpl_worker_thread_pool.cpp
+++ b/port/cpl_worker_thread_pool.cpp
@@ -581,14 +581,14 @@ bool CPLJobQueue::SubmitJob(std::function<void()> task)
         m_nPendingJobs++;
     }

-    // cppcheck-suppress knownConditionTrueFalse
     // coverity[uninit_member,copy_constructor_call]
-    return m_poPool->SubmitJob(
-        [this, task]
-        {
-            task();
-            DeclareJobFinished();
-        });
+    const auto lambda = [this, task]
+    {
+        task();
+        DeclareJobFinished();
+    };
+    // cppcheck-suppress knownConditionTrueFalse
+    return m_poPool->SubmitJob(lambda);
 }

 /************************************************************************/
diff --git a/scripts/cppcheck.sh b/scripts/cppcheck.sh
index ef7eb728a714..f3b9932c2077 100755
--- a/scripts/cppcheck.sh
+++ b/scripts/cppcheck.sh
@@ -146,10 +146,6 @@ ret_code=0
 grep -v "unmatchedSuppression" ${LOG_FILE} | grep -v -e " yacc.c" -e PublicDecompWT -e "kdu_cache_wrapper.h" > ${LOG_FILE}.tmp
 mv ${LOG_FILE}.tmp ${LOG_FILE}

-# I don't want to care about SDE
-grep -v -e "frmts/sde" -e "ogr/ogrsf_frmts/sde" ${LOG_FILE} > ${LOG_FILE}.tmp
-mv ${LOG_FILE}.tmp ${LOG_FILE}
-
 # I don't want to care about flatbuffers
 grep -v -e "ogr/ogrsf_frmts/flatgeobuf/flatbuffers" ${LOG_FILE} > ${LOG_FILE}.tmp
 mv ${LOG_FILE}.tmp ${LOG_FILE}
@@ -184,6 +180,12 @@ mv ${LOG_FILE}.tmp ${LOG_FILE}
 grep -v -e "The comparison '0 <= yystate' is always true" ${LOG_FILE} > ${LOG_FILE}.tmp
 mv ${LOG_FILE}.tmp ${LOG_FILE}

+# False positives with cppcheck of ubuntu 20.04
+grep -v -e "ogrlinestring.cpp:.*warning,accessMoved" ${LOG_FILE} > ${LOG_FILE}.tmp
+mv ${LOG_FILE}.tmp ${LOG_FILE}
+grep -v -e "ogrgeometrycollection.cpp:.*warning,accessMoved" ${LOG_FILE} > ${LOG_FILE}.tmp
+mv ${LOG_FILE}.tmp ${LOG_FILE}
+
 if grep "null pointer" ${LOG_FILE} ; then
     echo "Null pointer check failed"
     ret_code=1
diff --git a/scripts/fix_typos.sh b/scripts/fix_typos.sh
index f218033606eb..564212398542 100755
--- a/scripts/fix_typos.sh
+++ b/scripts/fix_typos.sh
@@ -77,6 +77,7 @@ EXCLUDED_FILES="$EXCLUDED_FILES,./cmake/modules/CMakeCheckCompilerFlagCommonPatterns.cmake"
 EXCLUDED_FILES="$EXCLUDED_FILES,./cmake/modules/Copyright.txt"
 EXCLUDED_FILES="$EXCLUDED_FILES,*/sqlite_rtree_bulk_load/*"
 EXCLUDED_FILES="$EXCLUDED_FILES,ogr_adbc_internal.h"
+EXCLUDED_FILES="$EXCLUDED_FILES,sse2neon.h"
 EXCLUDED_FILES="$EXCLUDED_FILES,*/spelling_wordlist.txt"
 AUTHORIZED_LIST="poSession,FIDN,TRAFIC,HTINK,repID,oCurr,INTREST,oPosition"
 AUTHORIZED_LIST="$AUTHORIZED_LIST,CPL_SUPRESS_CPLUSPLUS,SRP_NAM,ADRG_NAM,'SRP_NAM,AuxilaryTarget"
diff --git a/swig/include/Band.i b/swig/include/Band.i
index ea4fd47f6e22..6c45f714e078 100644
--- a/swig/include/Band.i
+++ b/swig/include/Band.i
@@ -669,6 +669,23 @@ CPLErr AdviseRead(  int xoff, int yoff, int xsize, int ysize,
 %clear (CPLErr);
 #endif

+%apply (double *OUTPUT){double *pdfMin, double *pdfMax};
+%apply (int *OUTPUT){int *pnMinX, int *pnMinY};
+%apply (int *OUTPUT){int *pnMaxX, int *pnMaxY};
+#if !defined(SWIGPYTHON)
+%apply (IF_ERROR_RETURN_NONE) { (CPLErr) };
+#endif
+  CPLErr ComputeMinMaxLocation( double *pdfMin, double *pdfMax,
+                                int *pnMinX, int *pnMinY,
+                                int *pnMaxX, int *pnMaxY ) {
+      return GDALComputeRasterMinMaxLocation( self, pdfMin, pdfMax,
+                                              pnMinX, pnMinY,
+                                              pnMaxX, pnMaxY );
+  }
+#if !defined(SWIGPYTHON)
+%clear (CPLErr);
+#endif
+
 %newobject AsMDArray;
 GDALMDArrayHS *AsMDArray()
 {
diff --git a/swig/include/python/gdal_python.i b/swig/include/python/gdal_python.i
index 67610e5cfc4e..b9f48c0fa3b0 100644
--- a/swig/include/python/gdal_python.i
+++ b/swig/include/python/gdal_python.i
@@ -4997,6 +4997,50 @@ def InterpolateAtPoint(self, *args, **kwargs):
     return ret[1]
 %}

+%feature("shadow") ComputeMinMaxLocation %{
+def ComputeMinMaxLocation(self, *args, **kwargs):
+    """Compute the min/max values for a band, and their location.
+
+    Pixels whose value matches the nodata value or are masked by the mask
+    band are ignored.
+
+    If the minimum or maximum value is hit in several locations, it is not
+    specified which one will be returned.
+
+    This is a mapping of :cpp:func:`GDALRasterBand::ComputeRasterMinMaxLocation`.
+
+    Parameters
+    ----------
+    None
+
+    Returns
+    -------
+    a named tuple (min, max, minX, minY, maxX, maxY) or ``None``
+    in case of error or no valid pixel.
+    """
+
+    ret = $action(self, *args, **kwargs)
+    if ret[0] != CE_None:
+        return None
+
+    import collections
+    tuple = collections.namedtuple('ComputeMinMaxLocationResult',
+                                   ['min',
+                                    'max',
+                                    'minX',
+                                    'minY',
+                                    'maxX',
+                                    'maxY',
+                                   ])
+    tuple.min = ret[1]
+    tuple.max = ret[2]
+    tuple.minX = ret[3]
+    tuple.minY = ret[4]
+    tuple.maxX = ret[5]
+    tuple.maxY = ret[6]
+    return tuple
+%}

 %pythoncode %{

 # VSIFile: Copyright (c) 2024, Dan Baston
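Both wrappers above funnel into the C entry point GDALComputeRasterMinMaxLocation() that the Band.i fragment calls. A minimal C++ sketch of that call; the signature is inferred from the wrapper above, and "in.tif" is a placeholder filename:

    #include "gdal.h"
    #include <cstdio>

    int main()
    {
        GDALAllRegister();
        GDALDatasetH hDS = GDALOpen("in.tif", GA_ReadOnly);
        if (!hDS)
            return 1;
        GDALRasterBandH hBand = GDALGetRasterBand(hDS, 1);
        double dfMin = 0, dfMax = 0;
        int nMinX = 0, nMinY = 0, nMaxX = 0, nMaxY = 0;
        if (GDALComputeRasterMinMaxLocation(hBand, &dfMin, &dfMax,
                                            &nMinX, &nMinY,
                                            &nMaxX, &nMaxY) == CE_None)
        {
            printf("min=%g at (%d,%d), max=%g at (%d,%d)\n",
                   dfMin, nMinX, nMinY, dfMax, nMaxX, nMaxY);
        }
        GDALClose(hDS);
        return 0;
    }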
diff --git a/swig/python/gdal-utils/osgeo_utils/samples/gdal_minmax_location.py b/swig/python/gdal-utils/osgeo_utils/samples/gdal_minmax_location.py
new file mode 100644
index 000000000000..018ec00f0d29
--- /dev/null
+++ b/swig/python/gdal-utils/osgeo_utils/samples/gdal_minmax_location.py
@@ -0,0 +1,128 @@
+# !/usr/bin/env python3
+###############################################################################
+# Project:  GDAL utils
+# Purpose:  Get min/max location
+# Author:   Even Rouault
+#
+###############################################################################
+# Copyright (c) 2024, Even Rouault
+#
+# SPDX-License-Identifier: MIT
+###############################################################################
+
+import sys
+import textwrap
+from typing import Optional
+
+from osgeo import gdal, osr
+from osgeo_utils.auxiliary.gdal_argparse import GDALArgumentParser, GDALScript
+from osgeo_utils.auxiliary.util import PathOrDS, open_ds
+
+
+def gdalminmaxlocation_util(
+    filename_or_ds: PathOrDS,
+    band_num: int,
+    open_options: Optional[dict] = None,
+    **kwargs,
+):
+    ds = open_ds(filename_or_ds, open_options=open_options)
+    band = ds.GetRasterBand(band_num)
+    ret = band.ComputeMinMaxLocation()
+    if ret is None:
+        print("No valid pixels")
+        return 1
+    gt = ds.GetGeoTransform(can_return_null=True)
+    if gt:
+        srs = ds.GetSpatialRef()
+        if srs:
+            wgs84 = osr.SpatialReference()
+            wgs84.SetFromUserInput("WGS84")
+            wgs84.SetAxisMappingStrategy(osr.OAMS_TRADITIONAL_GIS_ORDER)
+            ct = osr.CreateCoordinateTransformation(srs, wgs84)
+            georefX, georefY = gdal.ApplyGeoTransform(
+                gt, ret.minX + 0.5, ret.minY + 0.5
+            )
+            long, lat, _ = ct.TransformPoint(georefX, georefY)
+            print(
+                f"Minimum={ret.min} at (col,line)=({ret.minX},{ret.minY}), (X,Y)_georef=({georefX},{georefY}), (long,lat)_WGS84=({long:.7f},{lat:.7f})"
+            )
+            georefX, georefY = gdal.ApplyGeoTransform(
+                gt, ret.maxX + 0.5, ret.maxY + 0.5
+            )
+            long, lat, _ = ct.TransformPoint(georefX, georefY)
+            print(
+                f"Maximum={ret.max} at (col,line)=({ret.maxX},{ret.maxY}), (X,Y)_georef=({georefX},{georefY}), (long,lat)_WGS84=({long:.7f},{lat:.7f})"
+            )
+        else:
+            georefX, georefY = gdal.ApplyGeoTransform(
+                gt, ret.minX + 0.5, ret.minY + 0.5
+            )
+            print(
+                f"Minimum={ret.min} at (col,line)=({ret.minX},{ret.minY}), (X,Y)_georef=({georefX},{georefY})"
+            )
+            georefX, georefY = gdal.ApplyGeoTransform(
+                gt, ret.maxX + 0.5, ret.maxY + 0.5
+            )
+            print(
+                f"Maximum={ret.max} at (col,line)=({ret.maxX},{ret.maxY}), (X,Y)_georef=({georefX},{georefY})"
+            )
+    else:
+        print(f"Minimum={ret.min} at (col,line)=({ret.minX},{ret.minY})")
+        print(f"Maximum={ret.max} at (col,line)=({ret.maxX},{ret.maxY})")
+
+    return 0
+
+
+class GDALMinMaxLocation(GDALScript):
+    def __init__(self):
+        super().__init__()
+        self.title = "Raster min/max location query tool"
+        self.description = textwrap.dedent(
+            """\
+            The gdal_minmax_location utility returns the location where min/max values of a raster are hit."""
+        )
+        self.interactive_mode = None
+
+    def get_parser(self, argv) -> GDALArgumentParser:
+        parser = self.parser
+
+        parser.add_argument(
+            "-b",
+            dest="band_num",
+            metavar="band",
+            type=int,
+            default=1,
+            help="Selects a band to query (default: first one).",
+        )
+
+        parser.add_argument(
+            "-oo",
+            dest="open_options",
+            metavar="NAME=VALUE",
+            help="Dataset open option (format specific).",
+            nargs="+",
+        )
+
+        parser.add_argument(
+            "filename_or_ds",
+            metavar="filename",
+            type=str,
+            help="The source GDAL raster datasource name.",
+        )
+
+        return parser
+
+    def augment_kwargs(self, kwargs) -> dict:
+        return kwargs
+
+    def doit(self, **kwargs):
+        return gdalminmaxlocation_util(**kwargs)
+
+
+def main(argv=sys.argv):
+    gdal.UseExceptions()
+    return GDALMinMaxLocation().main(argv)
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
diff --git a/third_party/fast_float/PROVENANCE.TXT b/third_party/fast_float/PROVENANCE.TXT
index 0ea9d9ccd1d2..5a310b33ce6f 100644
--- a/third_party/fast_float/PROVENANCE.TXT
+++ b/third_party/fast_float/PROVENANCE.TXT
@@ -1,4 +1,4 @@
 https://github.com/fastfloat/fast_float

-Retrieved at commit https://github.com/fastfloat/fast_float/commit/a5ea2059295260922aa300d676a43a76b5e19a35
+Retrieved at commit https://github.com/fastfloat/fast_float/commit/9058831e6884e95358bcad29139a8b9d6cf0b534

 Using the MIT license choice.
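The remaining hunks update the vendored fast_float headers to the commit recorded above. For orientation, the library's public entry point is fast_float::from_chars(); the internal changes below (error codes, parse_int_string(), reformatting) keep that interface. A minimal usage sketch; the include path is an assumption based on where GDAL vendors the headers:

    #include "third_party/fast_float/fast_float.h"

    #include <cstdio>
    #include <system_error>

    int main()
    {
        const char buf[] = "3.1415 tail";
        double d = 0;
        auto res = fast_float::from_chars(buf, buf + 6, d);
        if (res.ec == std::errc())  // success
            printf("value=%f rest='%s'\n", d, res.ptr);
        return 0;
    }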
diff --git a/third_party/fast_float/ascii_number.h b/third_party/fast_float/ascii_number.h
index d18e3d5360af..c027435e2a01 100644
--- a/third_party/fast_float/ascii_number.h
+++ b/third_party/fast_float/ascii_number.h
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include
 #include

 #include "float_common.h"
@@ -19,8 +20,7 @@

 namespace fast_float {

-template <typename UC>
-fastfloat_really_inline constexpr bool has_simd_opt() {
+template <typename UC> fastfloat_really_inline constexpr bool has_simd_opt() {
 #ifdef FASTFLOAT_HAS_SIMD
   return std::is_same<UC, char16_t>::value;
 #else
@@ -36,24 +36,20 @@ fastfloat_really_inline constexpr bool is_integer(UC c) noexcept {
 }

 fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) {
-  return (val & 0xFF00000000000000) >> 56
-    | (val & 0x00FF000000000000) >> 40
-    | (val & 0x0000FF0000000000) >> 24
-    | (val & 0x000000FF00000000) >> 8
-    | (val & 0x00000000FF000000) << 8
-    | (val & 0x0000000000FF0000) << 24
-    | (val & 0x000000000000FF00) << 40
-    | (val & 0x00000000000000FF) << 56;
+  return (val & 0xFF00000000000000) >> 56 | (val & 0x00FF000000000000) >> 40 |
+         (val & 0x0000FF0000000000) >> 24 | (val & 0x000000FF00000000) >> 8 |
+         (val & 0x00000000FF000000) << 8 | (val & 0x0000000000FF0000) << 24 |
+         (val & 0x000000000000FF00) << 40 | (val & 0x00000000000000FF) << 56;
 }

 // Read 8 UC into a u64. Truncates UC if not char.
 template <typename UC>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint64_t read8_to_u64(const UC *chars) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+read8_to_u64(const UC *chars) {
   if (cpp20_and_in_constexpr() || !std::is_same<UC, char>::value) {
     uint64_t val = 0;
-    for(int i = 0; i < 8; ++i) {
-      val |= uint64_t(uint8_t(*chars)) << (i*8);
+    for (int i = 0; i < 8; ++i) {
+      val |= uint64_t(uint8_t(*chars)) << (i * 8);
       ++chars;
     }
     return val;
@@ -69,44 +65,41 @@ uint64_t read8_to_u64(const UC *chars) {

 #ifdef FASTFLOAT_SSE2

-fastfloat_really_inline
-uint64_t simd_read8_to_u64(const __m128i data) {
-FASTFLOAT_SIMD_DISABLE_WARNINGS
+fastfloat_really_inline uint64_t simd_read8_to_u64(const __m128i data) {
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
   const __m128i packed = _mm_packus_epi16(data, data);
 #ifdef FASTFLOAT_64BIT
   return uint64_t(_mm_cvtsi128_si64(packed));
 #else
   uint64_t value;
   // Visual Studio + older versions of GCC don't support _mm_storeu_si64
-  _mm_storel_epi64(reinterpret_cast<__m128i*>(&value), packed);
+  _mm_storel_epi64(reinterpret_cast<__m128i *>(&value), packed);
   return value;
 #endif
-FASTFLOAT_SIMD_RESTORE_WARNINGS
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
 }

-fastfloat_really_inline
-uint64_t simd_read8_to_u64(const char16_t* chars) {
-FASTFLOAT_SIMD_DISABLE_WARNINGS
-  return simd_read8_to_u64(_mm_loadu_si128(reinterpret_cast<const __m128i*>(chars)));
-FASTFLOAT_SIMD_RESTORE_WARNINGS
+fastfloat_really_inline uint64_t simd_read8_to_u64(const char16_t *chars) {
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  return simd_read8_to_u64(
+      _mm_loadu_si128(reinterpret_cast<const __m128i *>(chars)));
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
 }

 #elif defined(FASTFLOAT_NEON)
-
-fastfloat_really_inline
-uint64_t simd_read8_to_u64(const uint16x8_t data) {
-FASTFLOAT_SIMD_DISABLE_WARNINGS
+fastfloat_really_inline uint64_t simd_read8_to_u64(const uint16x8_t data) {
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
   uint8x8_t utf8_packed = vmovn_u16(data);
   return vget_lane_u64(vreinterpret_u64_u8(utf8_packed), 0);
-FASTFLOAT_SIMD_RESTORE_WARNINGS
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
 }

-fastfloat_really_inline
-uint64_t simd_read8_to_u64(const char16_t* chars) {
-FASTFLOAT_SIMD_DISABLE_WARNINGS
-  return simd_read8_to_u64(vld1q_u16(reinterpret_cast<const uint16_t*>(chars)));
-FASTFLOAT_SIMD_RESTORE_WARNINGS
+fastfloat_really_inline uint64_t simd_read8_to_u64(const char16_t *chars) {
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  return simd_read8_to_u64(
+      vld1q_u16(reinterpret_cast<const uint16_t *>(chars)));
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
 }

 #endif // FASTFLOAT_SSE2
@@ -115,34 +108,16 @@ FASTFLOAT_SIMD_RESTORE_WARNINGS
 #if defined(_MSC_VER) && _MSC_VER <= 1900
 template <typename UC>
 #else
-template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>())>
+template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>()) = 0>
 #endif
 // dummy for compile
-uint64_t simd_read8_to_u64(UC const*) {
+uint64_t simd_read8_to_u64(UC const *) {
   return 0;
 }
-
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-void write_u64(uint8_t *chars, uint64_t val) {
-  if (cpp20_and_in_constexpr()) {
-    for(int i = 0; i < 8; ++i) {
-      *chars = uint8_t(val);
-      val >>= 8;
-      ++chars;
-    }
-    return;
-  }
-#if FASTFLOAT_IS_BIG_ENDIAN == 1
-  // Need to read as-if the number was in little-endian order.
-  val = byteswap(val);
-#endif
-  ::memcpy(chars, &val, sizeof(uint64_t));
-}
-
 // credit @aqrit
-fastfloat_really_inline FASTFLOAT_CONSTEXPR14
-uint32_t parse_eight_digits_unrolled(uint64_t val) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint32_t
+parse_eight_digits_unrolled(uint64_t val) {
   const uint64_t mask = 0x000000FF000000FF;
   const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32)
   const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32)
@@ -152,38 +127,38 @@ uint32_t parse_eight_digits_unrolled(uint64_t val) {
   return uint32_t(val);
 }
-
 // Call this if chars are definitely 8 digits.
 template <typename UC>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint32_t parse_eight_digits_unrolled(UC const * chars) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint32_t
+parse_eight_digits_unrolled(UC const *chars) noexcept {
   if (cpp20_and_in_constexpr() || !has_simd_opt<UC>()) {
     return parse_eight_digits_unrolled(read8_to_u64(chars)); // truncation okay
   }
   return parse_eight_digits_unrolled(simd_read8_to_u64(chars));
 }
-
 // credit @aqrit
-fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast(uint64_t val) noexcept {
+fastfloat_really_inline constexpr bool
+is_made_of_eight_digits_fast(uint64_t val) noexcept {
   return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) &
-     0x8080808080808080));
+            0x8080808080808080));
 }
-
 #ifdef FASTFLOAT_HAS_SIMD

 // Call this if chars might not be 8 digits.
-// Using this style (instead of is_made_of_eight_digits_fast() then parse_eight_digits_unrolled())
-// ensures we don't load SIMD registers twice.
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-bool simd_parse_if_eight_digits_unrolled(const char16_t* chars, uint64_t& i) noexcept {
+// Using this style (instead of is_made_of_eight_digits_fast() then
+// parse_eight_digits_unrolled()) ensures we don't load SIMD registers twice.
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool
+simd_parse_if_eight_digits_unrolled(const char16_t *chars,
+                                    uint64_t &i) noexcept {
   if (cpp20_and_in_constexpr()) {
     return false;
-    }
+  }

 #ifdef FASTFLOAT_SSE2
-FASTFLOAT_SIMD_DISABLE_WARNINGS
-  const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(chars));
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  const __m128i data =
+      _mm_loadu_si128(reinterpret_cast<const __m128i *>(chars));

   // (x - '0') <= 9
   // http://0x80.pl/articles/simd-parsing-int-sequences.html
@@ -193,13 +168,13 @@ FASTFLOAT_SIMD_DISABLE_WARNINGS
   if (_mm_movemask_epi8(t1) == 0) {
     i = i * 100000000 + parse_eight_digits_unrolled(simd_read8_to_u64(data));
     return true;
-  }
-  else return false;
-FASTFLOAT_SIMD_RESTORE_WARNINGS
+  } else
+    return false;
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
 #elif defined(FASTFLOAT_NEON)
-FASTFLOAT_SIMD_DISABLE_WARNINGS
-  const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t*>(chars));
-
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(chars));
+
   // (x - '0') <= 9
   // http://0x80.pl/articles/simd-parsing-int-sequences.html
   const uint16x8_t t0 = vsubq_u16(data, vmovq_n_u16('0'));
@@ -208,11 +183,12 @@ FASTFLOAT_SIMD_DISABLE_WARNINGS
   if (vminvq_u16(mask) == 0xFFFF) {
     i = i * 100000000 + parse_eight_digits_unrolled(simd_read8_to_u64(data));
     return true;
-  }
-  else return false;
-FASTFLOAT_SIMD_RESTORE_WARNINGS
+  } else
+    return false;
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
 #else
-  (void)chars; (void)i;
+  (void)chars;
+  (void)i;
   return false;
 #endif // FASTFLOAT_SSE2
 }
@@ -223,55 +199,90 @@ FASTFLOAT_SIMD_RESTORE_WARNINGS
 #if defined(_MSC_VER) && _MSC_VER <= 1900
 template <typename UC>
 #else
-template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>())>
+template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>()) = 0>
 #endif
 // dummy for compile
-bool simd_parse_if_eight_digits_unrolled(UC const*, uint64_t&) {
+bool simd_parse_if_eight_digits_unrolled(UC const *, uint64_t &) {
   return 0;
 }
-
-template <typename UC, FASTFLOAT_ENABLE_IF(!std::is_same<UC, char>::value)>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-void loop_parse_if_eight_digits(const UC*& p, const UC* const pend, uint64_t& i) {
+template <typename UC, FASTFLOAT_ENABLE_IF(!std::is_same<UC, char>::value) = 0>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void
+loop_parse_if_eight_digits(const UC *&p, const UC *const pend, uint64_t &i) {
   if (!has_simd_opt<UC>()) {
     return;
   }
-  while ((std::distance(p, pend) >= 8) && simd_parse_if_eight_digits_unrolled(p, i)) { // in rare cases, this will overflow, but that's ok
+  while ((std::distance(p, pend) >= 8) &&
+         simd_parse_if_eight_digits_unrolled(
+             p, i)) { // in rare cases, this will overflow, but that's ok
     p += 8;
   }
 }

-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-void loop_parse_if_eight_digits(const char*& p, const char* const pend, uint64_t& i) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void
+loop_parse_if_eight_digits(const char *&p, const char *const pend,
+                           uint64_t &i) {
   // optimizes better than parse_if_eight_digits_unrolled() for UC = char.
-  while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(read8_to_u64(p))) {
-    i = i * 100000000 + parse_eight_digits_unrolled(read8_to_u64(p)); // in rare cases, this will overflow, but that's ok
+  while ((std::distance(p, pend) >= 8) &&
+         is_made_of_eight_digits_fast(read8_to_u64(p))) {
+    i = i * 100000000 +
+        parse_eight_digits_unrolled(read8_to_u64(
+            p)); // in rare cases, this will overflow, but that's ok
    p += 8;
   }
 }

-template <typename UC>
-struct parsed_number_string_t {
+enum class parse_error {
+  no_error,
+  // [JSON-only] The minus sign must be followed by an integer.
+  missing_integer_after_sign,
+  // A sign must be followed by an integer or dot.
+  missing_integer_or_dot_after_sign,
+  // [JSON-only] The integer part must not have leading zeros.
+  leading_zeros_in_integer_part,
+  // [JSON-only] The integer part must have at least one digit.
+  no_digits_in_integer_part,
+  // [JSON-only] If there is a decimal point, there must be digits in the
+  // fractional part.
+  no_digits_in_fractional_part,
+  // The mantissa must have at least one digit.
+  no_digits_in_mantissa,
+  // Scientific notation requires an exponential part.
+  missing_exponential_part,
+};
+
+template <typename UC> struct parsed_number_string_t {
   int64_t exponent{0};
   uint64_t mantissa{0};
-  UC const * lastmatch{nullptr};
+  UC const *lastmatch{nullptr};
   bool negative{false};
   bool valid{false};
   bool too_many_digits{false};
   // contains the range of the significant digits
   span<const UC> integer{};  // non-nullable
   span<const UC> fraction{}; // nullable
+  parse_error error{parse_error::no_error};
 };

 using byte_span = span<const char>;
 using parsed_number_string = parsed_number_string_t<char>;

+template <typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t<UC>
+report_parse_error(UC const *p, parse_error error) {
+  parsed_number_string_t<UC> answer;
+  answer.valid = false;
+  answer.lastmatch = p;
+  answer.error = error;
+  return answer;
+}
+
 // Assuming that you use no more than 19 digits, this will
 // parse an ASCII string.
 template <typename UC>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, parse_options_t<UC> options) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t<UC>
+parse_number_string(UC const *p, UC const *pend,
+                    parse_options_t<UC> options) noexcept {
   chars_format const fmt = options.format;
   UC const decimal_point = options.decimal_point;
@@ -286,19 +297,24 @@
 #endif
     ++p;
     if (p == pend) {
-      return answer;
+      return report_parse_error(
+          p, parse_error::missing_integer_or_dot_after_sign);
     }
     if (fmt & FASTFLOAT_JSONFMT) {
       if (!is_integer(*p)) { // a sign must be followed by an integer
-        return answer;
-      }
+        return report_parse_error(p,
+                                  parse_error::missing_integer_after_sign);
+      }
     } else {
-      if (!is_integer(*p) && (*p != decimal_point)) { // a sign must be followed by an integer or the dot
-        return answer;
+      if (!is_integer(*p) &&
+          (*p !=
+           decimal_point)) { // a sign must be followed by an integer or the dot
+        return report_parse_error(
+            p, parse_error::missing_integer_or_dot_after_sign);
       }
     }
   }
-  UC const * const start_digits = p;
+  UC const *const start_digits = p;

   uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
@@ -306,16 +322,21 @@
     // a multiplication by 10 is cheaper than an arbitrary integer
     // multiplication
     i = 10 * i +
-        uint64_t(*p - UC('0')); // might overflow, we will handle the overflow later
+        uint64_t(*p -
+                 UC('0')); // might overflow, we will handle the overflow later
     ++p;
   }
-  UC const * const end_of_integer_part = p;
+  UC const *const end_of_integer_part = p;
   int64_t digit_count = int64_t(end_of_integer_part - start_digits);
   answer.integer = span<const UC>(start_digits, size_t(digit_count));
   if (fmt & FASTFLOAT_JSONFMT) {
     // at least 1 digit in integer part, without leading zeros
-    if (digit_count == 0 || (start_digits[0] == UC('0') && digit_count > 1)) {
-      return answer;
+    if (digit_count == 0) {
+      return report_parse_error(p, parse_error::no_digits_in_integer_part);
+    }
+    if ((start_digits[0] == UC('0') && digit_count > 1)) {
+      return report_parse_error(start_digits,
+                                parse_error::leading_zeros_in_integer_part);
     }
   }
@@ -323,7 +344,7 @@
   const bool has_decimal_point = (p != pend) && (*p == decimal_point);
   if (has_decimal_point) {
     ++p;
-    UC const * before = p;
+    UC const *before = p;
     // can occur at most twice without overflowing, but let it occur more, since
     // for integers with many digits, digit parsing is the primary bottleneck.
     loop_parse_if_eight_digits(p, pend, i);
@@ -340,35 +361,39 @@
   if (fmt & FASTFLOAT_JSONFMT) {
     // at least 1 digit in fractional part
     if (has_decimal_point && exponent == 0) {
-      return answer;
+      return report_parse_error(p,
+                                parse_error::no_digits_in_fractional_part);
     }
-  }
-  else if (digit_count == 0) { // we must have encountered at least one integer!
-    return answer;
+  } else if (digit_count ==
+             0) { // we must have encountered at least one integer!
+    return report_parse_error(p, parse_error::no_digits_in_mantissa);
   }
-  int64_t exp_number = 0;            // explicit exponential part
-  if ( ((fmt & chars_format::scientific) &&
-        (p != pend) &&
-        ((UC('e') == *p) || (UC('E') == *p)))
-       ||
-       ((fmt & FASTFLOAT_FORTRANFMT) &&
-        (p != pend) &&
-        ((UC('+') == *p) || (UC('-') == *p) || (UC('d') == *p) || (UC('D') == *p)))) {
-    UC const * location_of_e = p;
-    if ((UC('e') == *p) || (UC('E') == *p) || (UC('d') == *p) || (UC('D') == *p)) {
+  int64_t exp_number = 0; // explicit exponential part
+  if (((fmt & chars_format::scientific) && (p != pend) &&
+       ((UC('e') == *p) || (UC('E') == *p))) ||
+      ((fmt & FASTFLOAT_FORTRANFMT) && (p != pend) &&
+       ((UC('+') == *p) || (UC('-') == *p) || (UC('d') == *p) ||
+        (UC('D') == *p)))) {
+    UC const *location_of_e = p;
+    if ((UC('e') == *p) || (UC('E') == *p) || (UC('d') == *p) ||
+        (UC('D') == *p)) {
       ++p;
     }
     bool neg_exp = false;
     if ((p != pend) && (UC('-') == *p)) {
       neg_exp = true;
       ++p;
-    } else if ((p != pend) && (UC('+') == *p)) { // '+' on exponent is allowed by C++17 20.19.3.(7.1)
+    } else if ((p != pend) &&
+               (UC('+') ==
+                *p)) { // '+' on exponent is allowed by C++17 20.19.3.(7.1)
       ++p;
     }
     if ((p == pend) || !is_integer(*p)) {
-      if(!(fmt & chars_format::fixed)) {
-        // We are in error.
-        return answer;
+      if (!(fmt & chars_format::fixed)) {
+        // The exponential part is invalid for scientific notation, so it must
+        // be a trailing token for fixed notation. However, fixed notation is
+        // disabled, so report a scientific notation error.
+        return report_parse_error(p, parse_error::missing_exponential_part);
       }
       // Otherwise, we will be ignoring the 'e'.
       p = location_of_e;
@@ -380,12 +405,16 @@
       }
       ++p;
     }
-    if(neg_exp) { exp_number = - exp_number; }
+    if (neg_exp) {
+      exp_number = -exp_number;
+    }
     exponent += exp_number;
   }
 } else {
   // If it scientific and not fixed, we have to bail out.
-  if((fmt & chars_format::scientific) && !(fmt & chars_format::fixed)) { return answer; }
+  if ((fmt & chars_format::scientific) && !(fmt & chars_format::fixed)) {
+    return report_parse_error(p, parse_error::missing_exponential_part);
+  }
 }
 answer.lastmatch = p;
 answer.valid = true;
@@ -400,9 +429,11 @@
     // We have to handle the case where we have 0.0000somenumber.
     // We need to be mindful of the case where we only have zeroes...
     // E.g., 0.000000000...000.
-    UC const * start = start_digits;
+    UC const *start = start_digits;
     while ((start != pend) && (*start == UC('0') || *start == decimal_point)) {
-      if(*start == UC('0')) { digit_count --; }
+      if (*start == UC('0')) {
+        digit_count--;
+      }
       start++;
     }

@@ -413,18 +444,17 @@
       // pre-tokenized spans from above.
       i = 0;
       p = answer.integer.ptr;
-      UC const* int_end = p + answer.integer.len();
-      const uint64_t minimal_nineteen_digit_integer{ 1000000000000000000 };
+      UC const *int_end = p + answer.integer.len();
+      const uint64_t minimal_nineteen_digit_integer{1000000000000000000};
       while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
         i = i * 10 + uint64_t(*p - UC('0'));
         ++p;
       }
       if (i >= minimal_nineteen_digit_integer) { // We have a big integers
         exponent = end_of_integer_part - p + exp_number;
-      }
-      else { // We have a value with a fractional component.
+      } else { // We have a value with a fractional component.
         p = answer.fraction.ptr;
-        UC const* frac_end = p + answer.fraction.len();
+        UC const *frac_end = p + answer.fraction.len();
         while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
           i = i * 10 + uint64_t(*p - UC('0'));
           ++p;
@@ -439,6 +469,111 @@
   return answer;
 }

+template <typename T, typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 from_chars_result_t<UC>
+parse_int_string(UC const *p, UC const *pend, T &value, int base) {
+  from_chars_result_t<UC> answer;
+
+  UC const *const first = p;
+
+  bool negative = (*p == UC('-'));
+  if (!std::is_signed<T>::value && negative) {
+    answer.ec = std::errc::invalid_argument;
+    answer.ptr = first;
+    return answer;
+  }
+#ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default
+  if ((*p == UC('-')) || (*p == UC('+'))) {
+#else
+  if (*p == UC('-')) {
+#endif
+    ++p;
+  }
+
+  UC const *const start_num = p;
+
+  while (p != pend && *p == UC('0')) {
+    ++p;
+  }
+
+  const bool has_leading_zeros = p > start_num;
+
+  UC const *const start_digits = p;
+
+  uint64_t i = 0;
+  if (base == 10) {
+    loop_parse_if_eight_digits(p, pend, i); // use SIMD if possible
+  }
+  while (p != pend) {
+    uint8_t digit = ch_to_digit(*p);
+    if (digit >= base) {
+      break;
+    }
+    i = uint64_t(base) * i + digit; // might overflow, check this later
+    p++;
+  }
+
+  size_t digit_count = size_t(p - start_digits);
+
+  if (digit_count == 0) {
+    if (has_leading_zeros) {
+      value = 0;
+      answer.ec = std::errc();
+      answer.ptr = p;
+    } else {
+      answer.ec = std::errc::invalid_argument;
+      answer.ptr = first;
+    }
+    return answer;
+  }
+
+  answer.ptr = p;
+
+  // check u64 overflow
+  size_t max_digits = max_digits_u64(base);
+  if (digit_count > max_digits) {
+    answer.ec = std::errc::result_out_of_range;
+    return answer;
+  }
+  // this check can be eliminated for all other types, but they will all require
+  // a max_digits(base) equivalent
+  if (digit_count == max_digits && i < min_safe_u64(base)) {
+    answer.ec = std::errc::result_out_of_range;
+    return answer;
+  }
+
+  // check other types overflow
+  if (!std::is_same<T, uint64_t>::value) {
+    if (i > uint64_t(std::numeric_limits<T>::max()) + uint64_t(negative)) {
+      answer.ec = std::errc::result_out_of_range;
+      return answer;
+    }
+  }
+
+  if (negative) {
+#ifdef FASTFLOAT_VISUAL_STUDIO
+#pragma warning(push)
+#pragma warning(disable : 4146)
+#endif
+    // this weird workaround is required because:
+    // - converting unsigned to signed when its value is greater than signed max
+    //   is UB pre-C++23.
+    // - reinterpret_casting (~i + 1) would work, but it is not constexpr
+    // this is always optimized into a neg instruction (note: T is an integer
+    // type)
+    value = T(-std::numeric_limits<T>::max() -
+              T(i - uint64_t(std::numeric_limits<T>::max())));
+#ifdef FASTFLOAT_VISUAL_STUDIO
+#pragma warning(pop)
+#endif
+  } else {
+    value = T(i);
+  }
+
+  answer.ec = std::errc();
+  return answer;
+}
+
 } // namespace fast_float

 #endif
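The negation at the end of parse_int_string() above deserves a gloss: it computes -i as a signed T without casting the out-of-range magnitude in a single step. A standalone sketch of the same arithmetic (not fast_float code):

    #include <cstdint>
    #include <cstdio>
    #include <limits>

    // Mirrors the parse_int_string() trick: for the magnitude i of a negative
    // number (1 <= i <= max(T) + 1), this evaluates to -i. The subtraction is
    // split around max(T), so even the minimum value of T is reached without
    // a direct out-of-range unsigned-to-signed conversion of i itself.
    template <typename T> T negate_magnitude(uint64_t i)
    {
        return T(-std::numeric_limits<T>::max() -
                 T(i - uint64_t(std::numeric_limits<T>::max())));
    }

    int main()
    {
        printf("%d %d\n", int(negate_magnitude<int8_t>(128)),  // -128
               int(negate_magnitude<int8_t>(5)));              // -5
        return 0;
    }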
diff --git a/third_party/fast_float/bigint.h b/third_party/fast_float/bigint.h
index 5076b47cc5c9..03a5caa4a532 100644
--- a/third_party/fast_float/bigint.h
+++ b/third_party/fast_float/bigint.h
@@ -37,8 +37,7 @@ constexpr size_t bigint_limbs = bigint_bits / limb_bits;

 // vector-like type that is allocated on the stack. the entire
 // buffer is pre-allocated, and only the length changes.
-template <uint16_t size>
-struct stackvec {
+template <uint16_t size> struct stackvec {
   limb data[size];
   // we never need more than 150 limbs
   uint16_t length{0};
@@ -54,16 +53,16 @@ struct stackvec {
     FASTFLOAT_ASSERT(try_extend(s));
   }

-  FASTFLOAT_CONSTEXPR14 limb& operator[](size_t index) noexcept {
+  FASTFLOAT_CONSTEXPR14 limb &operator[](size_t index) noexcept {
     FASTFLOAT_DEBUG_ASSERT(index < length);
     return data[index];
   }
-  FASTFLOAT_CONSTEXPR14 const limb& operator[](size_t index) const noexcept {
+  FASTFLOAT_CONSTEXPR14 const limb &operator[](size_t index) const noexcept {
     FASTFLOAT_DEBUG_ASSERT(index < length);
     return data[index];
   }
   // index from the end of the container
-  FASTFLOAT_CONSTEXPR14 const limb& rindex(size_t index) const noexcept {
+  FASTFLOAT_CONSTEXPR14 const limb &rindex(size_t index) const noexcept {
     FASTFLOAT_DEBUG_ASSERT(index < length);
     size_t rindex = length - index - 1;
     return data[rindex];
@@ -73,15 +72,9 @@ struct stackvec {
   FASTFLOAT_CONSTEXPR14 void set_len(size_t len) noexcept {
     length = uint16_t(len);
   }
-  constexpr size_t len() const noexcept {
-    return length;
-  }
-  constexpr bool is_empty() const noexcept {
-    return length == 0;
-  }
-  constexpr size_t capacity() const noexcept {
-    return size;
-  }
+  constexpr size_t len() const noexcept { return length; }
+  constexpr bool is_empty() const noexcept { return length == 0; }
+  constexpr size_t capacity() const noexcept { return size; }
   // append item to vector, without bounds checking
   FASTFLOAT_CONSTEXPR14 void push_unchecked(limb value) noexcept {
     data[length] = value;
@@ -98,7 +91,7 @@ struct stackvec {
   }
   // add items to the vector, from a span, without bounds checking
   FASTFLOAT_CONSTEXPR20 void extend_unchecked(limb_span s) noexcept {
-    limb* ptr = data + length;
+    limb *ptr = data + length;
     std::copy_n(s.ptr, s.len(), ptr);
     set_len(len() + s.len());
   }
@@ -118,8 +111,8 @@ struct stackvec {
   void resize_unchecked(size_t new_len, limb value) noexcept {
     if (new_len > len()) {
       size_t count = new_len - len();
-      limb* first = data + len();
-      limb* last = first + count;
+      limb *first = data + len();
+      limb *last = first + count;
       ::std::fill(first, last, value);
       set_len(new_len);
     } else {
@@ -155,21 +148,21 @@ struct stackvec {
   }
 };

-fastfloat_really_inline FASTFLOAT_CONSTEXPR14
-uint64_t empty_hi64(bool& truncated) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t
+empty_hi64(bool &truncated) noexcept {
   truncated = false;
   return 0;
 }

-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint64_t uint64_hi64(uint64_t r0, bool& truncated) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+uint64_hi64(uint64_t r0, bool &truncated) noexcept {
   truncated = false;
   int shl = leading_zeroes(r0);
   return r0 << shl;
 }

-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint64_t uint64_hi64(uint64_t r0, uint64_t r1, bool& truncated) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+uint64_hi64(uint64_t r0, uint64_t r1, bool &truncated) noexcept {
   int shl = leading_zeroes(r0);
   if (shl == 0) {
     truncated = r1 != 0;
@@ -181,20 +174,20 @@
   }
 }

-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint64_t uint32_hi64(uint32_t r0, bool& truncated) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+uint32_hi64(uint32_t r0, bool &truncated) noexcept {
   return uint64_hi64(r0, truncated);
 }

-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint64_t uint32_hi64(uint32_t r0, uint32_t r1, bool& truncated) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+uint32_hi64(uint32_t r0, uint32_t r1, bool &truncated) noexcept {
   uint64_t x0 = r0;
   uint64_t x1 = r1;
   return uint64_hi64((x0 << 32) | x1, truncated);
 }

-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint64_t uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool& truncated) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool &truncated) noexcept {
   uint64_t x0 = r0;
   uint64_t x1 = r1;
   uint64_t x2 = r2;
@@ -205,17 +198,17 @@
 // we want an efficient operation. for msvc, where
 // we don't have built-in intrinsics, this is still
 // pretty fast.
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-limb scalar_add(limb x, limb y, bool& overflow) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb
+scalar_add(limb x, limb y, bool &overflow) noexcept {
   limb z;
 // gcc and clang
 #if defined(__has_builtin)
-  #if __has_builtin(__builtin_add_overflow)
-  if (!cpp20_and_in_constexpr()) {
-    overflow = __builtin_add_overflow(x, y, &z);
-    return z;
-  }
-  #endif
+#if __has_builtin(__builtin_add_overflow)
+  if (!cpp20_and_in_constexpr()) {
+    overflow = __builtin_add_overflow(x, y, &z);
+    return z;
+  }
+#endif
 #endif

   // generic, this still optimizes correctly on MSVC.
@@ -225,24 +218,24 @@
 }

 // multiply two small integers, getting both the high and low bits.
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-limb scalar_mul(limb x, limb y, limb& carry) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb
+scalar_mul(limb x, limb y, limb &carry) noexcept {
 #ifdef FASTFLOAT_64BIT_LIMB
-  #if defined(__SIZEOF_INT128__)
+#if defined(__SIZEOF_INT128__)
   // GCC and clang both define it as an extension.
   __uint128_t z = __uint128_t(x) * __uint128_t(y) + __uint128_t(carry);
   carry = limb(z >> limb_bits);
   return limb(z);
-  #else
+#else
   // fallback, no native 128-bit integer multiplication with carry.
   // on msvc, this optimizes identically, somehow.
   value128 z = full_multiplication(x, y);
   bool overflow;
   z.low = scalar_add(z.low, carry, overflow);
-  z.high += uint64_t(overflow);  // cannot overflow
+  z.high += uint64_t(overflow); // cannot overflow
   carry = z.high;
   return z.low;
-  #endif
+#endif
 #else
   uint64_t z = uint64_t(x) * uint64_t(y) + uint64_t(carry);
   carry = limb(z >> limb_bits);
@@ -253,8 +246,8 @@
 // add scalar value to bigint starting from offset.
 // used in grade school multiplication
 template <uint16_t size>
-inline FASTFLOAT_CONSTEXPR20
-bool small_add_from(stackvec<size>& vec, limb y, size_t start) noexcept {
+inline FASTFLOAT_CONSTEXPR20 bool small_add_from(stackvec<size> &vec, limb y,
+                                                 size_t start) noexcept {
   size_t index = start;
   limb carry = y;
   bool overflow;
@@ -271,15 +264,15 @@

 // add scalar value to bigint.
 template <uint16_t size>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-bool small_add(stackvec<size>& vec, limb y) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool
+small_add(stackvec<size> &vec, limb y) noexcept {
   return small_add_from(vec, y, 0);
 }

 // multiply bigint by scalar value.
 template <uint16_t size>
-inline FASTFLOAT_CONSTEXPR20
-bool small_mul(stackvec<size>& vec, limb y) noexcept {
+inline FASTFLOAT_CONSTEXPR20 bool small_mul(stackvec<size> &vec,
+                                            limb y) noexcept {
   limb carry = 0;
   for (size_t index = 0; index < vec.len(); index++) {
     vec[index] = scalar_mul(vec[index], y, carry);
@@ -293,12 +286,12 @@
 // add bigint to bigint starting from index.
 // used in grade school multiplication
 template <uint16_t size>
-FASTFLOAT_CONSTEXPR20
-bool large_add_from(stackvec<size>& x, limb_span y, size_t start) noexcept {
+FASTFLOAT_CONSTEXPR20 bool large_add_from(stackvec<size> &x, limb_span y,
+                                          size_t start) noexcept {
   // the effective x buffer is from `xstart..x.len()`, so exit early
   // if we can't get that current range.
   if (x.len() < start || y.len() > x.len() - start) {
-    FASTFLOAT_TRY(x.try_resize(y.len() + start, 0));
+    FASTFLOAT_TRY(x.try_resize(y.len() + start, 0));
   }

   bool carry = false;
@@ -324,15 +317,14 @@

 // add bigint to bigint.
 template <uint16_t size>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-bool large_add_from(stackvec<size>& x, limb_span y) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool
+large_add_from(stackvec<size> &x, limb_span y) noexcept {
   return large_add_from(x, y, 0);
 }

 // grade-school multiplication algorithm
 template <uint16_t size>
-FASTFLOAT_CONSTEXPR20
-bool long_mul(stackvec<size>& x, limb_span y) noexcept {
+FASTFLOAT_CONSTEXPR20 bool long_mul(stackvec<size> &x, limb_span y) noexcept {
   limb_span xs = limb_span(x.data, x.len());
   stackvec<size> z(xs);
   limb_span zs = limb_span(z.data, z.len());
@@ -360,8 +352,7 @@

 // grade-school multiplication algorithm
 template <uint16_t size>
-FASTFLOAT_CONSTEXPR20
-bool large_mul(stackvec<size>& x, limb_span y) noexcept {
+FASTFLOAT_CONSTEXPR20 bool large_mul(stackvec<size> &x, limb_span y) noexcept {
   if (y.len() == 1) {
     FASTFLOAT_TRY(small_mul(x, y[0]));
   } else {
@@ -370,36 +361,58 @@
   return true;
 }

-template <typename = void>
-struct pow5_tables {
+template <typename = void> struct pow5_tables {
   static constexpr uint32_t large_step = 135;
   static constexpr uint64_t small_power_of_5[] = {
-      1UL, 5UL, 25UL, 125UL, 625UL, 3125UL, 15625UL, 78125UL, 390625UL,
-      1953125UL, 9765625UL, 48828125UL, 244140625UL, 1220703125UL,
-      6103515625UL, 30517578125UL, 152587890625UL, 762939453125UL,
-      3814697265625UL, 19073486328125UL, 95367431640625UL, 476837158203125UL,
-      2384185791015625UL, 11920928955078125UL, 59604644775390625UL,
-      298023223876953125UL, 1490116119384765625UL, 7450580596923828125UL,
+      1UL,
+      5UL,
+      25UL,
+      125UL,
+      625UL,
+      3125UL,
+      15625UL,
+      78125UL,
+      390625UL,
+      1953125UL,
+      9765625UL,
+      48828125UL,
+      244140625UL,
+      1220703125UL,
+      6103515625UL,
+      30517578125UL,
+      152587890625UL,
+      762939453125UL,
+      3814697265625UL,
+      19073486328125UL,
+      95367431640625UL,
+      476837158203125UL,
+      2384185791015625UL,
+      11920928955078125UL,
+      59604644775390625UL,
+      298023223876953125UL,
+      1490116119384765625UL,
+      7450580596923828125UL,
   };
 #ifdef FASTFLOAT_64BIT_LIMB
   constexpr static limb large_power_of_5[] = {
-      1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL,
-      10482974169319127550UL, 198276706040285095UL};
+      1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL,
+      10482974169319127550UL, 198276706040285095UL};
 #else
   constexpr static limb large_power_of_5[] = {
-      4279965485U, 329373468U, 4020270615U, 2137533757U, 4287402176U,
-      1057042919U, 1071430142U, 2440757623U, 381945767U, 46164893U};
+      4279965485U, 329373468U,  4020270615U, 2137533757U, 4287402176U,
+      1057042919U, 1071430142U, 2440757623U, 381945767U,  46164893U};
 #endif
 };

-template <typename T>
-constexpr uint32_t pow5_tables<T>::large_step;
+#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE
+
+template <typename T> constexpr uint32_t pow5_tables<T>::large_step;

-template <typename T>
-constexpr uint64_t pow5_tables<T>::small_power_of_5[];
+template <typename T> constexpr uint64_t pow5_tables<T>::small_power_of_5[];

-template <typename T>
-constexpr limb pow5_tables<T>::large_power_of_5[];
+template <typename T> constexpr limb pow5_tables<T>::large_power_of_5[];
+
+#endif

 // big integer type. implements a small subset of big integer
 // arithmetic, using simple algorithms since asymptotically
@@ -409,13 +422,13 @@ struct bigint : pow5_tables<> {
   // storage of the limbs, in little-endian order.
   stackvec<bigint_limbs> vec;

-  FASTFLOAT_CONSTEXPR20 bigint(): vec() {}
+  FASTFLOAT_CONSTEXPR20 bigint() : vec() {}
   bigint(const bigint &) = delete;
   bigint &operator=(const bigint &) = delete;
   bigint(bigint &&) = delete;
   bigint &operator=(bigint &&other) = delete;

-  FASTFLOAT_CONSTEXPR20 bigint(uint64_t value): vec() {
+  FASTFLOAT_CONSTEXPR20 bigint(uint64_t value) : vec() {
 #ifdef FASTFLOAT_64BIT_LIMB
     vec.push_unchecked(value);
 #else
@@ -427,7 +440,7 @@ struct bigint : pow5_tables<> {

   // get the high 64 bits from the vector, and if bits were truncated.
   // this is to get the significant digits for the float.
-  FASTFLOAT_CONSTEXPR20 uint64_t hi64(bool& truncated) const noexcept {
+  FASTFLOAT_CONSTEXPR20 uint64_t hi64(bool &truncated) const noexcept {
 #ifdef FASTFLOAT_64BIT_LIMB
     if (vec.len() == 0) {
       return empty_hi64(truncated);
@@ -446,7 +459,8 @@
     } else if (vec.len() == 2) {
       return uint32_hi64(vec.rindex(0), vec.rindex(1), truncated);
     } else {
-      uint64_t result = uint32_hi64(vec.rindex(0), vec.rindex(1), vec.rindex(2), truncated);
+      uint64_t result =
+          uint32_hi64(vec.rindex(0), vec.rindex(1), vec.rindex(2), truncated);
       truncated |= vec.nonzero(3);
       return result;
     }
@@ -459,7 +473,7 @@
   // positive, this is larger, otherwise they are equal.
   // the limbs are stored in little-endian order, so we
   // must compare the limbs in reverse order.
-  FASTFLOAT_CONSTEXPR20 int compare(const bigint& other) const noexcept {
+  FASTFLOAT_CONSTEXPR20 int compare(const bigint &other) const noexcept {
     if (vec.len() > other.vec.len()) {
       return 1;
     } else if (vec.len() < other.vec.len()) {
@@ -512,12 +526,12 @@
       return false;
     } else if (!vec.is_empty()) {
       // move limbs
-      limb* dst = vec.data + n;
-      const limb* src = vec.data;
+      limb *dst = vec.data + n;
+      const limb *src = vec.data;
       std::copy_backward(src, src + vec.len(), dst + vec.len());
       // fill in empty limbs
-      limb* first = vec.data;
-      limb* last = first + n;
+      limb *first = vec.data;
+      limb *last = first + n;
       ::std::fill(first, last, 0);
       vec.set_len(n + vec.len());
       return true;
@@ -560,18 +574,12 @@
     return int(limb_bits * vec.len()) - lz;
   }

-  FASTFLOAT_CONSTEXPR20 bool mul(limb y) noexcept {
-    return small_mul(vec, y);
-  }
+  FASTFLOAT_CONSTEXPR20 bool mul(limb y) noexcept { return small_mul(vec, y); }

-  FASTFLOAT_CONSTEXPR20 bool add(limb y) noexcept {
-    return small_add(vec, y);
-  }
+  FASTFLOAT_CONSTEXPR20 bool add(limb y) noexcept { return small_add(vec, y); }

   // multiply as if by 2 raised to a power.
-  FASTFLOAT_CONSTEXPR20 bool pow2(uint32_t exp) noexcept {
-    return shl(exp);
-  }
+  FASTFLOAT_CONSTEXPR20 bool pow2(uint32_t exp) noexcept { return shl(exp); }

   // multiply as if by 5 raised to a power.
   FASTFLOAT_CONSTEXPR20 bool pow5(uint32_t exp) noexcept {
@@ -597,9 +605,8 @@
       // Work around clang bug https://godbolt.org/z/zedh7rrhc
       // This is similar to https://github.com/llvm/llvm-project/issues/47746,
       // except the workaround described there don't work here
-      FASTFLOAT_TRY(
-        small_mul(vec, limb(((void)small_power_of_5[0], small_power_of_5[exp])))
-      );
+      FASTFLOAT_TRY(small_mul(
+          vec, limb(((void)small_power_of_5[0], small_power_of_5[exp]))));
     }
     return true;
diff --git a/third_party/fast_float/constexpr_feature_detect.h b/third_party/fast_float/constexpr_feature_detect.h
index ba8b65c64a16..7624beafcacf 100644
--- a/third_party/fast_float/constexpr_feature_detect.h
+++ b/third_party/fast_float/constexpr_feature_detect.h
@@ -20,16 +20,16 @@
 #define FASTFLOAT_HAS_BIT_CAST 0
 #endif

-#if defined(__cpp_lib_is_constant_evaluated) && __cpp_lib_is_constant_evaluated >= 201811L
+#if defined(__cpp_lib_is_constant_evaluated) && \
+    __cpp_lib_is_constant_evaluated >= 201811L
 #define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 1
 #else
 #define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 0
 #endif

 // Testing for relevant C++20 constexpr library features
-#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED \
-    && FASTFLOAT_HAS_BIT_CAST \
-    && __cpp_lib_constexpr_algorithms >= 201806L /*For std::copy and std::fill*/
+#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED && FASTFLOAT_HAS_BIT_CAST && \
+    __cpp_lib_constexpr_algorithms >= 201806L /*For std::copy and std::fill*/
 #define FASTFLOAT_CONSTEXPR20 constexpr
 #define FASTFLOAT_IS_CONSTEXPR 1
 #else
@@ -37,4 +37,10 @@
 #define FASTFLOAT_IS_CONSTEXPR 0
 #endif

+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#define FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE 0
+#else
+#define FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE 1
+#endif
+
 #endif // FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H
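The new FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE guard above exists because the language rule changed in C++17: before C++17 an odr-used static constexpr data member needs an out-of-class definition, while since C++17 such members are implicitly inline. A minimal illustration (not fast_float code):

    #include <cstdio>

    struct Table {
        // In-class declaration; since C++17 this is also a definition.
        static constexpr int values[3] = {1, 2, 3};
    };

    #if __cplusplus < 201703L
    // Required pre-C++17 whenever the member is odr-used.
    constexpr int Table::values[3];
    #endif

    int main()
    {
        const int *p = &Table::values[0];  // odr-use: takes the address
        printf("%d\n", *p);
        return 0;
    }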
diff --git a/third_party/fast_float/decimal_to_binary.h b/third_party/fast_float/decimal_to_binary.h
index fec916f3a07b..70ecf73c8ea6 100644
--- a/third_party/fast_float/decimal_to_binary.h
+++ b/third_party/fast_float/decimal_to_binary.h
@@ -12,27 +12,34 @@

 namespace fast_float {

-// This will compute or rather approximate w * 5**q and return a pair of 64-bit words approximating
-// the result, with the "high" part corresponding to the most significant bits and the
-// low part corresponding to the least significant bits.
+// This will compute or rather approximate w * 5**q and return a pair of 64-bit
+// words approximating the result, with the "high" part corresponding to the
+// most significant bits and the low part corresponding to the least
+// significant bits.
 //
 template <int bit_precision>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-value128 compute_product_approximation(int64_t q, uint64_t w) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 value128
+compute_product_approximation(int64_t q, uint64_t w) {
   const int index = 2 * int(q - powers::smallest_power_of_five);
-  // For small values of q, e.g., q in [0,27], the answer is always exact because
-  // The line value128 firstproduct = full_multiplication(w, power_of_five_128[index]);
-  // gives the exact answer.
-  value128 firstproduct = full_multiplication(w, powers::power_of_five_128[index]);
-  static_assert((bit_precision >= 0) && (bit_precision <= 64), " precision should be in (0,64]");
-  constexpr uint64_t precision_mask = (bit_precision < 64) ?
-               (uint64_t(0xFFFFFFFFFFFFFFFF) >> bit_precision)
-               : uint64_t(0xFFFFFFFFFFFFFFFF);
-  if((firstproduct.high & precision_mask) == precision_mask) { // could further guard with (lower + w < lower)
-    // regarding the second product, we only need secondproduct.high, but our expectation is that the compiler will optimize this extra work away if needed.
-    value128 secondproduct = full_multiplication(w, powers::power_of_five_128[index + 1]);
+  // For small values of q, e.g., q in [0,27], the answer is always exact
+  // because The line value128 firstproduct = full_multiplication(w,
+  // power_of_five_128[index]); gives the exact answer.
+  value128 firstproduct =
+      full_multiplication(w, powers::power_of_five_128[index]);
+  static_assert((bit_precision >= 0) && (bit_precision <= 64),
+                " precision should be in (0,64]");
+  constexpr uint64_t precision_mask =
+      (bit_precision < 64) ? (uint64_t(0xFFFFFFFFFFFFFFFF) >> bit_precision)
+                           : uint64_t(0xFFFFFFFFFFFFFFFF);
+  if ((firstproduct.high & precision_mask) ==
+      precision_mask) { // could further guard with (lower + w < lower)
+    // regarding the second product, we only need secondproduct.high, but our
+    // expectation is that the compiler will optimize this extra work away if
+    // needed.
+    value128 secondproduct =
+        full_multiplication(w, powers::power_of_five_128[index + 1]);
     firstproduct.low += secondproduct.high;
-    if(secondproduct.high > firstproduct.low) {
+    if (secondproduct.high > firstproduct.low) {
       firstproduct.high++;
     }
   }
@@ -55,43 +62,45 @@ namespace detail {
  * where
  * p = log(5**-q)/log(2) = -q * log(5)/log(2)
  */
-  constexpr fastfloat_really_inline int32_t power(int32_t q) noexcept {
-    return (((152170 + 65536) * q) >> 16) + 63;
-  }
+constexpr fastfloat_really_inline int32_t power(int32_t q) noexcept {
+  return (((152170 + 65536) * q) >> 16) + 63;
+}
 } // namespace detail

 // create an adjusted mantissa, biased by the invalid power2
 // for significant digits already multiplied by 10 ** q.
 template <typename binary>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR14
-adjusted_mantissa compute_error_scaled(int64_t q, uint64_t w, int lz) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 adjusted_mantissa
+compute_error_scaled(int64_t q, uint64_t w, int lz) noexcept {
   int hilz = int(w >> 63) ^ 1;
   adjusted_mantissa answer;
   answer.mantissa = w << hilz;
   int bias = binary::mantissa_explicit_bits() - binary::minimum_exponent();
-  answer.power2 = int32_t(detail::power(int32_t(q)) + bias - hilz - lz - 62 + invalid_am_bias);
+  answer.power2 = int32_t(detail::power(int32_t(q)) + bias - hilz - lz - 62 +
+                          invalid_am_bias);
   return answer;
 }

 // w * 10 ** q, without rounding the representation up.
 // the power2 in the exponent will be adjusted by invalid_am_bias.
 template <typename binary>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-adjusted_mantissa compute_error(int64_t q, uint64_t w) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa
+compute_error(int64_t q, uint64_t w) noexcept {
   int lz = leading_zeroes(w);
   w <<= lz;
-  value128 product = compute_product_approximation<binary::mantissa_explicit_bits() + 3>(q, w);
+  value128 product =
+      compute_product_approximation<binary::mantissa_explicit_bits() + 3>(q, w);
   return compute_error_scaled<binary>(q, product.high, lz);
 }

 // w * 10 ** q
-// The returned value should be a valid ieee64 number that simply need to be packed.
-// However, in some very rare cases, the computation will fail. In such cases, we
-// return an adjusted_mantissa with a negative power of 2: the caller should recompute
-// in such cases.
+// The returned value should be a valid ieee64 number that simply need to be
+// packed. However, in some very rare cases, the computation will fail. In such
+// cases, we return an adjusted_mantissa with a negative power of 2: the caller
+// should recompute in such cases.
 template <typename binary>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-adjusted_mantissa compute_float(int64_t q, uint64_t w) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa
+compute_float(int64_t q, uint64_t w) noexcept {
   adjusted_mantissa answer;
   if ((w == 0) || (q < binary::smallest_power_of_ten())) {
     answer.power2 = 0;
@@ -105,7 +114,8 @@ adjusted_mantissa compute_float(int64_t q, uint64_t w) noexcept {
     answer.mantissa = 0;
     return answer;
   }
-  // At this point in time q is in [powers::smallest_power_of_five, powers::largest_power_of_five].
+  // At this point in time q is in [powers::smallest_power_of_five,
+  // powers::largest_power_of_five].

   // We want the most significant bit of i to be 1. Shift if needed.
   int lz = leading_zeroes(w);
@@ -114,26 +124,32 @@
   // The required precision is binary::mantissa_explicit_bits() + 3 because
   // 1. We need the implicit bit
   // 2. We need an extra bit for rounding purposes
-  // 3. We might lose a bit due to the "upperbit" routine (result too small, requiring a shift)
+  // 3. We might lose a bit due to the "upperbit" routine (result too small,
+  //    requiring a shift)

-  value128 product = compute_product_approximation<binary::mantissa_explicit_bits() + 3>(q, w);
+  value128 product =
+      compute_product_approximation<binary::mantissa_explicit_bits() + 3>(q, w);
   // The computed 'product' is always sufficient.
   // Mathematical proof:
-  // Noble Mushtak and Daniel Lemire, Fast Number Parsing Without Fallback (to appear)
-  // See script/mushtak_lemire.py
+  // Noble Mushtak and Daniel Lemire, Fast Number Parsing Without Fallback (to
+  // appear) See script/mushtak_lemire.py

-  // The "compute_product_approximation" function can be slightly slower than a branchless approach:
-  // value128 product = compute_product(q, w);
-  // but in practice, we can win big with the compute_product_approximation if its additional branch
-  // is easily predicted. Which is best is data specific.
+  // The "compute_product_approximation" function can be slightly slower than a
+  // branchless approach: value128 product = compute_product(q, w); but in
+  // practice, we can win big with the compute_product_approximation if its
+  // additional branch is easily predicted. Which is best is data specific.
   int upperbit = int(product.high >> 63);
+  int shift = upperbit + 64 - binary::mantissa_explicit_bits() - 3;

-  answer.mantissa = product.high >> (upperbit + 64 - binary::mantissa_explicit_bits() - 3);
+  answer.mantissa = product.high >> shift;

-  answer.power2 = int32_t(detail::power(int32_t(q)) + upperbit - lz - binary::minimum_exponent());
+  answer.power2 = int32_t(detail::power(int32_t(q)) + upperbit - lz -
+                          binary::minimum_exponent());
   if (answer.power2 <= 0) { // we have a subnormal?
     // Here have that answer.power2 <= 0 so -answer.power2 >= 0
-    if(-answer.power2 + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
+    if (-answer.power2 + 1 >=
+        64) { // if we have more than 64 bits below the minimum exponent, you
+              // have a zero for sure.
       answer.power2 = 0;
       answer.mantissa = 0;
       // result should be zero
@@ -152,20 +168,26 @@
     // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
     // subnormal, but we can only know this after rounding.
     // So we only declare a subnormal if we are smaller than the threshold.
-    answer.power2 = (answer.mantissa < (uint64_t(1) << binary::mantissa_explicit_bits())) ? 0 : 1;
+    answer.power2 =
+        (answer.mantissa < (uint64_t(1) << binary::mantissa_explicit_bits()))
+            ? 0
+            : 1;
     return answer;
   }

   // usually, we round *up*, but if we fall right in between and and we have an
   // even basis, we need to round down
   // We are only concerned with the cases where 5**q fits in single 64-bit word.
-  if ((product.low <= 1) && (q >= binary::min_exponent_round_to_even()) && (q <= binary::max_exponent_round_to_even()) &&
-      ((answer.mantissa & 3) == 1) ) { // we may fall between two floats!
+  if ((product.low <= 1) && (q >= binary::min_exponent_round_to_even()) &&
+      (q <= binary::max_exponent_round_to_even()) &&
+      ((answer.mantissa & 3) == 1)) { // we may fall between two floats!
     // To be in-between two floats we need that in doing
-    // answer.mantissa = product.high >> (upperbit + 64 - binary::mantissa_explicit_bits() - 3);
-    // ... we dropped out only zeroes. But if this happened, then we can go back!!!
-    if((answer.mantissa << (upperbit + 64 - binary::mantissa_explicit_bits() - 3)) == product.high) {
-      answer.mantissa &= ~uint64_t(1);          // flip it so that we do not round up
+    //   answer.mantissa = product.high >> (upperbit + 64 -
+    //   binary::mantissa_explicit_bits() - 3);
+    // ... we dropped out only zeroes. But if this happened, then we can go
+    // back!!!
+    if ((answer.mantissa << shift) == product.high) {
+      answer.mantissa &= ~uint64_t(1); // flip it so that we do not round up
     }
   }
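The compute_float() change above computes `shift` once and reuses it in the round-to-even test. The 128-bit products it relies on come from full_multiplication(); a standalone sketch of that primitive using the __uint128_t extension (fast_float itself selects among several portable implementations):

    #include <cstdint>
    #include <cstdio>

    struct value128 { uint64_t low, high; };

    static value128 full_multiplication(uint64_t a, uint64_t b)
    {
        __uint128_t p = __uint128_t(a) * b;  // exact 64x64 -> 128 product
        return {uint64_t(p), uint64_t(p >> 64)};
    }

    int main()
    {
        value128 r = full_multiplication(0xFFFFFFFFFFFFFFFFULL, 2);
        // Expect high=1, low=0xFFFFFFFFFFFFFFFE.
        printf("high=%llu low=%llu\n",
               (unsigned long long)r.high, (unsigned long long)r.low);
        return 0;
    }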
+ if ((answer.mantissa << shift) == product.high) { + answer.mantissa &= ~uint64_t(1); // flip it so that we do not round up } } diff --git a/third_party/fast_float/digit_comparison.h b/third_party/fast_float/digit_comparison.h index 512a27f5a5f4..303fff91eb1f 100644 --- a/third_party/fast_float/digit_comparison.h +++ b/third_party/fast_float/digit_comparison.h @@ -13,19 +13,34 @@ namespace fast_float { // 1e0 to 1e19 -constexpr static uint64_t powers_of_ten_uint64[] = { - 1UL, 10UL, 100UL, 1000UL, 10000UL, 100000UL, 1000000UL, 10000000UL, 100000000UL, - 1000000000UL, 10000000000UL, 100000000000UL, 1000000000000UL, 10000000000000UL, - 100000000000000UL, 1000000000000000UL, 10000000000000000UL, 100000000000000000UL, - 1000000000000000000UL, 10000000000000000000UL}; +constexpr static uint64_t powers_of_ten_uint64[] = {1UL, + 10UL, + 100UL, + 1000UL, + 10000UL, + 100000UL, + 1000000UL, + 10000000UL, + 100000000UL, + 1000000000UL, + 10000000000UL, + 100000000000UL, + 1000000000000UL, + 10000000000000UL, + 100000000000000UL, + 1000000000000000UL, + 10000000000000000UL, + 100000000000000000UL, + 1000000000000000000UL, + 10000000000000000000UL}; // calculate the exponent, in scientific notation, of the number. // this algorithm is not even close to optimized, but it has no practical // effect on performance: in order to have a faster algorithm, we'd need // to slow down performance for faster algorithms, and this is still fast. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -int32_t scientific_exponent(parsed_number_string_t & num) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int32_t +scientific_exponent(parsed_number_string_t &num) noexcept { uint64_t mantissa = num.mantissa; int32_t exponent = int32_t(num.exponent); while (mantissa >= 10000) { @@ -45,15 +60,16 @@ int32_t scientific_exponent(parsed_number_string_t & num) noexcept { // this converts a native floating-point number to an extended-precision float. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -adjusted_mantissa to_extended(T value) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +to_extended(T value) noexcept { using equiv_uint = typename binary_format::equiv_uint; constexpr equiv_uint exponent_mask = binary_format::exponent_mask(); constexpr equiv_uint mantissa_mask = binary_format::mantissa_mask(); constexpr equiv_uint hidden_bit_mask = binary_format::hidden_bit_mask(); adjusted_mantissa am; - int32_t bias = binary_format::mantissa_explicit_bits() - binary_format::minimum_exponent(); + int32_t bias = binary_format::mantissa_explicit_bits() - + binary_format::minimum_exponent(); equiv_uint bits; #if FASTFLOAT_HAS_BIT_CAST bits = std::bit_cast(value); @@ -66,7 +82,8 @@ adjusted_mantissa to_extended(T value) noexcept { am.mantissa = bits & mantissa_mask; } else { // normal - am.power2 = int32_t((bits & exponent_mask) >> binary_format::mantissa_explicit_bits()); + am.power2 = int32_t((bits & exponent_mask) >> + binary_format::mantissa_explicit_bits()); am.power2 -= bias; am.mantissa = (bits & mantissa_mask) | hidden_bit_mask; } @@ -78,8 +95,8 @@ adjusted_mantissa to_extended(T value) noexcept { // we are given a native float that represents b, so we need to adjust it // halfway between b and b+u. 
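For the binary64 case, the decomposition that to_extended performs above works out to the following, with the format constants written in by hand (a sketch under that assumption only; the library abstracts float and double behind binary_format<T>, and the names here are ours):

    #include <cstdint>
    #include <cstring>

    struct decomposed {
      uint64_t mantissa;
      int32_t power2; // value == mantissa * 2^power2
    };

    inline decomposed to_extended_sketch(double value) {
      const uint64_t exponent_mask = 0x7FF0000000000000ULL;
      const uint64_t mantissa_mask = 0x000FFFFFFFFFFFFFULL;
      const uint64_t hidden_bit = 0x0010000000000000ULL;
      const int32_t bias = 52 + 1023; // mantissa bits minus minimum exponent
      uint64_t bits;
      std::memcpy(&bits, &value, sizeof(bits)); // stand-in for std::bit_cast
      decomposed d;
      if ((bits & exponent_mask) == 0) { // subnormal: no hidden bit
        d.power2 = 1 - bias;
        d.mantissa = bits & mantissa_mask;
      } else { // normal: re-attach the hidden bit
        d.power2 = int32_t((bits & exponent_mask) >> 52) - bias;
        d.mantissa = (bits & mantissa_mask) | hidden_bit;
      }
      return d;
    }

For example, 1.0 decomposes to mantissa 2^52 and power2 -52.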
template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -adjusted_mantissa to_extended_halfway(T value) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +to_extended_halfway(T value) noexcept { adjusted_mantissa am = to_extended(value); am.mantissa <<= 1; am.mantissa += 1; @@ -89,15 +106,18 @@ adjusted_mantissa to_extended_halfway(T value) noexcept { // round an extended-precision float to the nearest machine float. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -void round(adjusted_mantissa& am, callback cb) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void round(adjusted_mantissa &am, + callback cb) noexcept { int32_t mantissa_shift = 64 - binary_format::mantissa_explicit_bits() - 1; if (-am.power2 >= mantissa_shift) { // have a denormal float int32_t shift = -am.power2 + 1; cb(am, std::min(shift, 64)); // check for round-up: if rounding-nearest carried us to the hidden bit. - am.power2 = (am.mantissa < (uint64_t(1) << binary_format::mantissa_explicit_bits())) ? 0 : 1; + am.power2 = (am.mantissa < + (uint64_t(1) << binary_format::mantissa_explicit_bits())) + ? 0 + : 1; return; } @@ -105,7 +125,8 @@ void round(adjusted_mantissa& am, callback cb) noexcept { cb(am, mantissa_shift); // check for carry - if (am.mantissa >= (uint64_t(2) << binary_format::mantissa_explicit_bits())) { + if (am.mantissa >= + (uint64_t(2) << binary_format::mantissa_explicit_bits())) { am.mantissa = (uint64_t(1) << binary_format::mantissa_explicit_bits()); am.power2++; } @@ -119,16 +140,11 @@ void round(adjusted_mantissa& am, callback cb) noexcept { } template -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -void round_nearest_tie_even(adjusted_mantissa& am, int32_t shift, callback cb) noexcept { - const uint64_t mask - = (shift == 64) - ? UINT64_MAX - : (uint64_t(1) << shift) - 1; - const uint64_t halfway - = (shift == 0) - ? 0 - : uint64_t(1) << (shift - 1); +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +round_nearest_tie_even(adjusted_mantissa &am, int32_t shift, + callback cb) noexcept { + const uint64_t mask = (shift == 64) ? UINT64_MAX : (uint64_t(1) << shift) - 1; + const uint64_t halfway = (shift == 0) ? 0 : uint64_t(1) << (shift - 1); uint64_t truncated_bits = am.mantissa & mask; bool is_above = truncated_bits > halfway; bool is_halfway = truncated_bits == halfway; @@ -145,8 +161,8 @@ void round_nearest_tie_even(adjusted_mantissa& am, int32_t shift, callback cb) n am.mantissa += uint64_t(cb(is_odd, is_halfway, is_above)); } -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -void round_down(adjusted_mantissa& am, int32_t shift) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +round_down(adjusted_mantissa &am, int32_t shift) noexcept { if (shift == 64) { am.mantissa = 0; } else { @@ -155,10 +171,11 @@ void round_down(adjusted_mantissa& am, int32_t shift) noexcept { am.power2 += shift; } template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void skip_zeros(UC const * & first, UC const * last) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +skip_zeros(UC const *&first, UC const *last) noexcept { uint64_t val; - while (!cpp20_and_in_constexpr() && std::distance(first, last) >= int_cmp_len()) { + while (!cpp20_and_in_constexpr() && + std::distance(first, last) >= int_cmp_len()) { ::memcpy(&val, first, sizeof(uint64_t)); if (val != int_cmp_zeros()) { break; @@ -176,11 +193,12 @@ void skip_zeros(UC const * & first, UC const * last) noexcept { // determine if any non-zero digits were truncated. 
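The mask/halfway arithmetic in round_nearest_tie_even above is easier to see in a flattened form: a plain round-to-nearest, ties-to-even right shift with no callback indirection (shift restricted to 1..63; the helper name is ours):

    #include <cstdint>

    // Shift v right by `shift` bits, rounding to nearest, ties to even.
    inline uint64_t round_shift_tie_even(uint64_t v, int shift) {
      const uint64_t mask = (uint64_t(1) << shift) - 1;
      const uint64_t halfway = uint64_t(1) << (shift - 1);
      const uint64_t truncated = v & mask;
      v >>= shift;
      const bool is_above = truncated > halfway;
      const bool is_halfway = truncated == halfway;
      const bool is_odd = (v & 1) != 0;
      return v + uint64_t(is_above || (is_halfway && is_odd));
    }

    // round_shift_tie_even(5, 1) == 2 (2.5 ties to even 2)
    // round_shift_tie_even(7, 1) == 4 (3.5 ties to even 4)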
// all characters must be valid digits. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -bool is_truncated(UC const * first, UC const * last) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +is_truncated(UC const *first, UC const *last) noexcept { // do 8-bit optimizations, can just compare to 8 literal 0s. uint64_t val; - while (!cpp20_and_in_constexpr() && std::distance(first, last) >= int_cmp_len()) { + while (!cpp20_and_in_constexpr() && + std::distance(first, last) >= int_cmp_len()) { ::memcpy(&val, first, sizeof(uint64_t)); if (val != int_cmp_zeros()) { return true; @@ -196,15 +214,15 @@ bool is_truncated(UC const * first, UC const * last) noexcept { return false; } template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -bool is_truncated(span s) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +is_truncated(span s) noexcept { return is_truncated(s.ptr, s.ptr + s.len()); } - template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void parse_eight_digits(const UC*& p, limb& value, size_t& counter, size_t& count) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +parse_eight_digits(const UC *&p, limb &value, size_t &counter, + size_t &count) noexcept { value = value * 100000000 + parse_eight_digits_unrolled(p); p += 8; counter += 8; @@ -212,22 +230,23 @@ void parse_eight_digits(const UC*& p, limb& value, size_t& counter, size_t& coun } template -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -void parse_one_digit(UC const *& p, limb& value, size_t& counter, size_t& count) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +parse_one_digit(UC const *&p, limb &value, size_t &counter, + size_t &count) noexcept { value = value * 10 + limb(*p - UC('0')); p++; counter++; count++; } -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void add_native(bigint& big, limb power, limb value) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +add_native(bigint &big, limb power, limb value) noexcept { big.mul(power); big.add(value); } -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void round_up_bigint(bigint& big, size_t& count) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +round_up_bigint(bigint &big, size_t &count) noexcept { // need to round-up the digits, but need to avoid rounding // ....9999 to ...10000, which could cause a false halfway point. add_native(big, 10, 1); @@ -236,8 +255,9 @@ void round_up_bigint(bigint& big, size_t& count) noexcept { // parse the significant digits into a big integer template -inline FASTFLOAT_CONSTEXPR20 -void parse_mantissa(bigint& result, parsed_number_string_t& num, size_t max_digits, size_t& digits) noexcept { +inline FASTFLOAT_CONSTEXPR20 void +parse_mantissa(bigint &result, parsed_number_string_t &num, + size_t max_digits, size_t &digits) noexcept { // try to minimize the number of big integer and scalar multiplication. // therefore, try to parse 8 digits at a time, and multiply by the largest // scalar value (9 or 19 digits) for each step. @@ -251,12 +271,13 @@ void parse_mantissa(bigint& result, parsed_number_string_t& num, size_t max_ #endif // process all integer digits. 
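parse_eight_digits above delegates to parse_eight_digits_unrolled, the SWAR trick that converts eight ASCII digits with three multiplications instead of a per-character loop. An equivalent standalone version (assuming a little-endian host and that all eight bytes are in '0'..'9'; the _sketch name is ours):

    #include <cstdint>
    #include <cstring>

    inline uint32_t parse_eight_digits_sketch(const char *chars) {
      uint64_t val;
      std::memcpy(&val, chars, 8);
      // combine neighbouring digits pairwise: byte, 16-bit, then 32-bit lanes
      val = (val & 0x0F0F0F0F0F0F0F0FULL) * 2561 >> 8;
      val = (val & 0x00FF00FF00FF00FFULL) * 6553601 >> 16;
      return uint32_t((val & 0x0000FFFF0000FFFFULL) * 42949672960001ULL >> 32);
    }

    // parse_eight_digits_sketch("12345678") == 12345678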
- UC const * p = num.integer.ptr; - UC const * pend = p + num.integer.len(); + UC const *p = num.integer.ptr; + UC const *pend = p + num.integer.len(); skip_zeros(p, pend); // process all digits, in increments of step per loop while (p != pend) { - while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) { + while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && + (max_digits - digits >= 8)) { parse_eight_digits(p, value, counter, digits); } while (counter < step && p != pend && digits < max_digits) { @@ -289,7 +310,8 @@ void parse_mantissa(bigint& result, parsed_number_string_t& num, size_t max_ } // process all digits, in increments of step per loop while (p != pend) { - while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) { + while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && + (max_digits - digits >= 8)) { parse_eight_digits(p, value, counter, digits); } while (counter < step && p != pend && digits < max_digits) { @@ -317,19 +339,23 @@ void parse_mantissa(bigint& result, parsed_number_string_t& num, size_t max_ } template -inline FASTFLOAT_CONSTEXPR20 -adjusted_mantissa positive_digit_comp(bigint& bigmant, int32_t exponent) noexcept { +inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +positive_digit_comp(bigint &bigmant, int32_t exponent) noexcept { FASTFLOAT_ASSERT(bigmant.pow10(uint32_t(exponent))); adjusted_mantissa answer; bool truncated; answer.mantissa = bigmant.hi64(truncated); - int bias = binary_format::mantissa_explicit_bits() - binary_format::minimum_exponent(); + int bias = binary_format::mantissa_explicit_bits() - + binary_format::minimum_exponent(); answer.power2 = bigmant.bit_length() - 64 + bias; - round(answer, [truncated](adjusted_mantissa& a, int32_t shift) { - round_nearest_tie_even(a, shift, [truncated](bool is_odd, bool is_halfway, bool is_above) -> bool { - return is_above || (is_halfway && truncated) || (is_odd && is_halfway); - }); + round(answer, [truncated](adjusted_mantissa &a, int32_t shift) { + round_nearest_tie_even( + a, shift, + [truncated](bool is_odd, bool is_halfway, bool is_above) -> bool { + return is_above || (is_halfway && truncated) || + (is_odd && is_halfway); + }); }); return answer; @@ -341,15 +367,17 @@ adjusted_mantissa positive_digit_comp(bigint& bigmant, int32_t exponent) noexcep // we then need to scale by `2^(f - e)`, and then the two significant digits // are of the same magnitude. template -inline FASTFLOAT_CONSTEXPR20 -adjusted_mantissa negative_digit_comp(bigint& bigmant, adjusted_mantissa am, int32_t exponent) noexcept { - bigint& real_digits = bigmant; +inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa negative_digit_comp( + bigint &bigmant, adjusted_mantissa am, int32_t exponent) noexcept { + bigint &real_digits = bigmant; int32_t real_exp = exponent; // get the value of `b`, rounded down, and get a bigint representation of b+h adjusted_mantissa am_b = am; - // gcc7 buf: use a lambda to remove the noexcept qualifier bug with -Wnoexcept-type. - round(am_b, [](adjusted_mantissa&a, int32_t shift) { round_down(a, shift); }); + // gcc7 bug: use a lambda to remove the noexcept qualifier bug with + // -Wnoexcept-type.
+ round(am_b, + [](adjusted_mantissa &a, int32_t shift) { round_down(a, shift); }); T b; to_float(false, am_b, b); adjusted_mantissa theor = to_extended_halfway(b); @@ -371,18 +399,19 @@ adjusted_mantissa negative_digit_comp(bigint& bigmant, adjusted_mantissa am, int // compare digits, and use it to direct rounding int ord = real_digits.compare(theor_digits); adjusted_mantissa answer = am; - round(answer, [ord](adjusted_mantissa& a, int32_t shift) { - round_nearest_tie_even(a, shift, [ord](bool is_odd, bool _, bool __) -> bool { - (void)_; // not needed, since we've done our comparison - (void)__; // not needed, since we've done our comparison - if (ord > 0) { - return true; - } else if (ord < 0) { - return false; - } else { - return is_odd; - } - }); + round(answer, [ord](adjusted_mantissa &a, int32_t shift) { + round_nearest_tie_even( + a, shift, [ord](bool is_odd, bool _, bool __) -> bool { + (void)_; // not needed, since we've done our comparison + (void)__; // not needed, since we've done our comparison + if (ord > 0) { + return true; + } else if (ord < 0) { + return false; + } else { + return is_odd; + } + }); }); return answer; @@ -402,8 +431,8 @@ adjusted_mantissa negative_digit_comp(bigint& bigmant, adjusted_mantissa am, int // the actual digits. we then compare the big integer representations // of both, and use that to direct rounding. template -inline FASTFLOAT_CONSTEXPR20 -adjusted_mantissa digit_comp(parsed_number_string_t& num, adjusted_mantissa am) noexcept { +inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +digit_comp(parsed_number_string_t &num, adjusted_mantissa am) noexcept { // remove the invalid exponent bias am.power2 -= invalid_am_bias; diff --git a/third_party/fast_float/fast_float.h b/third_party/fast_float/fast_float.h index 04efa877ee7b..42a3cdfaf4c4 100644 --- a/third_party/fast_float/fast_float.h +++ b/third_party/fast_float/fast_float.h @@ -6,36 +6,50 @@ namespace fast_float { /** - * This function parses the character sequence [first,last) for a number. It parses floating-point numbers expecting - * a locale-indepent format equivalent to what is used by std::strtod in the default ("C") locale. - * The resulting floating-point value is the closest floating-point values (using either float or double), - * using the "round to even" convention for values that would otherwise fall right in-between two values. - * That is, we provide exact parsing according to the IEEE standard. + * This function parses the character sequence [first,last) for a number. It + * parses floating-point numbers expecting a locale-independent format equivalent + * to what is used by std::strtod in the default ("C") locale. The resulting + * floating-point value is the closest floating-point value (using either float + * or double), using the "round to even" convention for values that would + * otherwise fall right in-between two values. That is, we provide exact parsing + * according to the IEEE standard. * - * Given a successful parse, the pointer (`ptr`) in the returned value is set to point right after the - * parsed number, and the `value` referenced is set to the parsed value. In case of error, the returned - * `ec` contains a representative error, otherwise the default (`std::errc()`) value is stored. + * Given a successful parse, the pointer (`ptr`) in the returned value is set to + * point right after the parsed number, and the `value` referenced is set to the + * parsed value.
In case of error, the returned `ec` contains a representative + * error, otherwise the default (`std::errc()`) value is stored. * - * The implementation does not throw and does not allocate memory (e.g., with `new` or `malloc`). + * The implementation does not throw and does not allocate memory (e.g., with + * `new` or `malloc`). * - * Like the C++17 standard, the `fast_float::from_chars` functions take an optional last argument of - * the type `fast_float::chars_format`. It is a bitset value: we check whether - * `fmt & fast_float::chars_format::fixed` and `fmt & fast_float::chars_format::scientific` are set - * to determine whether we allow the fixed point and scientific notation respectively. - * The default is `fast_float::chars_format::general` which allows both `fixed` and `scientific`. + * Like the C++17 standard, the `fast_float::from_chars` functions take an + * optional last argument of the type `fast_float::chars_format`. It is a bitset + * value: we check whether `fmt & fast_float::chars_format::fixed` and `fmt & + * fast_float::chars_format::scientific` are set to determine whether we allow + * the fixed point and scientific notation respectively. The default is + * `fast_float::chars_format::general` which allows both `fixed` and + * `scientific`. */ -template -FASTFLOAT_CONSTEXPR20 -from_chars_result_t from_chars(UC const * first, UC const * last, - T &value, chars_format fmt = chars_format::general) noexcept; +template ())> +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, UC const *last, T &value, + chars_format fmt = chars_format::general) noexcept; /** * Like from_chars, but accepts an `options` argument to govern number parsing. */ -template -FASTFLOAT_CONSTEXPR20 -from_chars_result_t from_chars_advanced(UC const * first, UC const * last, - T &value, parse_options_t options) noexcept; +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars_advanced(UC const *first, UC const *last, T &value, + parse_options_t options) noexcept; +/** + * from_chars for integer types. + */ +template ())> +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, UC const *last, T &value, int base = 10) noexcept; } // namespace fast_float #include "parse_number.h" diff --git a/third_party/fast_float/fast_table.h b/third_party/fast_float/fast_table.h index d8dc5690517c..69f9b2c9245f 100644 --- a/third_party/fast_float/fast_table.h +++ b/third_party/fast_float/fast_table.h @@ -29,669 +29,677 @@ namespace fast_float { * infinite in binary64 so we never need to worry about powers * of 5 greater than 308. */ -template -struct powers_template { +template struct powers_template { -constexpr static int smallest_power_of_five = binary_format::smallest_power_of_ten(); -constexpr static int largest_power_of_five = binary_format::largest_power_of_ten(); -constexpr static int number_of_entries = 2 * (largest_power_of_five - smallest_power_of_five + 1); -// Powers of five from 5^-342 all the way to 5^308 rounded toward one. 
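As a usage note on the API whose documentation was rewrapped above: the entry point is consumed exactly as in the upstream README (a minimal sketch, assuming the header is reachable as fast_float/fast_float.h):

    #include <cstdio>
    #include <cstring>
    #include <system_error>

    #include "fast_float/fast_float.h"

    int main() {
      const char *input = "3.1416 xyz";
      double result;
      auto answer =
          fast_float::from_chars(input, input + std::strlen(input), result);
      if (answer.ec != std::errc()) {
        std::puts("parsing failure");
        return 1;
      }
      std::printf("parsed %f, stopped before '%s'\n", result, answer.ptr);
      return 0;
    }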
-constexpr static uint64_t power_of_five_128[number_of_entries] = { - 0xeef453d6923bd65a,0x113faa2906a13b3f, - 0x9558b4661b6565f8,0x4ac7ca59a424c507, - 0xbaaee17fa23ebf76,0x5d79bcf00d2df649, - 0xe95a99df8ace6f53,0xf4d82c2c107973dc, - 0x91d8a02bb6c10594,0x79071b9b8a4be869, - 0xb64ec836a47146f9,0x9748e2826cdee284, - 0xe3e27a444d8d98b7,0xfd1b1b2308169b25, - 0x8e6d8c6ab0787f72,0xfe30f0f5e50e20f7, - 0xb208ef855c969f4f,0xbdbd2d335e51a935, - 0xde8b2b66b3bc4723,0xad2c788035e61382, - 0x8b16fb203055ac76,0x4c3bcb5021afcc31, - 0xaddcb9e83c6b1793,0xdf4abe242a1bbf3d, - 0xd953e8624b85dd78,0xd71d6dad34a2af0d, - 0x87d4713d6f33aa6b,0x8672648c40e5ad68, - 0xa9c98d8ccb009506,0x680efdaf511f18c2, - 0xd43bf0effdc0ba48,0x212bd1b2566def2, - 0x84a57695fe98746d,0x14bb630f7604b57, - 0xa5ced43b7e3e9188,0x419ea3bd35385e2d, - 0xcf42894a5dce35ea,0x52064cac828675b9, - 0x818995ce7aa0e1b2,0x7343efebd1940993, - 0xa1ebfb4219491a1f,0x1014ebe6c5f90bf8, - 0xca66fa129f9b60a6,0xd41a26e077774ef6, - 0xfd00b897478238d0,0x8920b098955522b4, - 0x9e20735e8cb16382,0x55b46e5f5d5535b0, - 0xc5a890362fddbc62,0xeb2189f734aa831d, - 0xf712b443bbd52b7b,0xa5e9ec7501d523e4, - 0x9a6bb0aa55653b2d,0x47b233c92125366e, - 0xc1069cd4eabe89f8,0x999ec0bb696e840a, - 0xf148440a256e2c76,0xc00670ea43ca250d, - 0x96cd2a865764dbca,0x380406926a5e5728, - 0xbc807527ed3e12bc,0xc605083704f5ecf2, - 0xeba09271e88d976b,0xf7864a44c633682e, - 0x93445b8731587ea3,0x7ab3ee6afbe0211d, - 0xb8157268fdae9e4c,0x5960ea05bad82964, - 0xe61acf033d1a45df,0x6fb92487298e33bd, - 0x8fd0c16206306bab,0xa5d3b6d479f8e056, - 0xb3c4f1ba87bc8696,0x8f48a4899877186c, - 0xe0b62e2929aba83c,0x331acdabfe94de87, - 0x8c71dcd9ba0b4925,0x9ff0c08b7f1d0b14, - 0xaf8e5410288e1b6f,0x7ecf0ae5ee44dd9, - 0xdb71e91432b1a24a,0xc9e82cd9f69d6150, - 0x892731ac9faf056e,0xbe311c083a225cd2, - 0xab70fe17c79ac6ca,0x6dbd630a48aaf406, - 0xd64d3d9db981787d,0x92cbbccdad5b108, - 0x85f0468293f0eb4e,0x25bbf56008c58ea5, - 0xa76c582338ed2621,0xaf2af2b80af6f24e, - 0xd1476e2c07286faa,0x1af5af660db4aee1, - 0x82cca4db847945ca,0x50d98d9fc890ed4d, - 0xa37fce126597973c,0xe50ff107bab528a0, - 0xcc5fc196fefd7d0c,0x1e53ed49a96272c8, - 0xff77b1fcbebcdc4f,0x25e8e89c13bb0f7a, - 0x9faacf3df73609b1,0x77b191618c54e9ac, - 0xc795830d75038c1d,0xd59df5b9ef6a2417, - 0xf97ae3d0d2446f25,0x4b0573286b44ad1d, - 0x9becce62836ac577,0x4ee367f9430aec32, - 0xc2e801fb244576d5,0x229c41f793cda73f, - 0xf3a20279ed56d48a,0x6b43527578c1110f, - 0x9845418c345644d6,0x830a13896b78aaa9, - 0xbe5691ef416bd60c,0x23cc986bc656d553, - 0xedec366b11c6cb8f,0x2cbfbe86b7ec8aa8, - 0x94b3a202eb1c3f39,0x7bf7d71432f3d6a9, - 0xb9e08a83a5e34f07,0xdaf5ccd93fb0cc53, - 0xe858ad248f5c22c9,0xd1b3400f8f9cff68, - 0x91376c36d99995be,0x23100809b9c21fa1, - 0xb58547448ffffb2d,0xabd40a0c2832a78a, - 0xe2e69915b3fff9f9,0x16c90c8f323f516c, - 0x8dd01fad907ffc3b,0xae3da7d97f6792e3, - 0xb1442798f49ffb4a,0x99cd11cfdf41779c, - 0xdd95317f31c7fa1d,0x40405643d711d583, - 0x8a7d3eef7f1cfc52,0x482835ea666b2572, - 0xad1c8eab5ee43b66,0xda3243650005eecf, - 0xd863b256369d4a40,0x90bed43e40076a82, - 0x873e4f75e2224e68,0x5a7744a6e804a291, - 0xa90de3535aaae202,0x711515d0a205cb36, - 0xd3515c2831559a83,0xd5a5b44ca873e03, - 0x8412d9991ed58091,0xe858790afe9486c2, - 0xa5178fff668ae0b6,0x626e974dbe39a872, - 0xce5d73ff402d98e3,0xfb0a3d212dc8128f, - 0x80fa687f881c7f8e,0x7ce66634bc9d0b99, - 0xa139029f6a239f72,0x1c1fffc1ebc44e80, - 0xc987434744ac874e,0xa327ffb266b56220, - 0xfbe9141915d7a922,0x4bf1ff9f0062baa8, - 0x9d71ac8fada6c9b5,0x6f773fc3603db4a9, - 0xc4ce17b399107c22,0xcb550fb4384d21d3, - 0xf6019da07f549b2b,0x7e2a53a146606a48, - 
0x99c102844f94e0fb,0x2eda7444cbfc426d, - 0xc0314325637a1939,0xfa911155fefb5308, - 0xf03d93eebc589f88,0x793555ab7eba27ca, - 0x96267c7535b763b5,0x4bc1558b2f3458de, - 0xbbb01b9283253ca2,0x9eb1aaedfb016f16, - 0xea9c227723ee8bcb,0x465e15a979c1cadc, - 0x92a1958a7675175f,0xbfacd89ec191ec9, - 0xb749faed14125d36,0xcef980ec671f667b, - 0xe51c79a85916f484,0x82b7e12780e7401a, - 0x8f31cc0937ae58d2,0xd1b2ecb8b0908810, - 0xb2fe3f0b8599ef07,0x861fa7e6dcb4aa15, - 0xdfbdcece67006ac9,0x67a791e093e1d49a, - 0x8bd6a141006042bd,0xe0c8bb2c5c6d24e0, - 0xaecc49914078536d,0x58fae9f773886e18, - 0xda7f5bf590966848,0xaf39a475506a899e, - 0x888f99797a5e012d,0x6d8406c952429603, - 0xaab37fd7d8f58178,0xc8e5087ba6d33b83, - 0xd5605fcdcf32e1d6,0xfb1e4a9a90880a64, - 0x855c3be0a17fcd26,0x5cf2eea09a55067f, - 0xa6b34ad8c9dfc06f,0xf42faa48c0ea481e, - 0xd0601d8efc57b08b,0xf13b94daf124da26, - 0x823c12795db6ce57,0x76c53d08d6b70858, - 0xa2cb1717b52481ed,0x54768c4b0c64ca6e, - 0xcb7ddcdda26da268,0xa9942f5dcf7dfd09, - 0xfe5d54150b090b02,0xd3f93b35435d7c4c, - 0x9efa548d26e5a6e1,0xc47bc5014a1a6daf, - 0xc6b8e9b0709f109a,0x359ab6419ca1091b, - 0xf867241c8cc6d4c0,0xc30163d203c94b62, - 0x9b407691d7fc44f8,0x79e0de63425dcf1d, - 0xc21094364dfb5636,0x985915fc12f542e4, - 0xf294b943e17a2bc4,0x3e6f5b7b17b2939d, - 0x979cf3ca6cec5b5a,0xa705992ceecf9c42, - 0xbd8430bd08277231,0x50c6ff782a838353, - 0xece53cec4a314ebd,0xa4f8bf5635246428, - 0x940f4613ae5ed136,0x871b7795e136be99, - 0xb913179899f68584,0x28e2557b59846e3f, - 0xe757dd7ec07426e5,0x331aeada2fe589cf, - 0x9096ea6f3848984f,0x3ff0d2c85def7621, - 0xb4bca50b065abe63,0xfed077a756b53a9, - 0xe1ebce4dc7f16dfb,0xd3e8495912c62894, - 0x8d3360f09cf6e4bd,0x64712dd7abbbd95c, - 0xb080392cc4349dec,0xbd8d794d96aacfb3, - 0xdca04777f541c567,0xecf0d7a0fc5583a0, - 0x89e42caaf9491b60,0xf41686c49db57244, - 0xac5d37d5b79b6239,0x311c2875c522ced5, - 0xd77485cb25823ac7,0x7d633293366b828b, - 0x86a8d39ef77164bc,0xae5dff9c02033197, - 0xa8530886b54dbdeb,0xd9f57f830283fdfc, - 0xd267caa862a12d66,0xd072df63c324fd7b, - 0x8380dea93da4bc60,0x4247cb9e59f71e6d, - 0xa46116538d0deb78,0x52d9be85f074e608, - 0xcd795be870516656,0x67902e276c921f8b, - 0x806bd9714632dff6,0xba1cd8a3db53b6, - 0xa086cfcd97bf97f3,0x80e8a40eccd228a4, - 0xc8a883c0fdaf7df0,0x6122cd128006b2cd, - 0xfad2a4b13d1b5d6c,0x796b805720085f81, - 0x9cc3a6eec6311a63,0xcbe3303674053bb0, - 0xc3f490aa77bd60fc,0xbedbfc4411068a9c, - 0xf4f1b4d515acb93b,0xee92fb5515482d44, - 0x991711052d8bf3c5,0x751bdd152d4d1c4a, - 0xbf5cd54678eef0b6,0xd262d45a78a0635d, - 0xef340a98172aace4,0x86fb897116c87c34, - 0x9580869f0e7aac0e,0xd45d35e6ae3d4da0, - 0xbae0a846d2195712,0x8974836059cca109, - 0xe998d258869facd7,0x2bd1a438703fc94b, - 0x91ff83775423cc06,0x7b6306a34627ddcf, - 0xb67f6455292cbf08,0x1a3bc84c17b1d542, - 0xe41f3d6a7377eeca,0x20caba5f1d9e4a93, - 0x8e938662882af53e,0x547eb47b7282ee9c, - 0xb23867fb2a35b28d,0xe99e619a4f23aa43, - 0xdec681f9f4c31f31,0x6405fa00e2ec94d4, - 0x8b3c113c38f9f37e,0xde83bc408dd3dd04, - 0xae0b158b4738705e,0x9624ab50b148d445, - 0xd98ddaee19068c76,0x3badd624dd9b0957, - 0x87f8a8d4cfa417c9,0xe54ca5d70a80e5d6, - 0xa9f6d30a038d1dbc,0x5e9fcf4ccd211f4c, - 0xd47487cc8470652b,0x7647c3200069671f, - 0x84c8d4dfd2c63f3b,0x29ecd9f40041e073, - 0xa5fb0a17c777cf09,0xf468107100525890, - 0xcf79cc9db955c2cc,0x7182148d4066eeb4, - 0x81ac1fe293d599bf,0xc6f14cd848405530, - 0xa21727db38cb002f,0xb8ada00e5a506a7c, - 0xca9cf1d206fdc03b,0xa6d90811f0e4851c, - 0xfd442e4688bd304a,0x908f4a166d1da663, - 0x9e4a9cec15763e2e,0x9a598e4e043287fe, - 0xc5dd44271ad3cdba,0x40eff1e1853f29fd, - 
0xf7549530e188c128,0xd12bee59e68ef47c, - 0x9a94dd3e8cf578b9,0x82bb74f8301958ce, - 0xc13a148e3032d6e7,0xe36a52363c1faf01, - 0xf18899b1bc3f8ca1,0xdc44e6c3cb279ac1, - 0x96f5600f15a7b7e5,0x29ab103a5ef8c0b9, - 0xbcb2b812db11a5de,0x7415d448f6b6f0e7, - 0xebdf661791d60f56,0x111b495b3464ad21, - 0x936b9fcebb25c995,0xcab10dd900beec34, - 0xb84687c269ef3bfb,0x3d5d514f40eea742, - 0xe65829b3046b0afa,0xcb4a5a3112a5112, - 0x8ff71a0fe2c2e6dc,0x47f0e785eaba72ab, - 0xb3f4e093db73a093,0x59ed216765690f56, - 0xe0f218b8d25088b8,0x306869c13ec3532c, - 0x8c974f7383725573,0x1e414218c73a13fb, - 0xafbd2350644eeacf,0xe5d1929ef90898fa, - 0xdbac6c247d62a583,0xdf45f746b74abf39, - 0x894bc396ce5da772,0x6b8bba8c328eb783, - 0xab9eb47c81f5114f,0x66ea92f3f326564, - 0xd686619ba27255a2,0xc80a537b0efefebd, - 0x8613fd0145877585,0xbd06742ce95f5f36, - 0xa798fc4196e952e7,0x2c48113823b73704, - 0xd17f3b51fca3a7a0,0xf75a15862ca504c5, - 0x82ef85133de648c4,0x9a984d73dbe722fb, - 0xa3ab66580d5fdaf5,0xc13e60d0d2e0ebba, - 0xcc963fee10b7d1b3,0x318df905079926a8, - 0xffbbcfe994e5c61f,0xfdf17746497f7052, - 0x9fd561f1fd0f9bd3,0xfeb6ea8bedefa633, - 0xc7caba6e7c5382c8,0xfe64a52ee96b8fc0, - 0xf9bd690a1b68637b,0x3dfdce7aa3c673b0, - 0x9c1661a651213e2d,0x6bea10ca65c084e, - 0xc31bfa0fe5698db8,0x486e494fcff30a62, - 0xf3e2f893dec3f126,0x5a89dba3c3efccfa, - 0x986ddb5c6b3a76b7,0xf89629465a75e01c, - 0xbe89523386091465,0xf6bbb397f1135823, - 0xee2ba6c0678b597f,0x746aa07ded582e2c, - 0x94db483840b717ef,0xa8c2a44eb4571cdc, - 0xba121a4650e4ddeb,0x92f34d62616ce413, - 0xe896a0d7e51e1566,0x77b020baf9c81d17, - 0x915e2486ef32cd60,0xace1474dc1d122e, - 0xb5b5ada8aaff80b8,0xd819992132456ba, - 0xe3231912d5bf60e6,0x10e1fff697ed6c69, - 0x8df5efabc5979c8f,0xca8d3ffa1ef463c1, - 0xb1736b96b6fd83b3,0xbd308ff8a6b17cb2, - 0xddd0467c64bce4a0,0xac7cb3f6d05ddbde, - 0x8aa22c0dbef60ee4,0x6bcdf07a423aa96b, - 0xad4ab7112eb3929d,0x86c16c98d2c953c6, - 0xd89d64d57a607744,0xe871c7bf077ba8b7, - 0x87625f056c7c4a8b,0x11471cd764ad4972, - 0xa93af6c6c79b5d2d,0xd598e40d3dd89bcf, - 0xd389b47879823479,0x4aff1d108d4ec2c3, - 0x843610cb4bf160cb,0xcedf722a585139ba, - 0xa54394fe1eedb8fe,0xc2974eb4ee658828, - 0xce947a3da6a9273e,0x733d226229feea32, - 0x811ccc668829b887,0x806357d5a3f525f, - 0xa163ff802a3426a8,0xca07c2dcb0cf26f7, - 0xc9bcff6034c13052,0xfc89b393dd02f0b5, - 0xfc2c3f3841f17c67,0xbbac2078d443ace2, - 0x9d9ba7832936edc0,0xd54b944b84aa4c0d, - 0xc5029163f384a931,0xa9e795e65d4df11, - 0xf64335bcf065d37d,0x4d4617b5ff4a16d5, - 0x99ea0196163fa42e,0x504bced1bf8e4e45, - 0xc06481fb9bcf8d39,0xe45ec2862f71e1d6, - 0xf07da27a82c37088,0x5d767327bb4e5a4c, - 0x964e858c91ba2655,0x3a6a07f8d510f86f, - 0xbbe226efb628afea,0x890489f70a55368b, - 0xeadab0aba3b2dbe5,0x2b45ac74ccea842e, - 0x92c8ae6b464fc96f,0x3b0b8bc90012929d, - 0xb77ada0617e3bbcb,0x9ce6ebb40173744, - 0xe55990879ddcaabd,0xcc420a6a101d0515, - 0x8f57fa54c2a9eab6,0x9fa946824a12232d, - 0xb32df8e9f3546564,0x47939822dc96abf9, - 0xdff9772470297ebd,0x59787e2b93bc56f7, - 0x8bfbea76c619ef36,0x57eb4edb3c55b65a, - 0xaefae51477a06b03,0xede622920b6b23f1, - 0xdab99e59958885c4,0xe95fab368e45eced, - 0x88b402f7fd75539b,0x11dbcb0218ebb414, - 0xaae103b5fcd2a881,0xd652bdc29f26a119, - 0xd59944a37c0752a2,0x4be76d3346f0495f, - 0x857fcae62d8493a5,0x6f70a4400c562ddb, - 0xa6dfbd9fb8e5b88e,0xcb4ccd500f6bb952, - 0xd097ad07a71f26b2,0x7e2000a41346a7a7, - 0x825ecc24c873782f,0x8ed400668c0c28c8, - 0xa2f67f2dfa90563b,0x728900802f0f32fa, - 0xcbb41ef979346bca,0x4f2b40a03ad2ffb9, - 0xfea126b7d78186bc,0xe2f610c84987bfa8, - 0x9f24b832e6b0f436,0xdd9ca7d2df4d7c9, - 
0xc6ede63fa05d3143,0x91503d1c79720dbb, - 0xf8a95fcf88747d94,0x75a44c6397ce912a, - 0x9b69dbe1b548ce7c,0xc986afbe3ee11aba, - 0xc24452da229b021b,0xfbe85badce996168, - 0xf2d56790ab41c2a2,0xfae27299423fb9c3, - 0x97c560ba6b0919a5,0xdccd879fc967d41a, - 0xbdb6b8e905cb600f,0x5400e987bbc1c920, - 0xed246723473e3813,0x290123e9aab23b68, - 0x9436c0760c86e30b,0xf9a0b6720aaf6521, - 0xb94470938fa89bce,0xf808e40e8d5b3e69, - 0xe7958cb87392c2c2,0xb60b1d1230b20e04, - 0x90bd77f3483bb9b9,0xb1c6f22b5e6f48c2, - 0xb4ecd5f01a4aa828,0x1e38aeb6360b1af3, - 0xe2280b6c20dd5232,0x25c6da63c38de1b0, - 0x8d590723948a535f,0x579c487e5a38ad0e, - 0xb0af48ec79ace837,0x2d835a9df0c6d851, - 0xdcdb1b2798182244,0xf8e431456cf88e65, - 0x8a08f0f8bf0f156b,0x1b8e9ecb641b58ff, - 0xac8b2d36eed2dac5,0xe272467e3d222f3f, - 0xd7adf884aa879177,0x5b0ed81dcc6abb0f, - 0x86ccbb52ea94baea,0x98e947129fc2b4e9, - 0xa87fea27a539e9a5,0x3f2398d747b36224, - 0xd29fe4b18e88640e,0x8eec7f0d19a03aad, - 0x83a3eeeef9153e89,0x1953cf68300424ac, - 0xa48ceaaab75a8e2b,0x5fa8c3423c052dd7, - 0xcdb02555653131b6,0x3792f412cb06794d, - 0x808e17555f3ebf11,0xe2bbd88bbee40bd0, - 0xa0b19d2ab70e6ed6,0x5b6aceaeae9d0ec4, - 0xc8de047564d20a8b,0xf245825a5a445275, - 0xfb158592be068d2e,0xeed6e2f0f0d56712, - 0x9ced737bb6c4183d,0x55464dd69685606b, - 0xc428d05aa4751e4c,0xaa97e14c3c26b886, - 0xf53304714d9265df,0xd53dd99f4b3066a8, - 0x993fe2c6d07b7fab,0xe546a8038efe4029, - 0xbf8fdb78849a5f96,0xde98520472bdd033, - 0xef73d256a5c0f77c,0x963e66858f6d4440, - 0x95a8637627989aad,0xdde7001379a44aa8, - 0xbb127c53b17ec159,0x5560c018580d5d52, - 0xe9d71b689dde71af,0xaab8f01e6e10b4a6, - 0x9226712162ab070d,0xcab3961304ca70e8, - 0xb6b00d69bb55c8d1,0x3d607b97c5fd0d22, - 0xe45c10c42a2b3b05,0x8cb89a7db77c506a, - 0x8eb98a7a9a5b04e3,0x77f3608e92adb242, - 0xb267ed1940f1c61c,0x55f038b237591ed3, - 0xdf01e85f912e37a3,0x6b6c46dec52f6688, - 0x8b61313bbabce2c6,0x2323ac4b3b3da015, - 0xae397d8aa96c1b77,0xabec975e0a0d081a, - 0xd9c7dced53c72255,0x96e7bd358c904a21, - 0x881cea14545c7575,0x7e50d64177da2e54, - 0xaa242499697392d2,0xdde50bd1d5d0b9e9, - 0xd4ad2dbfc3d07787,0x955e4ec64b44e864, - 0x84ec3c97da624ab4,0xbd5af13bef0b113e, - 0xa6274bbdd0fadd61,0xecb1ad8aeacdd58e, - 0xcfb11ead453994ba,0x67de18eda5814af2, - 0x81ceb32c4b43fcf4,0x80eacf948770ced7, - 0xa2425ff75e14fc31,0xa1258379a94d028d, - 0xcad2f7f5359a3b3e,0x96ee45813a04330, - 0xfd87b5f28300ca0d,0x8bca9d6e188853fc, - 0x9e74d1b791e07e48,0x775ea264cf55347e, - 0xc612062576589dda,0x95364afe032a819e, - 0xf79687aed3eec551,0x3a83ddbd83f52205, - 0x9abe14cd44753b52,0xc4926a9672793543, - 0xc16d9a0095928a27,0x75b7053c0f178294, - 0xf1c90080baf72cb1,0x5324c68b12dd6339, - 0x971da05074da7bee,0xd3f6fc16ebca5e04, - 0xbce5086492111aea,0x88f4bb1ca6bcf585, - 0xec1e4a7db69561a5,0x2b31e9e3d06c32e6, - 0x9392ee8e921d5d07,0x3aff322e62439fd0, - 0xb877aa3236a4b449,0x9befeb9fad487c3, - 0xe69594bec44de15b,0x4c2ebe687989a9b4, - 0x901d7cf73ab0acd9,0xf9d37014bf60a11, - 0xb424dc35095cd80f,0x538484c19ef38c95, - 0xe12e13424bb40e13,0x2865a5f206b06fba, - 0x8cbccc096f5088cb,0xf93f87b7442e45d4, - 0xafebff0bcb24aafe,0xf78f69a51539d749, - 0xdbe6fecebdedd5be,0xb573440e5a884d1c, - 0x89705f4136b4a597,0x31680a88f8953031, - 0xabcc77118461cefc,0xfdc20d2b36ba7c3e, - 0xd6bf94d5e57a42bc,0x3d32907604691b4d, - 0x8637bd05af6c69b5,0xa63f9a49c2c1b110, - 0xa7c5ac471b478423,0xfcf80dc33721d54, - 0xd1b71758e219652b,0xd3c36113404ea4a9, - 0x83126e978d4fdf3b,0x645a1cac083126ea, - 0xa3d70a3d70a3d70a,0x3d70a3d70a3d70a4, - 0xcccccccccccccccc,0xcccccccccccccccd, - 0x8000000000000000,0x0, - 0xa000000000000000,0x0, - 
0xc800000000000000,0x0, - 0xfa00000000000000,0x0, - 0x9c40000000000000,0x0, - 0xc350000000000000,0x0, - 0xf424000000000000,0x0, - 0x9896800000000000,0x0, - 0xbebc200000000000,0x0, - 0xee6b280000000000,0x0, - 0x9502f90000000000,0x0, - 0xba43b74000000000,0x0, - 0xe8d4a51000000000,0x0, - 0x9184e72a00000000,0x0, - 0xb5e620f480000000,0x0, - 0xe35fa931a0000000,0x0, - 0x8e1bc9bf04000000,0x0, - 0xb1a2bc2ec5000000,0x0, - 0xde0b6b3a76400000,0x0, - 0x8ac7230489e80000,0x0, - 0xad78ebc5ac620000,0x0, - 0xd8d726b7177a8000,0x0, - 0x878678326eac9000,0x0, - 0xa968163f0a57b400,0x0, - 0xd3c21bcecceda100,0x0, - 0x84595161401484a0,0x0, - 0xa56fa5b99019a5c8,0x0, - 0xcecb8f27f4200f3a,0x0, - 0x813f3978f8940984,0x4000000000000000, - 0xa18f07d736b90be5,0x5000000000000000, - 0xc9f2c9cd04674ede,0xa400000000000000, - 0xfc6f7c4045812296,0x4d00000000000000, - 0x9dc5ada82b70b59d,0xf020000000000000, - 0xc5371912364ce305,0x6c28000000000000, - 0xf684df56c3e01bc6,0xc732000000000000, - 0x9a130b963a6c115c,0x3c7f400000000000, - 0xc097ce7bc90715b3,0x4b9f100000000000, - 0xf0bdc21abb48db20,0x1e86d40000000000, - 0x96769950b50d88f4,0x1314448000000000, - 0xbc143fa4e250eb31,0x17d955a000000000, - 0xeb194f8e1ae525fd,0x5dcfab0800000000, - 0x92efd1b8d0cf37be,0x5aa1cae500000000, - 0xb7abc627050305ad,0xf14a3d9e40000000, - 0xe596b7b0c643c719,0x6d9ccd05d0000000, - 0x8f7e32ce7bea5c6f,0xe4820023a2000000, - 0xb35dbf821ae4f38b,0xdda2802c8a800000, - 0xe0352f62a19e306e,0xd50b2037ad200000, - 0x8c213d9da502de45,0x4526f422cc340000, - 0xaf298d050e4395d6,0x9670b12b7f410000, - 0xdaf3f04651d47b4c,0x3c0cdd765f114000, - 0x88d8762bf324cd0f,0xa5880a69fb6ac800, - 0xab0e93b6efee0053,0x8eea0d047a457a00, - 0xd5d238a4abe98068,0x72a4904598d6d880, - 0x85a36366eb71f041,0x47a6da2b7f864750, - 0xa70c3c40a64e6c51,0x999090b65f67d924, - 0xd0cf4b50cfe20765,0xfff4b4e3f741cf6d, - 0x82818f1281ed449f,0xbff8f10e7a8921a4, - 0xa321f2d7226895c7,0xaff72d52192b6a0d, - 0xcbea6f8ceb02bb39,0x9bf4f8a69f764490, - 0xfee50b7025c36a08,0x2f236d04753d5b4, - 0x9f4f2726179a2245,0x1d762422c946590, - 0xc722f0ef9d80aad6,0x424d3ad2b7b97ef5, - 0xf8ebad2b84e0d58b,0xd2e0898765a7deb2, - 0x9b934c3b330c8577,0x63cc55f49f88eb2f, - 0xc2781f49ffcfa6d5,0x3cbf6b71c76b25fb, - 0xf316271c7fc3908a,0x8bef464e3945ef7a, - 0x97edd871cfda3a56,0x97758bf0e3cbb5ac, - 0xbde94e8e43d0c8ec,0x3d52eeed1cbea317, - 0xed63a231d4c4fb27,0x4ca7aaa863ee4bdd, - 0x945e455f24fb1cf8,0x8fe8caa93e74ef6a, - 0xb975d6b6ee39e436,0xb3e2fd538e122b44, - 0xe7d34c64a9c85d44,0x60dbbca87196b616, - 0x90e40fbeea1d3a4a,0xbc8955e946fe31cd, - 0xb51d13aea4a488dd,0x6babab6398bdbe41, - 0xe264589a4dcdab14,0xc696963c7eed2dd1, - 0x8d7eb76070a08aec,0xfc1e1de5cf543ca2, - 0xb0de65388cc8ada8,0x3b25a55f43294bcb, - 0xdd15fe86affad912,0x49ef0eb713f39ebe, - 0x8a2dbf142dfcc7ab,0x6e3569326c784337, - 0xacb92ed9397bf996,0x49c2c37f07965404, - 0xd7e77a8f87daf7fb,0xdc33745ec97be906, - 0x86f0ac99b4e8dafd,0x69a028bb3ded71a3, - 0xa8acd7c0222311bc,0xc40832ea0d68ce0c, - 0xd2d80db02aabd62b,0xf50a3fa490c30190, - 0x83c7088e1aab65db,0x792667c6da79e0fa, - 0xa4b8cab1a1563f52,0x577001b891185938, - 0xcde6fd5e09abcf26,0xed4c0226b55e6f86, - 0x80b05e5ac60b6178,0x544f8158315b05b4, - 0xa0dc75f1778e39d6,0x696361ae3db1c721, - 0xc913936dd571c84c,0x3bc3a19cd1e38e9, - 0xfb5878494ace3a5f,0x4ab48a04065c723, - 0x9d174b2dcec0e47b,0x62eb0d64283f9c76, - 0xc45d1df942711d9a,0x3ba5d0bd324f8394, - 0xf5746577930d6500,0xca8f44ec7ee36479, - 0x9968bf6abbe85f20,0x7e998b13cf4e1ecb, - 0xbfc2ef456ae276e8,0x9e3fedd8c321a67e, - 0xefb3ab16c59b14a2,0xc5cfe94ef3ea101e, - 0x95d04aee3b80ece5,0xbba1f1d158724a12, - 
0xbb445da9ca61281f,0x2a8a6e45ae8edc97, - 0xea1575143cf97226,0xf52d09d71a3293bd, - 0x924d692ca61be758,0x593c2626705f9c56, - 0xb6e0c377cfa2e12e,0x6f8b2fb00c77836c, - 0xe498f455c38b997a,0xb6dfb9c0f956447, - 0x8edf98b59a373fec,0x4724bd4189bd5eac, - 0xb2977ee300c50fe7,0x58edec91ec2cb657, - 0xdf3d5e9bc0f653e1,0x2f2967b66737e3ed, - 0x8b865b215899f46c,0xbd79e0d20082ee74, - 0xae67f1e9aec07187,0xecd8590680a3aa11, - 0xda01ee641a708de9,0xe80e6f4820cc9495, - 0x884134fe908658b2,0x3109058d147fdcdd, - 0xaa51823e34a7eede,0xbd4b46f0599fd415, - 0xd4e5e2cdc1d1ea96,0x6c9e18ac7007c91a, - 0x850fadc09923329e,0x3e2cf6bc604ddb0, - 0xa6539930bf6bff45,0x84db8346b786151c, - 0xcfe87f7cef46ff16,0xe612641865679a63, - 0x81f14fae158c5f6e,0x4fcb7e8f3f60c07e, - 0xa26da3999aef7749,0xe3be5e330f38f09d, - 0xcb090c8001ab551c,0x5cadf5bfd3072cc5, - 0xfdcb4fa002162a63,0x73d9732fc7c8f7f6, - 0x9e9f11c4014dda7e,0x2867e7fddcdd9afa, - 0xc646d63501a1511d,0xb281e1fd541501b8, - 0xf7d88bc24209a565,0x1f225a7ca91a4226, - 0x9ae757596946075f,0x3375788de9b06958, - 0xc1a12d2fc3978937,0x52d6b1641c83ae, - 0xf209787bb47d6b84,0xc0678c5dbd23a49a, - 0x9745eb4d50ce6332,0xf840b7ba963646e0, - 0xbd176620a501fbff,0xb650e5a93bc3d898, - 0xec5d3fa8ce427aff,0xa3e51f138ab4cebe, - 0x93ba47c980e98cdf,0xc66f336c36b10137, - 0xb8a8d9bbe123f017,0xb80b0047445d4184, - 0xe6d3102ad96cec1d,0xa60dc059157491e5, - 0x9043ea1ac7e41392,0x87c89837ad68db2f, - 0xb454e4a179dd1877,0x29babe4598c311fb, - 0xe16a1dc9d8545e94,0xf4296dd6fef3d67a, - 0x8ce2529e2734bb1d,0x1899e4a65f58660c, - 0xb01ae745b101e9e4,0x5ec05dcff72e7f8f, - 0xdc21a1171d42645d,0x76707543f4fa1f73, - 0x899504ae72497eba,0x6a06494a791c53a8, - 0xabfa45da0edbde69,0x487db9d17636892, - 0xd6f8d7509292d603,0x45a9d2845d3c42b6, - 0x865b86925b9bc5c2,0xb8a2392ba45a9b2, - 0xa7f26836f282b732,0x8e6cac7768d7141e, - 0xd1ef0244af2364ff,0x3207d795430cd926, - 0x8335616aed761f1f,0x7f44e6bd49e807b8, - 0xa402b9c5a8d3a6e7,0x5f16206c9c6209a6, - 0xcd036837130890a1,0x36dba887c37a8c0f, - 0x802221226be55a64,0xc2494954da2c9789, - 0xa02aa96b06deb0fd,0xf2db9baa10b7bd6c, - 0xc83553c5c8965d3d,0x6f92829494e5acc7, - 0xfa42a8b73abbf48c,0xcb772339ba1f17f9, - 0x9c69a97284b578d7,0xff2a760414536efb, - 0xc38413cf25e2d70d,0xfef5138519684aba, - 0xf46518c2ef5b8cd1,0x7eb258665fc25d69, - 0x98bf2f79d5993802,0xef2f773ffbd97a61, - 0xbeeefb584aff8603,0xaafb550ffacfd8fa, - 0xeeaaba2e5dbf6784,0x95ba2a53f983cf38, - 0x952ab45cfa97a0b2,0xdd945a747bf26183, - 0xba756174393d88df,0x94f971119aeef9e4, - 0xe912b9d1478ceb17,0x7a37cd5601aab85d, - 0x91abb422ccb812ee,0xac62e055c10ab33a, - 0xb616a12b7fe617aa,0x577b986b314d6009, - 0xe39c49765fdf9d94,0xed5a7e85fda0b80b, - 0x8e41ade9fbebc27d,0x14588f13be847307, - 0xb1d219647ae6b31c,0x596eb2d8ae258fc8, - 0xde469fbd99a05fe3,0x6fca5f8ed9aef3bb, - 0x8aec23d680043bee,0x25de7bb9480d5854, - 0xada72ccc20054ae9,0xaf561aa79a10ae6a, - 0xd910f7ff28069da4,0x1b2ba1518094da04, - 0x87aa9aff79042286,0x90fb44d2f05d0842, - 0xa99541bf57452b28,0x353a1607ac744a53, - 0xd3fa922f2d1675f2,0x42889b8997915ce8, - 0x847c9b5d7c2e09b7,0x69956135febada11, - 0xa59bc234db398c25,0x43fab9837e699095, - 0xcf02b2c21207ef2e,0x94f967e45e03f4bb, - 0x8161afb94b44f57d,0x1d1be0eebac278f5, - 0xa1ba1ba79e1632dc,0x6462d92a69731732, - 0xca28a291859bbf93,0x7d7b8f7503cfdcfe, - 0xfcb2cb35e702af78,0x5cda735244c3d43e, - 0x9defbf01b061adab,0x3a0888136afa64a7, - 0xc56baec21c7a1916,0x88aaa1845b8fdd0, - 0xf6c69a72a3989f5b,0x8aad549e57273d45, - 0x9a3c2087a63f6399,0x36ac54e2f678864b, - 0xc0cb28a98fcf3c7f,0x84576a1bb416a7dd, - 0xf0fdf2d3f3c30b9f,0x656d44a2a11c51d5, - 
0x969eb7c47859e743,0x9f644ae5a4b1b325, - 0xbc4665b596706114,0x873d5d9f0dde1fee, - 0xeb57ff22fc0c7959,0xa90cb506d155a7ea, - 0x9316ff75dd87cbd8,0x9a7f12442d588f2, - 0xb7dcbf5354e9bece,0xc11ed6d538aeb2f, - 0xe5d3ef282a242e81,0x8f1668c8a86da5fa, - 0x8fa475791a569d10,0xf96e017d694487bc, - 0xb38d92d760ec4455,0x37c981dcc395a9ac, - 0xe070f78d3927556a,0x85bbe253f47b1417, - 0x8c469ab843b89562,0x93956d7478ccec8e, - 0xaf58416654a6babb,0x387ac8d1970027b2, - 0xdb2e51bfe9d0696a,0x6997b05fcc0319e, - 0x88fcf317f22241e2,0x441fece3bdf81f03, - 0xab3c2fddeeaad25a,0xd527e81cad7626c3, - 0xd60b3bd56a5586f1,0x8a71e223d8d3b074, - 0x85c7056562757456,0xf6872d5667844e49, - 0xa738c6bebb12d16c,0xb428f8ac016561db, - 0xd106f86e69d785c7,0xe13336d701beba52, - 0x82a45b450226b39c,0xecc0024661173473, - 0xa34d721642b06084,0x27f002d7f95d0190, - 0xcc20ce9bd35c78a5,0x31ec038df7b441f4, - 0xff290242c83396ce,0x7e67047175a15271, - 0x9f79a169bd203e41,0xf0062c6e984d386, - 0xc75809c42c684dd1,0x52c07b78a3e60868, - 0xf92e0c3537826145,0xa7709a56ccdf8a82, - 0x9bbcc7a142b17ccb,0x88a66076400bb691, - 0xc2abf989935ddbfe,0x6acff893d00ea435, - 0xf356f7ebf83552fe,0x583f6b8c4124d43, - 0x98165af37b2153de,0xc3727a337a8b704a, - 0xbe1bf1b059e9a8d6,0x744f18c0592e4c5c, - 0xeda2ee1c7064130c,0x1162def06f79df73, - 0x9485d4d1c63e8be7,0x8addcb5645ac2ba8, - 0xb9a74a0637ce2ee1,0x6d953e2bd7173692, - 0xe8111c87c5c1ba99,0xc8fa8db6ccdd0437, - 0x910ab1d4db9914a0,0x1d9c9892400a22a2, - 0xb54d5e4a127f59c8,0x2503beb6d00cab4b, - 0xe2a0b5dc971f303a,0x2e44ae64840fd61d, - 0x8da471a9de737e24,0x5ceaecfed289e5d2, - 0xb10d8e1456105dad,0x7425a83e872c5f47, - 0xdd50f1996b947518,0xd12f124e28f77719, - 0x8a5296ffe33cc92f,0x82bd6b70d99aaa6f, - 0xace73cbfdc0bfb7b,0x636cc64d1001550b, - 0xd8210befd30efa5a,0x3c47f7e05401aa4e, - 0x8714a775e3e95c78,0x65acfaec34810a71, - 0xa8d9d1535ce3b396,0x7f1839a741a14d0d, - 0xd31045a8341ca07c,0x1ede48111209a050, - 0x83ea2b892091e44d,0x934aed0aab460432, - 0xa4e4b66b68b65d60,0xf81da84d5617853f, - 0xce1de40642e3f4b9,0x36251260ab9d668e, - 0x80d2ae83e9ce78f3,0xc1d72b7c6b426019, - 0xa1075a24e4421730,0xb24cf65b8612f81f, - 0xc94930ae1d529cfc,0xdee033f26797b627, - 0xfb9b7cd9a4a7443c,0x169840ef017da3b1, - 0x9d412e0806e88aa5,0x8e1f289560ee864e, - 0xc491798a08a2ad4e,0xf1a6f2bab92a27e2, - 0xf5b5d7ec8acb58a2,0xae10af696774b1db, - 0x9991a6f3d6bf1765,0xacca6da1e0a8ef29, - 0xbff610b0cc6edd3f,0x17fd090a58d32af3, - 0xeff394dcff8a948e,0xddfc4b4cef07f5b0, - 0x95f83d0a1fb69cd9,0x4abdaf101564f98e, - 0xbb764c4ca7a4440f,0x9d6d1ad41abe37f1, - 0xea53df5fd18d5513,0x84c86189216dc5ed, - 0x92746b9be2f8552c,0x32fd3cf5b4e49bb4, - 0xb7118682dbb66a77,0x3fbc8c33221dc2a1, - 0xe4d5e82392a40515,0xfabaf3feaa5334a, - 0x8f05b1163ba6832d,0x29cb4d87f2a7400e, - 0xb2c71d5bca9023f8,0x743e20e9ef511012, - 0xdf78e4b2bd342cf6,0x914da9246b255416, - 0x8bab8eefb6409c1a,0x1ad089b6c2f7548e, - 0xae9672aba3d0c320,0xa184ac2473b529b1, - 0xda3c0f568cc4f3e8,0xc9e5d72d90a2741e, - 0x8865899617fb1871,0x7e2fa67c7a658892, - 0xaa7eebfb9df9de8d,0xddbb901b98feeab7, - 0xd51ea6fa85785631,0x552a74227f3ea565, - 0x8533285c936b35de,0xd53a88958f87275f, - 0xa67ff273b8460356,0x8a892abaf368f137, - 0xd01fef10a657842c,0x2d2b7569b0432d85, - 0x8213f56a67f6b29b,0x9c3b29620e29fc73, - 0xa298f2c501f45f42,0x8349f3ba91b47b8f, - 0xcb3f2f7642717713,0x241c70a936219a73, - 0xfe0efb53d30dd4d7,0xed238cd383aa0110, - 0x9ec95d1463e8a506,0xf4363804324a40aa, - 0xc67bb4597ce2ce48,0xb143c6053edcd0d5, - 0xf81aa16fdc1b81da,0xdd94b7868e94050a, - 0x9b10a4e5e9913128,0xca7cf2b4191c8326, - 0xc1d4ce1f63f57d72,0xfd1c2f611f63a3f0, - 
0xf24a01a73cf2dccf,0xbc633b39673c8cec, - 0x976e41088617ca01,0xd5be0503e085d813, - 0xbd49d14aa79dbc82,0x4b2d8644d8a74e18, - 0xec9c459d51852ba2,0xddf8e7d60ed1219e, - 0x93e1ab8252f33b45,0xcabb90e5c942b503, - 0xb8da1662e7b00a17,0x3d6a751f3b936243, - 0xe7109bfba19c0c9d,0xcc512670a783ad4, - 0x906a617d450187e2,0x27fb2b80668b24c5, - 0xb484f9dc9641e9da,0xb1f9f660802dedf6, - 0xe1a63853bbd26451,0x5e7873f8a0396973, - 0x8d07e33455637eb2,0xdb0b487b6423e1e8, - 0xb049dc016abc5e5f,0x91ce1a9a3d2cda62, - 0xdc5c5301c56b75f7,0x7641a140cc7810fb, - 0x89b9b3e11b6329ba,0xa9e904c87fcb0a9d, - 0xac2820d9623bf429,0x546345fa9fbdcd44, - 0xd732290fbacaf133,0xa97c177947ad4095, - 0x867f59a9d4bed6c0,0x49ed8eabcccc485d, - 0xa81f301449ee8c70,0x5c68f256bfff5a74, - 0xd226fc195c6a2f8c,0x73832eec6fff3111, - 0x83585d8fd9c25db7,0xc831fd53c5ff7eab, - 0xa42e74f3d032f525,0xba3e7ca8b77f5e55, - 0xcd3a1230c43fb26f,0x28ce1bd2e55f35eb, - 0x80444b5e7aa7cf85,0x7980d163cf5b81b3, - 0xa0555e361951c366,0xd7e105bcc332621f, - 0xc86ab5c39fa63440,0x8dd9472bf3fefaa7, - 0xfa856334878fc150,0xb14f98f6f0feb951, - 0x9c935e00d4b9d8d2,0x6ed1bf9a569f33d3, - 0xc3b8358109e84f07,0xa862f80ec4700c8, - 0xf4a642e14c6262c8,0xcd27bb612758c0fa, - 0x98e7e9cccfbd7dbd,0x8038d51cb897789c, - 0xbf21e44003acdd2c,0xe0470a63e6bd56c3, - 0xeeea5d5004981478,0x1858ccfce06cac74, - 0x95527a5202df0ccb,0xf37801e0c43ebc8, - 0xbaa718e68396cffd,0xd30560258f54e6ba, - 0xe950df20247c83fd,0x47c6b82ef32a2069, - 0x91d28b7416cdd27e,0x4cdc331d57fa5441, - 0xb6472e511c81471d,0xe0133fe4adf8e952, - 0xe3d8f9e563a198e5,0x58180fddd97723a6, - 0x8e679c2f5e44ff8f,0x570f09eaa7ea7648,}; + constexpr static int smallest_power_of_five = + binary_format::smallest_power_of_ten(); + constexpr static int largest_power_of_five = + binary_format::largest_power_of_ten(); + constexpr static int number_of_entries = + 2 * (largest_power_of_five - smallest_power_of_five + 1); + // Powers of five from 5^-342 all the way to 5^308 rounded toward one. 
+ constexpr static uint64_t power_of_five_128[number_of_entries] = { + 0xeef453d6923bd65a, 0x113faa2906a13b3f, + 0x9558b4661b6565f8, 0x4ac7ca59a424c507, + 0xbaaee17fa23ebf76, 0x5d79bcf00d2df649, + 0xe95a99df8ace6f53, 0xf4d82c2c107973dc, + 0x91d8a02bb6c10594, 0x79071b9b8a4be869, + 0xb64ec836a47146f9, 0x9748e2826cdee284, + 0xe3e27a444d8d98b7, 0xfd1b1b2308169b25, + 0x8e6d8c6ab0787f72, 0xfe30f0f5e50e20f7, + 0xb208ef855c969f4f, 0xbdbd2d335e51a935, + 0xde8b2b66b3bc4723, 0xad2c788035e61382, + 0x8b16fb203055ac76, 0x4c3bcb5021afcc31, + 0xaddcb9e83c6b1793, 0xdf4abe242a1bbf3d, + 0xd953e8624b85dd78, 0xd71d6dad34a2af0d, + 0x87d4713d6f33aa6b, 0x8672648c40e5ad68, + 0xa9c98d8ccb009506, 0x680efdaf511f18c2, + 0xd43bf0effdc0ba48, 0x212bd1b2566def2, + 0x84a57695fe98746d, 0x14bb630f7604b57, + 0xa5ced43b7e3e9188, 0x419ea3bd35385e2d, + 0xcf42894a5dce35ea, 0x52064cac828675b9, + 0x818995ce7aa0e1b2, 0x7343efebd1940993, + 0xa1ebfb4219491a1f, 0x1014ebe6c5f90bf8, + 0xca66fa129f9b60a6, 0xd41a26e077774ef6, + 0xfd00b897478238d0, 0x8920b098955522b4, + 0x9e20735e8cb16382, 0x55b46e5f5d5535b0, + 0xc5a890362fddbc62, 0xeb2189f734aa831d, + 0xf712b443bbd52b7b, 0xa5e9ec7501d523e4, + 0x9a6bb0aa55653b2d, 0x47b233c92125366e, + 0xc1069cd4eabe89f8, 0x999ec0bb696e840a, + 0xf148440a256e2c76, 0xc00670ea43ca250d, + 0x96cd2a865764dbca, 0x380406926a5e5728, + 0xbc807527ed3e12bc, 0xc605083704f5ecf2, + 0xeba09271e88d976b, 0xf7864a44c633682e, + 0x93445b8731587ea3, 0x7ab3ee6afbe0211d, + 0xb8157268fdae9e4c, 0x5960ea05bad82964, + 0xe61acf033d1a45df, 0x6fb92487298e33bd, + 0x8fd0c16206306bab, 0xa5d3b6d479f8e056, + 0xb3c4f1ba87bc8696, 0x8f48a4899877186c, + 0xe0b62e2929aba83c, 0x331acdabfe94de87, + 0x8c71dcd9ba0b4925, 0x9ff0c08b7f1d0b14, + 0xaf8e5410288e1b6f, 0x7ecf0ae5ee44dd9, + 0xdb71e91432b1a24a, 0xc9e82cd9f69d6150, + 0x892731ac9faf056e, 0xbe311c083a225cd2, + 0xab70fe17c79ac6ca, 0x6dbd630a48aaf406, + 0xd64d3d9db981787d, 0x92cbbccdad5b108, + 0x85f0468293f0eb4e, 0x25bbf56008c58ea5, + 0xa76c582338ed2621, 0xaf2af2b80af6f24e, + 0xd1476e2c07286faa, 0x1af5af660db4aee1, + 0x82cca4db847945ca, 0x50d98d9fc890ed4d, + 0xa37fce126597973c, 0xe50ff107bab528a0, + 0xcc5fc196fefd7d0c, 0x1e53ed49a96272c8, + 0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7a, + 0x9faacf3df73609b1, 0x77b191618c54e9ac, + 0xc795830d75038c1d, 0xd59df5b9ef6a2417, + 0xf97ae3d0d2446f25, 0x4b0573286b44ad1d, + 0x9becce62836ac577, 0x4ee367f9430aec32, + 0xc2e801fb244576d5, 0x229c41f793cda73f, + 0xf3a20279ed56d48a, 0x6b43527578c1110f, + 0x9845418c345644d6, 0x830a13896b78aaa9, + 0xbe5691ef416bd60c, 0x23cc986bc656d553, + 0xedec366b11c6cb8f, 0x2cbfbe86b7ec8aa8, + 0x94b3a202eb1c3f39, 0x7bf7d71432f3d6a9, + 0xb9e08a83a5e34f07, 0xdaf5ccd93fb0cc53, + 0xe858ad248f5c22c9, 0xd1b3400f8f9cff68, + 0x91376c36d99995be, 0x23100809b9c21fa1, + 0xb58547448ffffb2d, 0xabd40a0c2832a78a, + 0xe2e69915b3fff9f9, 0x16c90c8f323f516c, + 0x8dd01fad907ffc3b, 0xae3da7d97f6792e3, + 0xb1442798f49ffb4a, 0x99cd11cfdf41779c, + 0xdd95317f31c7fa1d, 0x40405643d711d583, + 0x8a7d3eef7f1cfc52, 0x482835ea666b2572, + 0xad1c8eab5ee43b66, 0xda3243650005eecf, + 0xd863b256369d4a40, 0x90bed43e40076a82, + 0x873e4f75e2224e68, 0x5a7744a6e804a291, + 0xa90de3535aaae202, 0x711515d0a205cb36, + 0xd3515c2831559a83, 0xd5a5b44ca873e03, + 0x8412d9991ed58091, 0xe858790afe9486c2, + 0xa5178fff668ae0b6, 0x626e974dbe39a872, + 0xce5d73ff402d98e3, 0xfb0a3d212dc8128f, + 0x80fa687f881c7f8e, 0x7ce66634bc9d0b99, + 0xa139029f6a239f72, 0x1c1fffc1ebc44e80, + 0xc987434744ac874e, 0xa327ffb266b56220, + 0xfbe9141915d7a922, 0x4bf1ff9f0062baa8, + 0x9d71ac8fada6c9b5, 0x6f773fc3603db4a9, + 
0xc4ce17b399107c22, 0xcb550fb4384d21d3, + 0xf6019da07f549b2b, 0x7e2a53a146606a48, + 0x99c102844f94e0fb, 0x2eda7444cbfc426d, + 0xc0314325637a1939, 0xfa911155fefb5308, + 0xf03d93eebc589f88, 0x793555ab7eba27ca, + 0x96267c7535b763b5, 0x4bc1558b2f3458de, + 0xbbb01b9283253ca2, 0x9eb1aaedfb016f16, + 0xea9c227723ee8bcb, 0x465e15a979c1cadc, + 0x92a1958a7675175f, 0xbfacd89ec191ec9, + 0xb749faed14125d36, 0xcef980ec671f667b, + 0xe51c79a85916f484, 0x82b7e12780e7401a, + 0x8f31cc0937ae58d2, 0xd1b2ecb8b0908810, + 0xb2fe3f0b8599ef07, 0x861fa7e6dcb4aa15, + 0xdfbdcece67006ac9, 0x67a791e093e1d49a, + 0x8bd6a141006042bd, 0xe0c8bb2c5c6d24e0, + 0xaecc49914078536d, 0x58fae9f773886e18, + 0xda7f5bf590966848, 0xaf39a475506a899e, + 0x888f99797a5e012d, 0x6d8406c952429603, + 0xaab37fd7d8f58178, 0xc8e5087ba6d33b83, + 0xd5605fcdcf32e1d6, 0xfb1e4a9a90880a64, + 0x855c3be0a17fcd26, 0x5cf2eea09a55067f, + 0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481e, + 0xd0601d8efc57b08b, 0xf13b94daf124da26, + 0x823c12795db6ce57, 0x76c53d08d6b70858, + 0xa2cb1717b52481ed, 0x54768c4b0c64ca6e, + 0xcb7ddcdda26da268, 0xa9942f5dcf7dfd09, + 0xfe5d54150b090b02, 0xd3f93b35435d7c4c, + 0x9efa548d26e5a6e1, 0xc47bc5014a1a6daf, + 0xc6b8e9b0709f109a, 0x359ab6419ca1091b, + 0xf867241c8cc6d4c0, 0xc30163d203c94b62, + 0x9b407691d7fc44f8, 0x79e0de63425dcf1d, + 0xc21094364dfb5636, 0x985915fc12f542e4, + 0xf294b943e17a2bc4, 0x3e6f5b7b17b2939d, + 0x979cf3ca6cec5b5a, 0xa705992ceecf9c42, + 0xbd8430bd08277231, 0x50c6ff782a838353, + 0xece53cec4a314ebd, 0xa4f8bf5635246428, + 0x940f4613ae5ed136, 0x871b7795e136be99, + 0xb913179899f68584, 0x28e2557b59846e3f, + 0xe757dd7ec07426e5, 0x331aeada2fe589cf, + 0x9096ea6f3848984f, 0x3ff0d2c85def7621, + 0xb4bca50b065abe63, 0xfed077a756b53a9, + 0xe1ebce4dc7f16dfb, 0xd3e8495912c62894, + 0x8d3360f09cf6e4bd, 0x64712dd7abbbd95c, + 0xb080392cc4349dec, 0xbd8d794d96aacfb3, + 0xdca04777f541c567, 0xecf0d7a0fc5583a0, + 0x89e42caaf9491b60, 0xf41686c49db57244, + 0xac5d37d5b79b6239, 0x311c2875c522ced5, + 0xd77485cb25823ac7, 0x7d633293366b828b, + 0x86a8d39ef77164bc, 0xae5dff9c02033197, + 0xa8530886b54dbdeb, 0xd9f57f830283fdfc, + 0xd267caa862a12d66, 0xd072df63c324fd7b, + 0x8380dea93da4bc60, 0x4247cb9e59f71e6d, + 0xa46116538d0deb78, 0x52d9be85f074e608, + 0xcd795be870516656, 0x67902e276c921f8b, + 0x806bd9714632dff6, 0xba1cd8a3db53b6, + 0xa086cfcd97bf97f3, 0x80e8a40eccd228a4, + 0xc8a883c0fdaf7df0, 0x6122cd128006b2cd, + 0xfad2a4b13d1b5d6c, 0x796b805720085f81, + 0x9cc3a6eec6311a63, 0xcbe3303674053bb0, + 0xc3f490aa77bd60fc, 0xbedbfc4411068a9c, + 0xf4f1b4d515acb93b, 0xee92fb5515482d44, + 0x991711052d8bf3c5, 0x751bdd152d4d1c4a, + 0xbf5cd54678eef0b6, 0xd262d45a78a0635d, + 0xef340a98172aace4, 0x86fb897116c87c34, + 0x9580869f0e7aac0e, 0xd45d35e6ae3d4da0, + 0xbae0a846d2195712, 0x8974836059cca109, + 0xe998d258869facd7, 0x2bd1a438703fc94b, + 0x91ff83775423cc06, 0x7b6306a34627ddcf, + 0xb67f6455292cbf08, 0x1a3bc84c17b1d542, + 0xe41f3d6a7377eeca, 0x20caba5f1d9e4a93, + 0x8e938662882af53e, 0x547eb47b7282ee9c, + 0xb23867fb2a35b28d, 0xe99e619a4f23aa43, + 0xdec681f9f4c31f31, 0x6405fa00e2ec94d4, + 0x8b3c113c38f9f37e, 0xde83bc408dd3dd04, + 0xae0b158b4738705e, 0x9624ab50b148d445, + 0xd98ddaee19068c76, 0x3badd624dd9b0957, + 0x87f8a8d4cfa417c9, 0xe54ca5d70a80e5d6, + 0xa9f6d30a038d1dbc, 0x5e9fcf4ccd211f4c, + 0xd47487cc8470652b, 0x7647c3200069671f, + 0x84c8d4dfd2c63f3b, 0x29ecd9f40041e073, + 0xa5fb0a17c777cf09, 0xf468107100525890, + 0xcf79cc9db955c2cc, 0x7182148d4066eeb4, + 0x81ac1fe293d599bf, 0xc6f14cd848405530, + 0xa21727db38cb002f, 0xb8ada00e5a506a7c, + 0xca9cf1d206fdc03b, 
0xa6d90811f0e4851c, + 0xfd442e4688bd304a, 0x908f4a166d1da663, + 0x9e4a9cec15763e2e, 0x9a598e4e043287fe, + 0xc5dd44271ad3cdba, 0x40eff1e1853f29fd, + 0xf7549530e188c128, 0xd12bee59e68ef47c, + 0x9a94dd3e8cf578b9, 0x82bb74f8301958ce, + 0xc13a148e3032d6e7, 0xe36a52363c1faf01, + 0xf18899b1bc3f8ca1, 0xdc44e6c3cb279ac1, + 0x96f5600f15a7b7e5, 0x29ab103a5ef8c0b9, + 0xbcb2b812db11a5de, 0x7415d448f6b6f0e7, + 0xebdf661791d60f56, 0x111b495b3464ad21, + 0x936b9fcebb25c995, 0xcab10dd900beec34, + 0xb84687c269ef3bfb, 0x3d5d514f40eea742, + 0xe65829b3046b0afa, 0xcb4a5a3112a5112, + 0x8ff71a0fe2c2e6dc, 0x47f0e785eaba72ab, + 0xb3f4e093db73a093, 0x59ed216765690f56, + 0xe0f218b8d25088b8, 0x306869c13ec3532c, + 0x8c974f7383725573, 0x1e414218c73a13fb, + 0xafbd2350644eeacf, 0xe5d1929ef90898fa, + 0xdbac6c247d62a583, 0xdf45f746b74abf39, + 0x894bc396ce5da772, 0x6b8bba8c328eb783, + 0xab9eb47c81f5114f, 0x66ea92f3f326564, + 0xd686619ba27255a2, 0xc80a537b0efefebd, + 0x8613fd0145877585, 0xbd06742ce95f5f36, + 0xa798fc4196e952e7, 0x2c48113823b73704, + 0xd17f3b51fca3a7a0, 0xf75a15862ca504c5, + 0x82ef85133de648c4, 0x9a984d73dbe722fb, + 0xa3ab66580d5fdaf5, 0xc13e60d0d2e0ebba, + 0xcc963fee10b7d1b3, 0x318df905079926a8, + 0xffbbcfe994e5c61f, 0xfdf17746497f7052, + 0x9fd561f1fd0f9bd3, 0xfeb6ea8bedefa633, + 0xc7caba6e7c5382c8, 0xfe64a52ee96b8fc0, + 0xf9bd690a1b68637b, 0x3dfdce7aa3c673b0, + 0x9c1661a651213e2d, 0x6bea10ca65c084e, + 0xc31bfa0fe5698db8, 0x486e494fcff30a62, + 0xf3e2f893dec3f126, 0x5a89dba3c3efccfa, + 0x986ddb5c6b3a76b7, 0xf89629465a75e01c, + 0xbe89523386091465, 0xf6bbb397f1135823, + 0xee2ba6c0678b597f, 0x746aa07ded582e2c, + 0x94db483840b717ef, 0xa8c2a44eb4571cdc, + 0xba121a4650e4ddeb, 0x92f34d62616ce413, + 0xe896a0d7e51e1566, 0x77b020baf9c81d17, + 0x915e2486ef32cd60, 0xace1474dc1d122e, + 0xb5b5ada8aaff80b8, 0xd819992132456ba, + 0xe3231912d5bf60e6, 0x10e1fff697ed6c69, + 0x8df5efabc5979c8f, 0xca8d3ffa1ef463c1, + 0xb1736b96b6fd83b3, 0xbd308ff8a6b17cb2, + 0xddd0467c64bce4a0, 0xac7cb3f6d05ddbde, + 0x8aa22c0dbef60ee4, 0x6bcdf07a423aa96b, + 0xad4ab7112eb3929d, 0x86c16c98d2c953c6, + 0xd89d64d57a607744, 0xe871c7bf077ba8b7, + 0x87625f056c7c4a8b, 0x11471cd764ad4972, + 0xa93af6c6c79b5d2d, 0xd598e40d3dd89bcf, + 0xd389b47879823479, 0x4aff1d108d4ec2c3, + 0x843610cb4bf160cb, 0xcedf722a585139ba, + 0xa54394fe1eedb8fe, 0xc2974eb4ee658828, + 0xce947a3da6a9273e, 0x733d226229feea32, + 0x811ccc668829b887, 0x806357d5a3f525f, + 0xa163ff802a3426a8, 0xca07c2dcb0cf26f7, + 0xc9bcff6034c13052, 0xfc89b393dd02f0b5, + 0xfc2c3f3841f17c67, 0xbbac2078d443ace2, + 0x9d9ba7832936edc0, 0xd54b944b84aa4c0d, + 0xc5029163f384a931, 0xa9e795e65d4df11, + 0xf64335bcf065d37d, 0x4d4617b5ff4a16d5, + 0x99ea0196163fa42e, 0x504bced1bf8e4e45, + 0xc06481fb9bcf8d39, 0xe45ec2862f71e1d6, + 0xf07da27a82c37088, 0x5d767327bb4e5a4c, + 0x964e858c91ba2655, 0x3a6a07f8d510f86f, + 0xbbe226efb628afea, 0x890489f70a55368b, + 0xeadab0aba3b2dbe5, 0x2b45ac74ccea842e, + 0x92c8ae6b464fc96f, 0x3b0b8bc90012929d, + 0xb77ada0617e3bbcb, 0x9ce6ebb40173744, + 0xe55990879ddcaabd, 0xcc420a6a101d0515, + 0x8f57fa54c2a9eab6, 0x9fa946824a12232d, + 0xb32df8e9f3546564, 0x47939822dc96abf9, + 0xdff9772470297ebd, 0x59787e2b93bc56f7, + 0x8bfbea76c619ef36, 0x57eb4edb3c55b65a, + 0xaefae51477a06b03, 0xede622920b6b23f1, + 0xdab99e59958885c4, 0xe95fab368e45eced, + 0x88b402f7fd75539b, 0x11dbcb0218ebb414, + 0xaae103b5fcd2a881, 0xd652bdc29f26a119, + 0xd59944a37c0752a2, 0x4be76d3346f0495f, + 0x857fcae62d8493a5, 0x6f70a4400c562ddb, + 0xa6dfbd9fb8e5b88e, 0xcb4ccd500f6bb952, + 0xd097ad07a71f26b2, 0x7e2000a41346a7a7, + 
0x825ecc24c873782f, 0x8ed400668c0c28c8, + 0xa2f67f2dfa90563b, 0x728900802f0f32fa, + 0xcbb41ef979346bca, 0x4f2b40a03ad2ffb9, + 0xfea126b7d78186bc, 0xe2f610c84987bfa8, + 0x9f24b832e6b0f436, 0xdd9ca7d2df4d7c9, + 0xc6ede63fa05d3143, 0x91503d1c79720dbb, + 0xf8a95fcf88747d94, 0x75a44c6397ce912a, + 0x9b69dbe1b548ce7c, 0xc986afbe3ee11aba, + 0xc24452da229b021b, 0xfbe85badce996168, + 0xf2d56790ab41c2a2, 0xfae27299423fb9c3, + 0x97c560ba6b0919a5, 0xdccd879fc967d41a, + 0xbdb6b8e905cb600f, 0x5400e987bbc1c920, + 0xed246723473e3813, 0x290123e9aab23b68, + 0x9436c0760c86e30b, 0xf9a0b6720aaf6521, + 0xb94470938fa89bce, 0xf808e40e8d5b3e69, + 0xe7958cb87392c2c2, 0xb60b1d1230b20e04, + 0x90bd77f3483bb9b9, 0xb1c6f22b5e6f48c2, + 0xb4ecd5f01a4aa828, 0x1e38aeb6360b1af3, + 0xe2280b6c20dd5232, 0x25c6da63c38de1b0, + 0x8d590723948a535f, 0x579c487e5a38ad0e, + 0xb0af48ec79ace837, 0x2d835a9df0c6d851, + 0xdcdb1b2798182244, 0xf8e431456cf88e65, + 0x8a08f0f8bf0f156b, 0x1b8e9ecb641b58ff, + 0xac8b2d36eed2dac5, 0xe272467e3d222f3f, + 0xd7adf884aa879177, 0x5b0ed81dcc6abb0f, + 0x86ccbb52ea94baea, 0x98e947129fc2b4e9, + 0xa87fea27a539e9a5, 0x3f2398d747b36224, + 0xd29fe4b18e88640e, 0x8eec7f0d19a03aad, + 0x83a3eeeef9153e89, 0x1953cf68300424ac, + 0xa48ceaaab75a8e2b, 0x5fa8c3423c052dd7, + 0xcdb02555653131b6, 0x3792f412cb06794d, + 0x808e17555f3ebf11, 0xe2bbd88bbee40bd0, + 0xa0b19d2ab70e6ed6, 0x5b6aceaeae9d0ec4, + 0xc8de047564d20a8b, 0xf245825a5a445275, + 0xfb158592be068d2e, 0xeed6e2f0f0d56712, + 0x9ced737bb6c4183d, 0x55464dd69685606b, + 0xc428d05aa4751e4c, 0xaa97e14c3c26b886, + 0xf53304714d9265df, 0xd53dd99f4b3066a8, + 0x993fe2c6d07b7fab, 0xe546a8038efe4029, + 0xbf8fdb78849a5f96, 0xde98520472bdd033, + 0xef73d256a5c0f77c, 0x963e66858f6d4440, + 0x95a8637627989aad, 0xdde7001379a44aa8, + 0xbb127c53b17ec159, 0x5560c018580d5d52, + 0xe9d71b689dde71af, 0xaab8f01e6e10b4a6, + 0x9226712162ab070d, 0xcab3961304ca70e8, + 0xb6b00d69bb55c8d1, 0x3d607b97c5fd0d22, + 0xe45c10c42a2b3b05, 0x8cb89a7db77c506a, + 0x8eb98a7a9a5b04e3, 0x77f3608e92adb242, + 0xb267ed1940f1c61c, 0x55f038b237591ed3, + 0xdf01e85f912e37a3, 0x6b6c46dec52f6688, + 0x8b61313bbabce2c6, 0x2323ac4b3b3da015, + 0xae397d8aa96c1b77, 0xabec975e0a0d081a, + 0xd9c7dced53c72255, 0x96e7bd358c904a21, + 0x881cea14545c7575, 0x7e50d64177da2e54, + 0xaa242499697392d2, 0xdde50bd1d5d0b9e9, + 0xd4ad2dbfc3d07787, 0x955e4ec64b44e864, + 0x84ec3c97da624ab4, 0xbd5af13bef0b113e, + 0xa6274bbdd0fadd61, 0xecb1ad8aeacdd58e, + 0xcfb11ead453994ba, 0x67de18eda5814af2, + 0x81ceb32c4b43fcf4, 0x80eacf948770ced7, + 0xa2425ff75e14fc31, 0xa1258379a94d028d, + 0xcad2f7f5359a3b3e, 0x96ee45813a04330, + 0xfd87b5f28300ca0d, 0x8bca9d6e188853fc, + 0x9e74d1b791e07e48, 0x775ea264cf55347e, + 0xc612062576589dda, 0x95364afe032a819e, + 0xf79687aed3eec551, 0x3a83ddbd83f52205, + 0x9abe14cd44753b52, 0xc4926a9672793543, + 0xc16d9a0095928a27, 0x75b7053c0f178294, + 0xf1c90080baf72cb1, 0x5324c68b12dd6339, + 0x971da05074da7bee, 0xd3f6fc16ebca5e04, + 0xbce5086492111aea, 0x88f4bb1ca6bcf585, + 0xec1e4a7db69561a5, 0x2b31e9e3d06c32e6, + 0x9392ee8e921d5d07, 0x3aff322e62439fd0, + 0xb877aa3236a4b449, 0x9befeb9fad487c3, + 0xe69594bec44de15b, 0x4c2ebe687989a9b4, + 0x901d7cf73ab0acd9, 0xf9d37014bf60a11, + 0xb424dc35095cd80f, 0x538484c19ef38c95, + 0xe12e13424bb40e13, 0x2865a5f206b06fba, + 0x8cbccc096f5088cb, 0xf93f87b7442e45d4, + 0xafebff0bcb24aafe, 0xf78f69a51539d749, + 0xdbe6fecebdedd5be, 0xb573440e5a884d1c, + 0x89705f4136b4a597, 0x31680a88f8953031, + 0xabcc77118461cefc, 0xfdc20d2b36ba7c3e, + 0xd6bf94d5e57a42bc, 0x3d32907604691b4d, + 0x8637bd05af6c69b5, 
0xa63f9a49c2c1b110, + 0xa7c5ac471b478423, 0xfcf80dc33721d54, + 0xd1b71758e219652b, 0xd3c36113404ea4a9, + 0x83126e978d4fdf3b, 0x645a1cac083126ea, + 0xa3d70a3d70a3d70a, 0x3d70a3d70a3d70a4, + 0xcccccccccccccccc, 0xcccccccccccccccd, + 0x8000000000000000, 0x0, + 0xa000000000000000, 0x0, + 0xc800000000000000, 0x0, + 0xfa00000000000000, 0x0, + 0x9c40000000000000, 0x0, + 0xc350000000000000, 0x0, + 0xf424000000000000, 0x0, + 0x9896800000000000, 0x0, + 0xbebc200000000000, 0x0, + 0xee6b280000000000, 0x0, + 0x9502f90000000000, 0x0, + 0xba43b74000000000, 0x0, + 0xe8d4a51000000000, 0x0, + 0x9184e72a00000000, 0x0, + 0xb5e620f480000000, 0x0, + 0xe35fa931a0000000, 0x0, + 0x8e1bc9bf04000000, 0x0, + 0xb1a2bc2ec5000000, 0x0, + 0xde0b6b3a76400000, 0x0, + 0x8ac7230489e80000, 0x0, + 0xad78ebc5ac620000, 0x0, + 0xd8d726b7177a8000, 0x0, + 0x878678326eac9000, 0x0, + 0xa968163f0a57b400, 0x0, + 0xd3c21bcecceda100, 0x0, + 0x84595161401484a0, 0x0, + 0xa56fa5b99019a5c8, 0x0, + 0xcecb8f27f4200f3a, 0x0, + 0x813f3978f8940984, 0x4000000000000000, + 0xa18f07d736b90be5, 0x5000000000000000, + 0xc9f2c9cd04674ede, 0xa400000000000000, + 0xfc6f7c4045812296, 0x4d00000000000000, + 0x9dc5ada82b70b59d, 0xf020000000000000, + 0xc5371912364ce305, 0x6c28000000000000, + 0xf684df56c3e01bc6, 0xc732000000000000, + 0x9a130b963a6c115c, 0x3c7f400000000000, + 0xc097ce7bc90715b3, 0x4b9f100000000000, + 0xf0bdc21abb48db20, 0x1e86d40000000000, + 0x96769950b50d88f4, 0x1314448000000000, + 0xbc143fa4e250eb31, 0x17d955a000000000, + 0xeb194f8e1ae525fd, 0x5dcfab0800000000, + 0x92efd1b8d0cf37be, 0x5aa1cae500000000, + 0xb7abc627050305ad, 0xf14a3d9e40000000, + 0xe596b7b0c643c719, 0x6d9ccd05d0000000, + 0x8f7e32ce7bea5c6f, 0xe4820023a2000000, + 0xb35dbf821ae4f38b, 0xdda2802c8a800000, + 0xe0352f62a19e306e, 0xd50b2037ad200000, + 0x8c213d9da502de45, 0x4526f422cc340000, + 0xaf298d050e4395d6, 0x9670b12b7f410000, + 0xdaf3f04651d47b4c, 0x3c0cdd765f114000, + 0x88d8762bf324cd0f, 0xa5880a69fb6ac800, + 0xab0e93b6efee0053, 0x8eea0d047a457a00, + 0xd5d238a4abe98068, 0x72a4904598d6d880, + 0x85a36366eb71f041, 0x47a6da2b7f864750, + 0xa70c3c40a64e6c51, 0x999090b65f67d924, + 0xd0cf4b50cfe20765, 0xfff4b4e3f741cf6d, + 0x82818f1281ed449f, 0xbff8f10e7a8921a4, + 0xa321f2d7226895c7, 0xaff72d52192b6a0d, + 0xcbea6f8ceb02bb39, 0x9bf4f8a69f764490, + 0xfee50b7025c36a08, 0x2f236d04753d5b4, + 0x9f4f2726179a2245, 0x1d762422c946590, + 0xc722f0ef9d80aad6, 0x424d3ad2b7b97ef5, + 0xf8ebad2b84e0d58b, 0xd2e0898765a7deb2, + 0x9b934c3b330c8577, 0x63cc55f49f88eb2f, + 0xc2781f49ffcfa6d5, 0x3cbf6b71c76b25fb, + 0xf316271c7fc3908a, 0x8bef464e3945ef7a, + 0x97edd871cfda3a56, 0x97758bf0e3cbb5ac, + 0xbde94e8e43d0c8ec, 0x3d52eeed1cbea317, + 0xed63a231d4c4fb27, 0x4ca7aaa863ee4bdd, + 0x945e455f24fb1cf8, 0x8fe8caa93e74ef6a, + 0xb975d6b6ee39e436, 0xb3e2fd538e122b44, + 0xe7d34c64a9c85d44, 0x60dbbca87196b616, + 0x90e40fbeea1d3a4a, 0xbc8955e946fe31cd, + 0xb51d13aea4a488dd, 0x6babab6398bdbe41, + 0xe264589a4dcdab14, 0xc696963c7eed2dd1, + 0x8d7eb76070a08aec, 0xfc1e1de5cf543ca2, + 0xb0de65388cc8ada8, 0x3b25a55f43294bcb, + 0xdd15fe86affad912, 0x49ef0eb713f39ebe, + 0x8a2dbf142dfcc7ab, 0x6e3569326c784337, + 0xacb92ed9397bf996, 0x49c2c37f07965404, + 0xd7e77a8f87daf7fb, 0xdc33745ec97be906, + 0x86f0ac99b4e8dafd, 0x69a028bb3ded71a3, + 0xa8acd7c0222311bc, 0xc40832ea0d68ce0c, + 0xd2d80db02aabd62b, 0xf50a3fa490c30190, + 0x83c7088e1aab65db, 0x792667c6da79e0fa, + 0xa4b8cab1a1563f52, 0x577001b891185938, + 0xcde6fd5e09abcf26, 0xed4c0226b55e6f86, + 0x80b05e5ac60b6178, 0x544f8158315b05b4, + 0xa0dc75f1778e39d6, 0x696361ae3db1c721, + 
0xc913936dd571c84c, 0x3bc3a19cd1e38e9, + 0xfb5878494ace3a5f, 0x4ab48a04065c723, + 0x9d174b2dcec0e47b, 0x62eb0d64283f9c76, + 0xc45d1df942711d9a, 0x3ba5d0bd324f8394, + 0xf5746577930d6500, 0xca8f44ec7ee36479, + 0x9968bf6abbe85f20, 0x7e998b13cf4e1ecb, + 0xbfc2ef456ae276e8, 0x9e3fedd8c321a67e, + 0xefb3ab16c59b14a2, 0xc5cfe94ef3ea101e, + 0x95d04aee3b80ece5, 0xbba1f1d158724a12, + 0xbb445da9ca61281f, 0x2a8a6e45ae8edc97, + 0xea1575143cf97226, 0xf52d09d71a3293bd, + 0x924d692ca61be758, 0x593c2626705f9c56, + 0xb6e0c377cfa2e12e, 0x6f8b2fb00c77836c, + 0xe498f455c38b997a, 0xb6dfb9c0f956447, + 0x8edf98b59a373fec, 0x4724bd4189bd5eac, + 0xb2977ee300c50fe7, 0x58edec91ec2cb657, + 0xdf3d5e9bc0f653e1, 0x2f2967b66737e3ed, + 0x8b865b215899f46c, 0xbd79e0d20082ee74, + 0xae67f1e9aec07187, 0xecd8590680a3aa11, + 0xda01ee641a708de9, 0xe80e6f4820cc9495, + 0x884134fe908658b2, 0x3109058d147fdcdd, + 0xaa51823e34a7eede, 0xbd4b46f0599fd415, + 0xd4e5e2cdc1d1ea96, 0x6c9e18ac7007c91a, + 0x850fadc09923329e, 0x3e2cf6bc604ddb0, + 0xa6539930bf6bff45, 0x84db8346b786151c, + 0xcfe87f7cef46ff16, 0xe612641865679a63, + 0x81f14fae158c5f6e, 0x4fcb7e8f3f60c07e, + 0xa26da3999aef7749, 0xe3be5e330f38f09d, + 0xcb090c8001ab551c, 0x5cadf5bfd3072cc5, + 0xfdcb4fa002162a63, 0x73d9732fc7c8f7f6, + 0x9e9f11c4014dda7e, 0x2867e7fddcdd9afa, + 0xc646d63501a1511d, 0xb281e1fd541501b8, + 0xf7d88bc24209a565, 0x1f225a7ca91a4226, + 0x9ae757596946075f, 0x3375788de9b06958, + 0xc1a12d2fc3978937, 0x52d6b1641c83ae, + 0xf209787bb47d6b84, 0xc0678c5dbd23a49a, + 0x9745eb4d50ce6332, 0xf840b7ba963646e0, + 0xbd176620a501fbff, 0xb650e5a93bc3d898, + 0xec5d3fa8ce427aff, 0xa3e51f138ab4cebe, + 0x93ba47c980e98cdf, 0xc66f336c36b10137, + 0xb8a8d9bbe123f017, 0xb80b0047445d4184, + 0xe6d3102ad96cec1d, 0xa60dc059157491e5, + 0x9043ea1ac7e41392, 0x87c89837ad68db2f, + 0xb454e4a179dd1877, 0x29babe4598c311fb, + 0xe16a1dc9d8545e94, 0xf4296dd6fef3d67a, + 0x8ce2529e2734bb1d, 0x1899e4a65f58660c, + 0xb01ae745b101e9e4, 0x5ec05dcff72e7f8f, + 0xdc21a1171d42645d, 0x76707543f4fa1f73, + 0x899504ae72497eba, 0x6a06494a791c53a8, + 0xabfa45da0edbde69, 0x487db9d17636892, + 0xd6f8d7509292d603, 0x45a9d2845d3c42b6, + 0x865b86925b9bc5c2, 0xb8a2392ba45a9b2, + 0xa7f26836f282b732, 0x8e6cac7768d7141e, + 0xd1ef0244af2364ff, 0x3207d795430cd926, + 0x8335616aed761f1f, 0x7f44e6bd49e807b8, + 0xa402b9c5a8d3a6e7, 0x5f16206c9c6209a6, + 0xcd036837130890a1, 0x36dba887c37a8c0f, + 0x802221226be55a64, 0xc2494954da2c9789, + 0xa02aa96b06deb0fd, 0xf2db9baa10b7bd6c, + 0xc83553c5c8965d3d, 0x6f92829494e5acc7, + 0xfa42a8b73abbf48c, 0xcb772339ba1f17f9, + 0x9c69a97284b578d7, 0xff2a760414536efb, + 0xc38413cf25e2d70d, 0xfef5138519684aba, + 0xf46518c2ef5b8cd1, 0x7eb258665fc25d69, + 0x98bf2f79d5993802, 0xef2f773ffbd97a61, + 0xbeeefb584aff8603, 0xaafb550ffacfd8fa, + 0xeeaaba2e5dbf6784, 0x95ba2a53f983cf38, + 0x952ab45cfa97a0b2, 0xdd945a747bf26183, + 0xba756174393d88df, 0x94f971119aeef9e4, + 0xe912b9d1478ceb17, 0x7a37cd5601aab85d, + 0x91abb422ccb812ee, 0xac62e055c10ab33a, + 0xb616a12b7fe617aa, 0x577b986b314d6009, + 0xe39c49765fdf9d94, 0xed5a7e85fda0b80b, + 0x8e41ade9fbebc27d, 0x14588f13be847307, + 0xb1d219647ae6b31c, 0x596eb2d8ae258fc8, + 0xde469fbd99a05fe3, 0x6fca5f8ed9aef3bb, + 0x8aec23d680043bee, 0x25de7bb9480d5854, + 0xada72ccc20054ae9, 0xaf561aa79a10ae6a, + 0xd910f7ff28069da4, 0x1b2ba1518094da04, + 0x87aa9aff79042286, 0x90fb44d2f05d0842, + 0xa99541bf57452b28, 0x353a1607ac744a53, + 0xd3fa922f2d1675f2, 0x42889b8997915ce8, + 0x847c9b5d7c2e09b7, 0x69956135febada11, + 0xa59bc234db398c25, 0x43fab9837e699095, + 0xcf02b2c21207ef2e, 
0x94f967e45e03f4bb, + 0x8161afb94b44f57d, 0x1d1be0eebac278f5, + 0xa1ba1ba79e1632dc, 0x6462d92a69731732, + 0xca28a291859bbf93, 0x7d7b8f7503cfdcfe, + 0xfcb2cb35e702af78, 0x5cda735244c3d43e, + 0x9defbf01b061adab, 0x3a0888136afa64a7, + 0xc56baec21c7a1916, 0x88aaa1845b8fdd0, + 0xf6c69a72a3989f5b, 0x8aad549e57273d45, + 0x9a3c2087a63f6399, 0x36ac54e2f678864b, + 0xc0cb28a98fcf3c7f, 0x84576a1bb416a7dd, + 0xf0fdf2d3f3c30b9f, 0x656d44a2a11c51d5, + 0x969eb7c47859e743, 0x9f644ae5a4b1b325, + 0xbc4665b596706114, 0x873d5d9f0dde1fee, + 0xeb57ff22fc0c7959, 0xa90cb506d155a7ea, + 0x9316ff75dd87cbd8, 0x9a7f12442d588f2, + 0xb7dcbf5354e9bece, 0xc11ed6d538aeb2f, + 0xe5d3ef282a242e81, 0x8f1668c8a86da5fa, + 0x8fa475791a569d10, 0xf96e017d694487bc, + 0xb38d92d760ec4455, 0x37c981dcc395a9ac, + 0xe070f78d3927556a, 0x85bbe253f47b1417, + 0x8c469ab843b89562, 0x93956d7478ccec8e, + 0xaf58416654a6babb, 0x387ac8d1970027b2, + 0xdb2e51bfe9d0696a, 0x6997b05fcc0319e, + 0x88fcf317f22241e2, 0x441fece3bdf81f03, + 0xab3c2fddeeaad25a, 0xd527e81cad7626c3, + 0xd60b3bd56a5586f1, 0x8a71e223d8d3b074, + 0x85c7056562757456, 0xf6872d5667844e49, + 0xa738c6bebb12d16c, 0xb428f8ac016561db, + 0xd106f86e69d785c7, 0xe13336d701beba52, + 0x82a45b450226b39c, 0xecc0024661173473, + 0xa34d721642b06084, 0x27f002d7f95d0190, + 0xcc20ce9bd35c78a5, 0x31ec038df7b441f4, + 0xff290242c83396ce, 0x7e67047175a15271, + 0x9f79a169bd203e41, 0xf0062c6e984d386, + 0xc75809c42c684dd1, 0x52c07b78a3e60868, + 0xf92e0c3537826145, 0xa7709a56ccdf8a82, + 0x9bbcc7a142b17ccb, 0x88a66076400bb691, + 0xc2abf989935ddbfe, 0x6acff893d00ea435, + 0xf356f7ebf83552fe, 0x583f6b8c4124d43, + 0x98165af37b2153de, 0xc3727a337a8b704a, + 0xbe1bf1b059e9a8d6, 0x744f18c0592e4c5c, + 0xeda2ee1c7064130c, 0x1162def06f79df73, + 0x9485d4d1c63e8be7, 0x8addcb5645ac2ba8, + 0xb9a74a0637ce2ee1, 0x6d953e2bd7173692, + 0xe8111c87c5c1ba99, 0xc8fa8db6ccdd0437, + 0x910ab1d4db9914a0, 0x1d9c9892400a22a2, + 0xb54d5e4a127f59c8, 0x2503beb6d00cab4b, + 0xe2a0b5dc971f303a, 0x2e44ae64840fd61d, + 0x8da471a9de737e24, 0x5ceaecfed289e5d2, + 0xb10d8e1456105dad, 0x7425a83e872c5f47, + 0xdd50f1996b947518, 0xd12f124e28f77719, + 0x8a5296ffe33cc92f, 0x82bd6b70d99aaa6f, + 0xace73cbfdc0bfb7b, 0x636cc64d1001550b, + 0xd8210befd30efa5a, 0x3c47f7e05401aa4e, + 0x8714a775e3e95c78, 0x65acfaec34810a71, + 0xa8d9d1535ce3b396, 0x7f1839a741a14d0d, + 0xd31045a8341ca07c, 0x1ede48111209a050, + 0x83ea2b892091e44d, 0x934aed0aab460432, + 0xa4e4b66b68b65d60, 0xf81da84d5617853f, + 0xce1de40642e3f4b9, 0x36251260ab9d668e, + 0x80d2ae83e9ce78f3, 0xc1d72b7c6b426019, + 0xa1075a24e4421730, 0xb24cf65b8612f81f, + 0xc94930ae1d529cfc, 0xdee033f26797b627, + 0xfb9b7cd9a4a7443c, 0x169840ef017da3b1, + 0x9d412e0806e88aa5, 0x8e1f289560ee864e, + 0xc491798a08a2ad4e, 0xf1a6f2bab92a27e2, + 0xf5b5d7ec8acb58a2, 0xae10af696774b1db, + 0x9991a6f3d6bf1765, 0xacca6da1e0a8ef29, + 0xbff610b0cc6edd3f, 0x17fd090a58d32af3, + 0xeff394dcff8a948e, 0xddfc4b4cef07f5b0, + 0x95f83d0a1fb69cd9, 0x4abdaf101564f98e, + 0xbb764c4ca7a4440f, 0x9d6d1ad41abe37f1, + 0xea53df5fd18d5513, 0x84c86189216dc5ed, + 0x92746b9be2f8552c, 0x32fd3cf5b4e49bb4, + 0xb7118682dbb66a77, 0x3fbc8c33221dc2a1, + 0xe4d5e82392a40515, 0xfabaf3feaa5334a, + 0x8f05b1163ba6832d, 0x29cb4d87f2a7400e, + 0xb2c71d5bca9023f8, 0x743e20e9ef511012, + 0xdf78e4b2bd342cf6, 0x914da9246b255416, + 0x8bab8eefb6409c1a, 0x1ad089b6c2f7548e, + 0xae9672aba3d0c320, 0xa184ac2473b529b1, + 0xda3c0f568cc4f3e8, 0xc9e5d72d90a2741e, + 0x8865899617fb1871, 0x7e2fa67c7a658892, + 0xaa7eebfb9df9de8d, 0xddbb901b98feeab7, + 0xd51ea6fa85785631, 0x552a74227f3ea565, + 
0x8533285c936b35de, 0xd53a88958f87275f, + 0xa67ff273b8460356, 0x8a892abaf368f137, + 0xd01fef10a657842c, 0x2d2b7569b0432d85, + 0x8213f56a67f6b29b, 0x9c3b29620e29fc73, + 0xa298f2c501f45f42, 0x8349f3ba91b47b8f, + 0xcb3f2f7642717713, 0x241c70a936219a73, + 0xfe0efb53d30dd4d7, 0xed238cd383aa0110, + 0x9ec95d1463e8a506, 0xf4363804324a40aa, + 0xc67bb4597ce2ce48, 0xb143c6053edcd0d5, + 0xf81aa16fdc1b81da, 0xdd94b7868e94050a, + 0x9b10a4e5e9913128, 0xca7cf2b4191c8326, + 0xc1d4ce1f63f57d72, 0xfd1c2f611f63a3f0, + 0xf24a01a73cf2dccf, 0xbc633b39673c8cec, + 0x976e41088617ca01, 0xd5be0503e085d813, + 0xbd49d14aa79dbc82, 0x4b2d8644d8a74e18, + 0xec9c459d51852ba2, 0xddf8e7d60ed1219e, + 0x93e1ab8252f33b45, 0xcabb90e5c942b503, + 0xb8da1662e7b00a17, 0x3d6a751f3b936243, + 0xe7109bfba19c0c9d, 0xcc512670a783ad4, + 0x906a617d450187e2, 0x27fb2b80668b24c5, + 0xb484f9dc9641e9da, 0xb1f9f660802dedf6, + 0xe1a63853bbd26451, 0x5e7873f8a0396973, + 0x8d07e33455637eb2, 0xdb0b487b6423e1e8, + 0xb049dc016abc5e5f, 0x91ce1a9a3d2cda62, + 0xdc5c5301c56b75f7, 0x7641a140cc7810fb, + 0x89b9b3e11b6329ba, 0xa9e904c87fcb0a9d, + 0xac2820d9623bf429, 0x546345fa9fbdcd44, + 0xd732290fbacaf133, 0xa97c177947ad4095, + 0x867f59a9d4bed6c0, 0x49ed8eabcccc485d, + 0xa81f301449ee8c70, 0x5c68f256bfff5a74, + 0xd226fc195c6a2f8c, 0x73832eec6fff3111, + 0x83585d8fd9c25db7, 0xc831fd53c5ff7eab, + 0xa42e74f3d032f525, 0xba3e7ca8b77f5e55, + 0xcd3a1230c43fb26f, 0x28ce1bd2e55f35eb, + 0x80444b5e7aa7cf85, 0x7980d163cf5b81b3, + 0xa0555e361951c366, 0xd7e105bcc332621f, + 0xc86ab5c39fa63440, 0x8dd9472bf3fefaa7, + 0xfa856334878fc150, 0xb14f98f6f0feb951, + 0x9c935e00d4b9d8d2, 0x6ed1bf9a569f33d3, + 0xc3b8358109e84f07, 0xa862f80ec4700c8, + 0xf4a642e14c6262c8, 0xcd27bb612758c0fa, + 0x98e7e9cccfbd7dbd, 0x8038d51cb897789c, + 0xbf21e44003acdd2c, 0xe0470a63e6bd56c3, + 0xeeea5d5004981478, 0x1858ccfce06cac74, + 0x95527a5202df0ccb, 0xf37801e0c43ebc8, + 0xbaa718e68396cffd, 0xd30560258f54e6ba, + 0xe950df20247c83fd, 0x47c6b82ef32a2069, + 0x91d28b7416cdd27e, 0x4cdc331d57fa5441, + 0xb6472e511c81471d, 0xe0133fe4adf8e952, + 0xe3d8f9e563a198e5, 0x58180fddd97723a6, + 0x8e679c2f5e44ff8f, 0x570f09eaa7ea7648, + }; }; +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + template -constexpr uint64_t powers_template::power_of_five_128[number_of_entries]; +constexpr uint64_t + powers_template::power_of_five_128[number_of_entries]; + +#endif using powers = powers_template<>; diff --git a/third_party/fast_float/float_common.h b/third_party/fast_float/float_common.h index bee882152025..edc163cb472e 100644 --- a/third_party/fast_float/float_common.h +++ b/third_party/fast_float/float_common.h @@ -7,7 +7,11 @@ #include #include #include - +#ifdef __has_include +#if __has_include() && (__cplusplus > 202002L || _MSVC_LANG > 202002L) +#include +#endif +#endif #include "constexpr_feature_detect.h" namespace fast_float { @@ -28,18 +32,16 @@ enum chars_format { general = fixed | scientific }; -template -struct from_chars_result_t { - UC const* ptr; +template struct from_chars_result_t { + UC const *ptr; std::errc ec; }; using from_chars_result = from_chars_result_t; -template -struct parse_options_t { +template struct parse_options_t { constexpr explicit parse_options_t(chars_format fmt = chars_format::general, - UC dot = UC('.')) - : format(fmt), decimal_point(dot) {} + UC dot = UC('.')) + : format(fmt), decimal_point(dot) {} /** Which number formats are accepted */ chars_format format; @@ -48,39 +50,41 @@ struct parse_options_t { }; using parse_options = parse_options_t; -} +} // namespace fast_float #if 
FASTFLOAT_HAS_BIT_CAST #include #endif -#if (defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) \ - || defined(__amd64) || defined(__aarch64__) || defined(_M_ARM64) \ - || defined(__MINGW64__) \ - || defined(__s390x__) \ - || (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)) \ - || defined(__loongarch64) ) +#if (defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) || \ + defined(__amd64) || defined(__aarch64__) || defined(_M_ARM64) || \ + defined(__MINGW64__) || defined(__s390x__) || \ + (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || \ + defined(__PPC64LE__)) || \ + defined(__loongarch64)) #define FASTFLOAT_64BIT 1 -#elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) \ - || defined(__arm__) || defined(_M_ARM) || defined(__ppc__) \ - || defined(__MINGW32__) || defined(__EMSCRIPTEN__)) +#elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + defined(__arm__) || defined(_M_ARM) || defined(__ppc__) || \ + defined(__MINGW32__) || defined(__EMSCRIPTEN__)) #define FASTFLOAT_32BIT 1 #else // Need to check incrementally, since SIZE_MAX is a size_t, avoid overflow. - // We can never tell the register width, but the SIZE_MAX is a good approximation. - // UINTPTR_MAX and INTPTR_MAX are optional, so avoid them for max portability. - #if SIZE_MAX == 0xffff - #error Unknown platform (16-bit, unsupported) - #elif SIZE_MAX == 0xffffffff - #define FASTFLOAT_32BIT 1 - #elif SIZE_MAX == 0xffffffffffffffff - #define FASTFLOAT_64BIT 1 - #else - #error Unknown platform (not 32-bit, not 64-bit?) - #endif -#endif - -#if ((defined(_WIN32) || defined(_WIN64)) && !defined(__clang__)) +// We can never tell the register width, but the SIZE_MAX is a good +// approximation. UINTPTR_MAX and INTPTR_MAX are optional, so avoid them for max +// portability. +#if SIZE_MAX == 0xffff +#error Unknown platform (16-bit, unsupported) +#elif SIZE_MAX == 0xffffffff +#define FASTFLOAT_32BIT 1 +#elif SIZE_MAX == 0xffffffffffffffff +#define FASTFLOAT_64BIT 1 +#else +#error Unknown platform (not 32-bit, not 64-bit?) 
+#endif +#endif + +#if ((defined(_WIN32) || defined(_WIN64)) && !defined(__clang__)) || \ + (defined(_M_ARM64) && !defined(__MINGW32__)) #include #endif @@ -124,9 +128,9 @@ using parse_options = parse_options_t; #endif #endif -#if defined(__SSE2__) || \ - (defined(FASTFLOAT_VISUAL_STUDIO) && \ - (defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2))) +#if defined(__SSE2__) || (defined(FASTFLOAT_VISUAL_STUDIO) && \ + (defined(_M_AMD64) || defined(_M_X64) || \ + (defined(_M_IX86_FP) && _M_IX86_FP == 2))) #define FASTFLOAT_SSE2 1 #endif @@ -134,28 +138,25 @@ using parse_options = parse_options_t; #define FASTFLOAT_NEON 1 #endif -#if defined(FASTFLOAT_SSE2) || defined(FASTFLOAT_ARM64) +#if defined(FASTFLOAT_SSE2) || defined(FASTFLOAT_NEON) #define FASTFLOAT_HAS_SIMD 1 #endif #if defined(__GNUC__) // disable -Wcast-align=strict (GCC only) -#define FASTFLOAT_SIMD_DISABLE_WARNINGS \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Wcast-align\"") +#define FASTFLOAT_SIMD_DISABLE_WARNINGS \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wcast-align\"") #else #define FASTFLOAT_SIMD_DISABLE_WARNINGS #endif #if defined(__GNUC__) -#define FASTFLOAT_SIMD_RESTORE_WARNINGS \ - _Pragma("GCC diagnostic pop") +#define FASTFLOAT_SIMD_RESTORE_WARNINGS _Pragma("GCC diagnostic pop") #else #define FASTFLOAT_SIMD_RESTORE_WARNINGS #endif - - #ifdef FASTFLOAT_VISUAL_STUDIO #define fastfloat_really_inline __forceinline #else @@ -163,18 +164,24 @@ using parse_options = parse_options_t; #endif #ifndef FASTFLOAT_ASSERT -#define FASTFLOAT_ASSERT(x) { ((void)(x)); } +#define FASTFLOAT_ASSERT(x) \ + { ((void)(x)); } #endif #ifndef FASTFLOAT_DEBUG_ASSERT -#define FASTFLOAT_DEBUG_ASSERT(x) { ((void)(x)); } +#define FASTFLOAT_DEBUG_ASSERT(x) \ + { ((void)(x)); } #endif // rust style `try!()` macro, or `?` operator -#define FASTFLOAT_TRY(x) { if (!(x)) return false; } - -#define FASTFLOAT_ENABLE_IF(...) typename std::enable_if<(__VA_ARGS__), int>::type = 0 +#define FASTFLOAT_TRY(x) \ + { \ + if (!(x)) \ + return false; \ + } +#define FASTFLOAT_ENABLE_IF(...) \ + typename std::enable_if<(__VA_ARGS__), int>::type namespace fast_float { @@ -186,10 +193,28 @@ fastfloat_really_inline constexpr bool cpp20_and_in_constexpr() { #endif } +template +fastfloat_really_inline constexpr bool is_supported_float_type() { + return std::is_same::value || std::is_same::value +#if __STDCPP_FLOAT32_T__ + || std::is_same::value +#endif +#if __STDCPP_FLOAT64_T__ + || std::is_same::value +#endif + ; +} + +template +fastfloat_really_inline constexpr bool is_supported_char_type() { + return std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value; +} + // Compares two ASCII strings in a case insensitive manner. 
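// It relies on bit 0x20 being the only difference between the upper- and
// lower-case forms of an ASCII letter: the OR of all per-character XORs is
// 0 for an exact match and 0x20 for a match up to letter case.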
template inline FASTFLOAT_CONSTEXPR14 bool -fastfloat_strncasecmp(UC const * input1, UC const * input2, size_t length) { +fastfloat_strncasecmp(UC const *input1, UC const *input2, size_t length) { char running_diff{0}; for (size_t i = 0; i < length; ++i) { running_diff |= (char(input1[i]) ^ char(input2[i])); @@ -202,18 +227,15 @@ fastfloat_strncasecmp(UC const * input1, UC const * input2, size_t length) { #endif // a pointer and a length to a contiguous block of memory -template -struct span { - const T* ptr; +template struct span { + const T *ptr; size_t length; - constexpr span(const T* _ptr, size_t _length) : ptr(_ptr), length(_length) {} + constexpr span(const T *_ptr, size_t _length) : ptr(_ptr), length(_length) {} constexpr span() : ptr(nullptr), length(0) {} - constexpr size_t len() const noexcept { - return length; - } + constexpr size_t len() const noexcept { return length; } - FASTFLOAT_CONSTEXPR14 const T& operator[](size_t index) const noexcept { + FASTFLOAT_CONSTEXPR14 const T &operator[](size_t index) const noexcept { FASTFLOAT_DEBUG_ASSERT(index < length); return ptr[index]; } @@ -227,34 +249,51 @@ struct value128 { }; /* Helper C++14 constexpr generic implementation of leading_zeroes */ -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -int leading_zeroes_generic(uint64_t input_num, int last_bit = 0) { - if(input_num & uint64_t(0xffffffff00000000)) { input_num >>= 32; last_bit |= 32; } - if(input_num & uint64_t( 0xffff0000)) { input_num >>= 16; last_bit |= 16; } - if(input_num & uint64_t( 0xff00)) { input_num >>= 8; last_bit |= 8; } - if(input_num & uint64_t( 0xf0)) { input_num >>= 4; last_bit |= 4; } - if(input_num & uint64_t( 0xc)) { input_num >>= 2; last_bit |= 2; } - if(input_num & uint64_t( 0x2)) { input_num >>= 1; last_bit |= 1; } - return 63 - last_bit; +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int +leading_zeroes_generic(uint64_t input_num, int last_bit = 0) { + if (input_num & uint64_t(0xffffffff00000000)) { + input_num >>= 32; + last_bit |= 32; + } + if (input_num & uint64_t(0xffff0000)) { + input_num >>= 16; + last_bit |= 16; + } + if (input_num & uint64_t(0xff00)) { + input_num >>= 8; + last_bit |= 8; + } + if (input_num & uint64_t(0xf0)) { + input_num >>= 4; + last_bit |= 4; + } + if (input_num & uint64_t(0xc)) { + input_num >>= 2; + last_bit |= 2; + } + if (input_num & uint64_t(0x2)) { /* input_num >>= 1; */ + last_bit |= 1; + } + return 63 - last_bit; } /* result might be undefined when input_num is zero */ -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -int leading_zeroes(uint64_t input_num) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 int +leading_zeroes(uint64_t input_num) { assert(input_num > 0); if (cpp20_and_in_constexpr()) { return leading_zeroes_generic(input_num); } #ifdef FASTFLOAT_VISUAL_STUDIO - #if defined(_M_X64) || defined(_M_ARM64) +#if defined(_M_X64) || defined(_M_ARM64) unsigned long leading_zero = 0; // Search the mask data from most significant bit (MSB) // to least significant bit (LSB) for a set bit (1). 
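// _BitScanReverse64 writes the zero-based index of the highest set bit,
// so the leading-zero count is 63 minus that index.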
_BitScanReverse64(&leading_zero, input_num); return (int)(63 - leading_zero); - #else +#else return leading_zeroes_generic(input_num); - #endif +#endif #else return __builtin_clzll(input_num); #endif @@ -262,18 +301,18 @@ int leading_zeroes(uint64_t input_num) { // slow emulation routine for 32-bit fastfloat_really_inline constexpr uint64_t emulu(uint32_t x, uint32_t y) { - return x * (uint64_t)y; + return x * (uint64_t)y; } -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -uint64_t umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t +umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) { uint64_t ad = emulu((uint32_t)(ab >> 32), (uint32_t)cd); uint64_t bd = emulu((uint32_t)ab, (uint32_t)cd); uint64_t adbc = ad + emulu((uint32_t)ab, (uint32_t)(cd >> 32)); - uint64_t adbc_carry = !!(adbc < ad); + uint64_t adbc_carry = (uint64_t)(adbc < ad); uint64_t lo = bd + (adbc << 32); *hi = emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) + - (adbc_carry << 32) + !!(lo < bd); + (adbc_carry << 32) + (uint64_t)(lo < bd); return lo; } @@ -281,18 +320,18 @@ uint64_t umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) { // slow emulation routine for 32-bit #if !defined(__MINGW64__) -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t _umul128(uint64_t ab, + uint64_t cd, + uint64_t *hi) { return umul128_generic(ab, cd, hi); } #endif // !__MINGW64__ #endif // FASTFLOAT_32BIT - // compute 64-bit a*b -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -value128 full_multiplication(uint64_t a, uint64_t b) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 value128 +full_multiplication(uint64_t a, uint64_t b) { if (cpp20_and_in_constexpr()) { value128 answer; answer.low = umul128_generic(a, b, &answer.high); @@ -304,9 +343,10 @@ value128 full_multiplication(uint64_t a, uint64_t b) { // But MinGW on ARM64 doesn't have native support for 64-bit multiplications answer.high = __umulh(a, b); answer.low = a * b; -#elif defined(FASTFLOAT_32BIT) || (defined(_WIN64) && !defined(__clang__)) +#elif defined(FASTFLOAT_32BIT) || \ + (defined(_WIN64) && !defined(__clang__) && !defined(_M_ARM64)) answer.low = _umul128(a, b, &answer.high); // _umul128 not available on ARM64 -#elif defined(FASTFLOAT_64BIT) +#elif defined(FASTFLOAT_64BIT) && defined(__SIZEOF_INT128__) __uint128_t r = ((__uint128_t)a) * b; answer.low = uint64_t(r); answer.high = uint64_t(r >> 64); @@ -334,22 +374,24 @@ constexpr static int32_t invalid_am_bias = -0x8000; // used for binary_format_lookup_tables::max_mantissa constexpr uint64_t constant_55555 = 5 * 5 * 5 * 5 * 5; -template -struct binary_format_lookup_tables; +template struct binary_format_lookup_tables; template struct binary_format : binary_format_lookup_tables { - using equiv_uint = typename std::conditional::type; + using equiv_uint = + typename std::conditional::type; static inline constexpr int mantissa_explicit_bits(); static inline constexpr int minimum_exponent(); static inline constexpr int infinite_power(); static inline constexpr int sign_index(); - static inline constexpr int min_exponent_fast_path(); // used when fegetround() == FE_TONEAREST + static inline constexpr int + min_exponent_fast_path(); // used when fegetround() == FE_TONEAREST static inline constexpr int max_exponent_fast_path(); static inline constexpr int max_exponent_round_to_even(); static inline constexpr int 
min_exponent_round_to_even(); static inline constexpr uint64_t max_mantissa_fast_path(int64_t power); - static inline constexpr uint64_t max_mantissa_fast_path(); // used when fegetround() == FE_TONEAREST + static inline constexpr uint64_t + max_mantissa_fast_path(); // used when fegetround() == FE_TONEAREST static inline constexpr int largest_power_of_ten(); static inline constexpr int smallest_power_of_ten(); static inline constexpr T exact_power_of_ten(int64_t power); @@ -359,76 +401,91 @@ template struct binary_format : binary_format_lookup_tables { static inline constexpr equiv_uint hidden_bit_mask(); }; -template -struct binary_format_lookup_tables { +template struct binary_format_lookup_tables { static constexpr double powers_of_ten[] = { 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22}; // Largest integer value v so that (5**index * v) <= 1<<53. - // 0x10000000000000 == 1 << 53 + // 0x20000000000000 == 1 << 53 static constexpr uint64_t max_mantissa[] = { - 0x10000000000000, - 0x10000000000000 / 5, - 0x10000000000000 / (5 * 5), - 0x10000000000000 / (5 * 5 * 5), - 0x10000000000000 / (5 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555), - 0x10000000000000 / (constant_55555 * 5), - 0x10000000000000 / (constant_55555 * 5 * 5), - 0x10000000000000 / (constant_55555 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * 5 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555), - 0x10000000000000 / (constant_55555 * constant_55555 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * constant_55555), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * constant_55555 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * constant_55555 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5 * 5)}; + 0x20000000000000, + 0x20000000000000 / 5, + 0x20000000000000 / (5 * 5), + 0x20000000000000 / (5 * 5 * 5), + 0x20000000000000 / (5 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555), + 0x20000000000000 / (constant_55555 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * 5 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5), + 0x20000000000000 / + 
(constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5 * 5 * 5)}; }; +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + template constexpr double binary_format_lookup_tables::powers_of_ten[]; template constexpr uint64_t binary_format_lookup_tables::max_mantissa[]; -template -struct binary_format_lookup_tables { +#endif + +template struct binary_format_lookup_tables { static constexpr float powers_of_ten[] = {1e0f, 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, - 1e6f, 1e7f, 1e8f, 1e9f, 1e10f}; + 1e6f, 1e7f, 1e8f, 1e9f, 1e10f}; // Largest integer value v so that (5**index * v) <= 1<<24. // 0x1000000 == 1<<24 static constexpr uint64_t max_mantissa[] = { - 0x1000000, - 0x1000000 / 5, - 0x1000000 / (5 * 5), - 0x1000000 / (5 * 5 * 5), - 0x1000000 / (5 * 5 * 5 * 5), - 0x1000000 / (constant_55555), - 0x1000000 / (constant_55555 * 5), - 0x1000000 / (constant_55555 * 5 * 5), - 0x1000000 / (constant_55555 * 5 * 5 * 5), - 0x1000000 / (constant_55555 * 5 * 5 * 5 * 5), - 0x1000000 / (constant_55555 * constant_55555), - 0x1000000 / (constant_55555 * constant_55555 * 5)}; + 0x1000000, + 0x1000000 / 5, + 0x1000000 / (5 * 5), + 0x1000000 / (5 * 5 * 5), + 0x1000000 / (5 * 5 * 5 * 5), + 0x1000000 / (constant_55555), + 0x1000000 / (constant_55555 * 5), + 0x1000000 / (constant_55555 * 5 * 5), + 0x1000000 / (constant_55555 * 5 * 5 * 5), + 0x1000000 / (constant_55555 * 5 * 5 * 5 * 5), + 0x1000000 / (constant_55555 * constant_55555), + 0x1000000 / (constant_55555 * constant_55555 * 5)}; }; +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + template constexpr float binary_format_lookup_tables::powers_of_ten[]; template constexpr uint64_t binary_format_lookup_tables::max_mantissa[]; -template <> inline constexpr int binary_format::min_exponent_fast_path() { +#endif + +template <> +inline constexpr int binary_format::min_exponent_fast_path() { #if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) return 0; #else @@ -436,7 +493,8 @@ template <> inline constexpr int binary_format::min_exponent_fast_path() #endif } -template <> inline constexpr int binary_format::min_exponent_fast_path() { +template <> +inline constexpr int binary_format::min_exponent_fast_path() { #if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) return 0; #else @@ -444,26 +502,32 @@ template <> inline constexpr int binary_format::min_exponent_fast_path() #endif } -template <> inline constexpr int binary_format::mantissa_explicit_bits() { +template <> +inline constexpr int binary_format::mantissa_explicit_bits() { return 52; } -template <> inline constexpr int binary_format::mantissa_explicit_bits() { +template <> +inline constexpr int binary_format::mantissa_explicit_bits() { return 23; } -template <> inline constexpr int binary_format::max_exponent_round_to_even() { +template <> +inline constexpr int binary_format::max_exponent_round_to_even() { return 23; } -template <> inline constexpr int binary_format::max_exponent_round_to_even() { +template <> +inline constexpr int binary_format::max_exponent_round_to_even() { return 10; } -template <> inline constexpr int 
binary_format::min_exponent_round_to_even() { +template <> +inline constexpr int binary_format::min_exponent_round_to_even() { return -4; } -template <> inline constexpr int binary_format::min_exponent_round_to_even() { +template <> +inline constexpr int binary_format::min_exponent_round_to_even() { return -17; } @@ -481,30 +545,42 @@ template <> inline constexpr int binary_format::infinite_power() { return 0xFF; } -template <> inline constexpr int binary_format::sign_index() { return 63; } -template <> inline constexpr int binary_format::sign_index() { return 31; } +template <> inline constexpr int binary_format::sign_index() { + return 63; +} +template <> inline constexpr int binary_format::sign_index() { + return 31; +} -template <> inline constexpr int binary_format::max_exponent_fast_path() { +template <> +inline constexpr int binary_format::max_exponent_fast_path() { return 22; } -template <> inline constexpr int binary_format::max_exponent_fast_path() { +template <> +inline constexpr int binary_format::max_exponent_fast_path() { return 10; } -template <> inline constexpr uint64_t binary_format::max_mantissa_fast_path() { +template <> +inline constexpr uint64_t binary_format::max_mantissa_fast_path() { return uint64_t(2) << mantissa_explicit_bits(); } -template <> inline constexpr uint64_t binary_format::max_mantissa_fast_path(int64_t power) { +template <> +inline constexpr uint64_t +binary_format::max_mantissa_fast_path(int64_t power) { // caller is responsible to ensure that // power >= 0 && power <= 22 // // Work around clang bug https://godbolt.org/z/zedh7rrhc return (void)max_mantissa[0], max_mantissa[power]; } -template <> inline constexpr uint64_t binary_format::max_mantissa_fast_path() { +template <> +inline constexpr uint64_t binary_format::max_mantissa_fast_path() { return uint64_t(2) << mantissa_explicit_bits(); } -template <> inline constexpr uint64_t binary_format::max_mantissa_fast_path(int64_t power) { +template <> +inline constexpr uint64_t +binary_format::max_mantissa_fast_path(int64_t power) { // caller is responsible to ensure that // power >= 0 && power <= 10 // @@ -513,7 +589,8 @@ template <> inline constexpr uint64_t binary_format::max_mantissa_fast_pa } template <> -inline constexpr double binary_format::exact_power_of_ten(int64_t power) { +inline constexpr double +binary_format::exact_power_of_ten(int64_t power) { // Work around clang bug https://godbolt.org/z/zedh7rrhc return (void)powers_of_ten[0], powers_of_ten[power]; } @@ -523,13 +600,10 @@ inline constexpr float binary_format::exact_power_of_ten(int64_t power) { return (void)powers_of_ten[0], powers_of_ten[power]; } - -template <> -inline constexpr int binary_format::largest_power_of_ten() { +template <> inline constexpr int binary_format::largest_power_of_ten() { return 308; } -template <> -inline constexpr int binary_format::largest_power_of_ten() { +template <> inline constexpr int binary_format::largest_power_of_ten() { return 38; } @@ -537,9 +611,8 @@ template <> inline constexpr int binary_format::smallest_power_of_ten() { return -342; } -template <> -inline constexpr int binary_format::smallest_power_of_ten() { - return -65; +template <> inline constexpr int binary_format::smallest_power_of_ten() { + return -64; } template <> inline constexpr size_t binary_format::max_digits() { @@ -549,39 +622,46 @@ template <> inline constexpr size_t binary_format::max_digits() { return 114; } -template <> inline constexpr binary_format::equiv_uint - binary_format::exponent_mask() { +template <> +inline 
constexpr binary_format::equiv_uint +binary_format::exponent_mask() { return 0x7F800000; } -template <> inline constexpr binary_format::equiv_uint - binary_format::exponent_mask() { +template <> +inline constexpr binary_format::equiv_uint +binary_format::exponent_mask() { return 0x7FF0000000000000; } -template <> inline constexpr binary_format::equiv_uint - binary_format::mantissa_mask() { +template <> +inline constexpr binary_format::equiv_uint +binary_format::mantissa_mask() { return 0x007FFFFF; } -template <> inline constexpr binary_format::equiv_uint - binary_format::mantissa_mask() { +template <> +inline constexpr binary_format::equiv_uint +binary_format::mantissa_mask() { return 0x000FFFFFFFFFFFFF; } -template <> inline constexpr binary_format::equiv_uint - binary_format::hidden_bit_mask() { +template <> +inline constexpr binary_format::equiv_uint +binary_format::hidden_bit_mask() { return 0x00800000; } -template <> inline constexpr binary_format::equiv_uint - binary_format::hidden_bit_mask() { +template <> +inline constexpr binary_format::equiv_uint +binary_format::hidden_bit_mask() { return 0x0010000000000000; } -template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void to_float(bool negative, adjusted_mantissa am, T &value) { +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +to_float(bool negative, adjusted_mantissa am, T &value) { using fastfloat_uint = typename binary_format::equiv_uint; fastfloat_uint word = (fastfloat_uint)am.mantissa; - word |= fastfloat_uint(am.power2) << binary_format::mantissa_explicit_bits(); + word |= fastfloat_uint(am.power2) + << binary_format::mantissa_explicit_bits(); word |= fastfloat_uint(negative) << binary_format::sign_index(); #if FASTFLOAT_HAS_BIT_CAST value = std::bit_cast(word); @@ -591,89 +671,132 @@ void to_float(bool negative, adjusted_mantissa am, T &value) { } #ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default -template -struct space_lut { +template struct space_lut { static constexpr bool value[] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; }; -template -constexpr bool space_lut::value[]; +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template constexpr bool space_lut::value[]; + +#endif inline constexpr bool is_space(uint8_t c) { return space_lut<>::value[c]; } #endif -template -static constexpr uint64_t int_cmp_zeros() -{ - static_assert((sizeof(UC) == 1) || (sizeof(UC) == 2) || (sizeof(UC) == 4), "Unsupported character size"); - return (sizeof(UC) == 1) ? 0x3030303030303030 : (sizeof(UC) == 2) ? (uint64_t(UC('0')) << 48 | uint64_t(UC('0')) << 32 | uint64_t(UC('0')) << 16 | UC('0')) : (uint64_t(UC('0')) << 32 | UC('0')); -} -template -static constexpr int int_cmp_len() -{ - return sizeof(uint64_t) / sizeof(UC); -} -template -static constexpr UC const * str_const_nan() -{ - return nullptr; -} -template<> -constexpr char const * str_const_nan() -{ - return "nan"; -} -template<> -constexpr wchar_t const * str_const_nan() -{ - return L"nan"; -} -template<> -constexpr char16_t const * str_const_nan() -{ - return u"nan"; -} -template<> -constexpr char32_t const * str_const_nan() -{ - return U"nan"; -} -template -static constexpr UC const * str_const_inf() -{ - return nullptr; -} -template<> -constexpr char const * str_const_inf() -{ - return "infinity"; -} -template<> -constexpr wchar_t const * str_const_inf() -{ - return L"infinity"; -} -template<> -constexpr char16_t const * str_const_inf() -{ - return u"infinity"; -} -template<> -constexpr char32_t const * str_const_inf() -{ - return U"infinity"; +template static constexpr uint64_t int_cmp_zeros() { + static_assert((sizeof(UC) == 1) || (sizeof(UC) == 2) || (sizeof(UC) == 4), + "Unsupported character size"); + return (sizeof(UC) == 1) ? 0x3030303030303030 + : (sizeof(UC) == 2) + ? (uint64_t(UC('0')) << 48 | uint64_t(UC('0')) << 32 | + uint64_t(UC('0')) << 16 | UC('0')) + : (uint64_t(UC('0')) << 32 | UC('0')); +} +template static constexpr int int_cmp_len() { + return sizeof(uint64_t) / sizeof(UC); +} +template static constexpr UC const *str_const_nan() { + return nullptr; +} +template <> constexpr char const *str_const_nan() { return "nan"; } +template <> constexpr wchar_t const *str_const_nan() { return L"nan"; } +template <> constexpr char16_t const *str_const_nan() { + return u"nan"; +} +template <> constexpr char32_t const *str_const_nan() { + return U"nan"; +} +template static constexpr UC const *str_const_inf() { + return nullptr; +} +template <> constexpr char const *str_const_inf() { return "infinity"; } +template <> constexpr wchar_t const *str_const_inf() { + return L"infinity"; +} +template <> constexpr char16_t const *str_const_inf() { + return u"infinity"; +} +template <> constexpr char32_t const *str_const_inf() { + return U"infinity"; +} + +template struct int_luts { + static constexpr uint8_t chdigit[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, + 35, 255, 255, 255, 255, 255, 255, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255}; + + static constexpr size_t maxdigits_u64[] = { + 64, 41, 32, 28, 25, 23, 22, 21, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, + 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13}; + + static constexpr uint64_t min_safe_u64[] = { + 9223372036854775808ull, 12157665459056928801ull, 4611686018427387904, + 7450580596923828125, 4738381338321616896, 3909821048582988049, + 9223372036854775808ull, 12157665459056928801ull, 10000000000000000000ull, + 5559917313492231481, 2218611106740436992, 8650415919381337933, + 2177953337809371136, 6568408355712890625, 1152921504606846976, + 2862423051509815793, 6746640616477458432, 15181127029874798299ull, + 1638400000000000000, 3243919932521508681, 6221821273427820544, + 11592836324538749809ull, 876488338465357824, 1490116119384765625, + 2481152873203736576, 4052555153018976267, 6502111422497947648, + 10260628712958602189ull, 15943230000000000000ull, 787662783788549761, + 1152921504606846976, 1667889514952984961, 2386420683693101056, + 3379220508056640625, 4738381338321616896}; +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template constexpr uint8_t int_luts::chdigit[]; + +template constexpr size_t int_luts::maxdigits_u64[]; + +template constexpr uint64_t int_luts::min_safe_u64[]; + +#endif + +template +fastfloat_really_inline constexpr uint8_t ch_to_digit(UC c) { + return int_luts<>::chdigit[static_cast(c)]; } + +fastfloat_really_inline constexpr size_t max_digits_u64(int base) { + return int_luts<>::maxdigits_u64[base - 2]; +} + +// If a u64 is exactly max_digits_u64() in length, this is +// the value below which it has definitely overflowed. +fastfloat_really_inline constexpr uint64_t min_safe_u64(int base) { + return int_luts<>::min_safe_u64[base - 2]; +} + } // namespace fast_float #endif diff --git a/third_party/fast_float/parse_number.h b/third_party/fast_float/parse_number.h index a011a8cbf4df..6d883fb96ea1 100644 --- a/third_party/fast_float/parse_number.h +++ b/third_party/fast_float/parse_number.h @@ -10,10 +10,8 @@ #include #include #include - namespace fast_float { - namespace detail { /** * Special case +inf, -inf, nan, infinity, -infinity. @@ -21,45 +19,53 @@ namespace detail { * strings a null-free and fixed. 
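 * (Matching is case-insensitive; "inf" may be followed by "inity", and "nan"
 * by a parenthesized n-char sequence such as the "nan(ind)" that MSVC
 * produces.)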
**/ template -from_chars_result_t FASTFLOAT_CONSTEXPR14 -parse_infnan(UC const * first, UC const * last, T &value) noexcept { +from_chars_result_t FASTFLOAT_CONSTEXPR14 parse_infnan(UC const *first, + UC const *last, + T &value) noexcept { from_chars_result_t answer{}; answer.ptr = first; answer.ec = std::errc(); // be optimistic bool minusSign = false; - if (*first == UC('-')) { // assume first < last, so dereference without checks; C++17 20.19.3.(7.1) explicitly forbids '+' here - minusSign = true; - ++first; + if (*first == + UC('-')) { // assume first < last, so dereference without checks; + // C++17 20.19.3.(7.1) explicitly forbids '+' here + minusSign = true; + ++first; } #ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default if (*first == UC('+')) { - ++first; + ++first; } #endif if (last - first >= 3) { if (fastfloat_strncasecmp(first, str_const_nan(), 3)) { answer.ptr = (first += 3); - value = minusSign ? -std::numeric_limits::quiet_NaN() : std::numeric_limits::quiet_NaN(); - // Check for possible nan(n-char-seq-opt), C++17 20.19.3.7, C11 7.20.1.3.3. At least MSVC produces nan(ind) and nan(snan). - if(first != last && *first == UC('(')) { - for(UC const * ptr = first + 1; ptr != last; ++ptr) { + value = minusSign ? -std::numeric_limits::quiet_NaN() + : std::numeric_limits::quiet_NaN(); + // Check for possible nan(n-char-seq-opt), C++17 20.19.3.7, + // C11 7.20.1.3.3. At least MSVC produces nan(ind) and nan(snan). + if (first != last && *first == UC('(')) { + for (UC const *ptr = first + 1; ptr != last; ++ptr) { if (*ptr == UC(')')) { answer.ptr = ptr + 1; // valid nan(n-char-seq-opt) break; - } - else if(!((UC('a') <= *ptr && *ptr <= UC('z')) || (UC('A') <= *ptr && *ptr <= UC('Z')) || (UC('0') <= *ptr && *ptr <= UC('9')) || *ptr == UC('_'))) + } else if (!((UC('a') <= *ptr && *ptr <= UC('z')) || + (UC('A') <= *ptr && *ptr <= UC('Z')) || + (UC('0') <= *ptr && *ptr <= UC('9')) || *ptr == UC('_'))) break; // forbidden char, not nan(n-char-seq-opt) } } return answer; } if (fastfloat_strncasecmp(first, str_const_inf(), 3)) { - if ((last - first >= 8) && fastfloat_strncasecmp(first + 3, str_const_inf() + 3, 5)) { + if ((last - first >= 8) && + fastfloat_strncasecmp(first + 3, str_const_inf() + 3, 5)) { answer.ptr = first + 8; } else { answer.ptr = first + 3; } - value = minusSign ? -std::numeric_limits::infinity() : std::numeric_limits::infinity(); + value = minusSign ? -std::numeric_limits::infinity() + : std::numeric_limits::infinity(); return answer; } } @@ -89,98 +95,128 @@ fastfloat_really_inline bool rounds_to_nearest() noexcept { // // The volatile keywoard prevents the compiler from computing the function // at compile-time. - // There might be other ways to prevent compile-time optimizations (e.g., asm). - // The value does not need to be std::numeric_limits::min(), any small - // value so that 1 + x should round to 1 would do (after accounting for excess - // precision, as in 387 instructions). + // There might be other ways to prevent compile-time optimizations (e.g., + // asm). The value does not need to be std::numeric_limits::min(), any + // small value so that 1 + x should round to 1 would do (after accounting for + // excess precision, as in 387 instructions). static volatile float fmin = std::numeric_limits::min(); float fmini = fmin; // we copy it so that it gets loaded at most once. - // - // Explanation: - // Only when fegetround() == FE_TONEAREST do we have that - // fmin + 1.0f == 1.0f - fmin. 
- // - // FE_UPWARD: - // fmin + 1.0f > 1 - // 1.0f - fmin == 1 - // - // FE_DOWNWARD or FE_TOWARDZERO: - // fmin + 1.0f == 1 - // 1.0f - fmin < 1 - // - // Note: This may fail to be accurate if fast-math has been - // enabled, as rounding conventions may not apply. - #ifdef FASTFLOAT_VISUAL_STUDIO - # pragma warning(push) - // todo: is there a VS warning? - // see https://stackoverflow.com/questions/46079446/is-there-a-warning-for-floating-point-equality-checking-in-visual-studio-2013 - #elif defined(__clang__) - # pragma clang diagnostic push - # pragma clang diagnostic ignored "-Wfloat-equal" - #elif defined(__GNUC__) - # pragma GCC diagnostic push - # pragma GCC diagnostic ignored "-Wfloat-equal" - #endif +// +// Explanation: +// Only when fegetround() == FE_TONEAREST do we have that +// fmin + 1.0f == 1.0f - fmin. +// +// FE_UPWARD: +// fmin + 1.0f > 1 +// 1.0f - fmin == 1 +// +// FE_DOWNWARD or FE_TOWARDZERO: +// fmin + 1.0f == 1 +// 1.0f - fmin < 1 +// +// Note: This may fail to be accurate if fast-math has been +// enabled, as rounding conventions may not apply. +#ifdef FASTFLOAT_VISUAL_STUDIO +#pragma warning(push) +// todo: is there a VS warning? +// see +// https://stackoverflow.com/questions/46079446/is-there-a-warning-for-floating-point-equality-checking-in-visual-studio-2013 +#elif defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wfloat-equal" +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wfloat-equal" +#endif return (fmini + 1.0f == 1.0f - fmini); - #ifdef FASTFLOAT_VISUAL_STUDIO - # pragma warning(pop) - #elif defined(__clang__) - # pragma clang diagnostic pop - #elif defined(__GNUC__) - # pragma GCC diagnostic pop - #endif +#ifdef FASTFLOAT_VISUAL_STUDIO +#pragma warning(pop) +#elif defined(__clang__) +#pragma clang diagnostic pop +#elif defined(__GNUC__) +#pragma GCC diagnostic pop +#endif } } // namespace detail -template -FASTFLOAT_CONSTEXPR20 -from_chars_result_t from_chars(UC const * first, UC const * last, - T &value, chars_format fmt /*= chars_format::general*/) noexcept { - return from_chars_advanced(first, last, value, parse_options_t{fmt}); +template struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, T &value, + parse_options_t options) noexcept { + return from_chars_advanced(first, last, value, options); + } +}; + +#if __STDCPP_FLOAT32_T__ == 1 +template <> struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, std::float32_t &value, + parse_options_t options) noexcept { + // if std::float32_t is defined, and we are in C++23 mode; macro set for + // float32; set value to float due to equivalence between float and + // float32_t + float val; + auto ret = from_chars_advanced(first, last, val, options); + value = val; + return ret; + } +}; +#endif + +#if __STDCPP_FLOAT64_T__ == 1 +template <> struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, std::float64_t &value, + parse_options_t options) noexcept { + // if std::float64_t is defined, and we are in C++23 mode; macro set for + // float64; set value as double due to equivalence between double and + // float64_t + double val; + auto ret = from_chars_advanced(first, last, val, options); + value = val; + return ret; + } +}; +#endif + +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, 
UC const *last, T &value, + chars_format fmt /*= chars_format::general*/) noexcept { + return from_chars_caller::call(first, last, value, + parse_options_t(fmt)); } -template -FASTFLOAT_CONSTEXPR20 -from_chars_result_t from_chars_advanced(UC const * first, UC const * last, - T &value, parse_options_t options) noexcept { +/** + * This function overload takes parsed_number_string_t structure that is created + * and populated either by from_chars_advanced function taking chars range and + * parsing options or other parsing custom function implemented by user. + */ +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars_advanced(parsed_number_string_t &pns, T &value) noexcept { - static_assert (std::is_same::value || std::is_same::value, "only float and double are supported"); - static_assert (std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value , "only char, wchar_t, char16_t and char32_t are supported"); + static_assert(is_supported_float_type(), + "only some floating-point types are supported"); + static_assert(is_supported_char_type(), + "only char, wchar_t, char16_t and char32_t are supported"); from_chars_result_t answer; -#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default - while ((first != last) && fast_float::is_space(uint8_t(*first))) { - first++; - } -#endif - if (first == last) { - answer.ec = std::errc::invalid_argument; - answer.ptr = first; - return answer; - } - parsed_number_string_t pns = parse_number_string(first, last, options); - if (!pns.valid) { - if (options.format & chars_format::no_infnan) { - answer.ec = std::errc::invalid_argument; - answer.ptr = first; - return answer; - } else { - return detail::parse_infnan(first, last, value); - } - } answer.ec = std::errc(); // be optimistic answer.ptr = pns.lastmatch; // The implementation of the Clinger's fast path is convoluted because // we want round-to-nearest in all cases, irrespective of the rounding mode // selected on the thread. - // We proceed optimistically, assuming that detail::rounds_to_nearest() returns - // true. - if (binary_format::min_exponent_fast_path() <= pns.exponent && pns.exponent <= binary_format::max_exponent_fast_path() && !pns.too_many_digits) { + // We proceed optimistically, assuming that detail::rounds_to_nearest() + // returns true. + if (binary_format::min_exponent_fast_path() <= pns.exponent && + pns.exponent <= binary_format::max_exponent_fast_path() && + !pns.too_many_digits) { // Unfortunately, the conventional Clinger's fast path is only possible // when the system rounds to the nearest float. // @@ -188,50 +224,123 @@ from_chars_result_t from_chars_advanced(UC const * first, UC const * last, // We could check it first (before the previous branch), but // there might be performance advantages at having the check // be last. - if(!cpp20_and_in_constexpr() && detail::rounds_to_nearest()) { + if (!cpp20_and_in_constexpr() && detail::rounds_to_nearest()) { // We have that fegetround() == FE_TONEAREST. // Next is Clinger's fast path. 
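// Clinger's fast path: when the decimal mantissa fits the binary
// significand exactly and 10^|exponent| is exactly representable, a single
// correctly-rounded multiply or divide gives the correctly-rounded result.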
-      if (pns.mantissa <=binary_format<T>::max_mantissa_fast_path()) {
+      if (pns.mantissa <= binary_format<T>::max_mantissa_fast_path()) {
         value = T(pns.mantissa);
-        if (pns.exponent < 0) { value = value / binary_format<T>::exact_power_of_ten(-pns.exponent); }
-        else { value = value * binary_format<T>::exact_power_of_ten(pns.exponent); }
-        if (pns.negative) { value = -value; }
+        if (pns.exponent < 0) {
+          value = value / binary_format<T>::exact_power_of_ten(-pns.exponent);
+        } else {
+          value = value * binary_format<T>::exact_power_of_ten(pns.exponent);
+        }
+        if (pns.negative) {
+          value = -value;
+        }
         return answer;
       }
     } else {
       // We do not have that fegetround() == FE_TONEAREST.
-      // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's proposal
-      if (pns.exponent >= 0 && pns.mantissa <=binary_format<T>::max_mantissa_fast_path(pns.exponent)) {
-#if defined(__clang__)
+      // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's
+      // proposal
+      if (pns.exponent >= 0 &&
+          pns.mantissa <=
+              binary_format<T>::max_mantissa_fast_path(pns.exponent)) {
+#if defined(__clang__) || defined(FASTFLOAT_32BIT)
         // Clang may map 0 to -0.0 when fegetround() == FE_DOWNWARD
-        if(pns.mantissa == 0) {
-          value = pns.negative ? -0. : 0.;
+        if (pns.mantissa == 0) {
+          value = pns.negative ? T(-0.) : T(0.);
           return answer;
         }
 #endif
-        value = T(pns.mantissa) * binary_format<T>::exact_power_of_ten(pns.exponent);
-        if (pns.negative) { value = -value; }
+        value = T(pns.mantissa) *
+                binary_format<T>::exact_power_of_ten(pns.exponent);
+        if (pns.negative) {
+          value = -value;
+        }
         return answer;
       }
     }
   }
-  adjusted_mantissa am = compute_float<binary_format<T>>(pns.exponent, pns.mantissa);
-  if(pns.too_many_digits && am.power2 >= 0) {
-    if(am != compute_float<binary_format<T>>(pns.exponent, pns.mantissa + 1)) {
+  adjusted_mantissa am =
+      compute_float<binary_format<T>>(pns.exponent, pns.mantissa);
+  if (pns.too_many_digits && am.power2 >= 0) {
+    if (am != compute_float<binary_format<T>>(pns.exponent, pns.mantissa + 1)) {
       am = compute_error<binary_format<T>>(pns.exponent, pns.mantissa);
     }
   }
-  // If we called compute_float<binary_format<T>>(pns.exponent, pns.mantissa) and we have an invalid power (am.power2 < 0),
-  // then we need to go the long way around again. This is very uncommon.
-  if(am.power2 < 0) { am = digit_comp<T>(pns, am); }
+  // If we called compute_float<binary_format<T>>(pns.exponent, pns.mantissa)
+  // and we have an invalid power (am.power2 < 0), then we need to go the long
+  // way around again. This is very uncommon.
+  if (am.power2 < 0) {
+    am = digit_comp<T>(pns, am);
+  }
   to_float(pns.negative, am, value);
   // Test for over/underflow.
-  if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || am.power2 == binary_format<T>::infinite_power()) {
+  if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) ||
+      am.power2 == binary_format<T>::infinite_power()) {
     answer.ec = std::errc::result_out_of_range;
   }
   return answer;
 }

+template <typename T, typename UC>
+FASTFLOAT_CONSTEXPR20 from_chars_result_t<UC>
+from_chars_advanced(UC const *first, UC const *last, T &value,
+                    parse_options_t<UC> options) noexcept {
+
+  static_assert(is_supported_float_type<T>(),
+                "only some floating-point types are supported");
+  static_assert(is_supported_char_type<UC>(),
+                "only char, wchar_t, char16_t and char32_t are supported");
+
+  from_chars_result_t<UC> answer;
+#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default
+  while ((first != last) && fast_float::is_space(uint8_t(*first))) {
+    first++;
+  }
+#endif
+  if (first == last) {
+    answer.ec = std::errc::invalid_argument;
+    answer.ptr = first;
+    return answer;
+  }
+  parsed_number_string_t<UC> pns =
+      parse_number_string<UC>(first, last, options);
+  if (!pns.valid) {
+    if (options.format & chars_format::no_infnan) {
+      answer.ec = std::errc::invalid_argument;
+      answer.ptr = first;
+      return answer;
+    } else {
+      return detail::parse_infnan(first, last, value);
+    }
+  }
+
+  // call overload that takes parsed_number_string_t directly.
+  return from_chars_advanced(pns, value);
+}
+
+template <typename T, typename UC>
+FASTFLOAT_CONSTEXPR20 from_chars_result_t<UC>
+from_chars(UC const *first, UC const *last, T &value, int base) noexcept {
+  static_assert(is_supported_char_type<UC>(),
+                "only char, wchar_t, char16_t and char32_t are supported");
+
+  from_chars_result_t<UC> answer;
+#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default
+  while ((first != last) && fast_float::is_space(uint8_t(*first))) {
+    first++;
+  }
+#endif
+  if (first == last || base < 2 || base > 36) {
+    answer.ec = std::errc::invalid_argument;
+    answer.ptr = first;
+    return answer;
+  }
+  return parse_int_string(first, last, value, base);
+}
+
 } // namespace fast_float

 #endif
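
For reviewers: a minimal standalone sketch (not part of the patch) of the rounding-mode probe that the reformatted detail::rounds_to_nearest() relies on. FLT_MIN is small enough that 1.0f + FLT_MIN and 1.0f - FLT_MIN each differ from 1.0f only through the rounding direction, so the two expressions compare equal exactly when the mode is FE_TONEAREST.

// Sketch, not patch code: probe the rounding mode the same way the library
// does. The volatile qualifier mirrors the library's trick of defeating
// constant folding; as the comment in the diff notes, fast-math builds may
// still break this.
#include <cfenv>
#include <cfloat>
#include <cstdio>

static bool probe_rounds_to_nearest() {
  volatile float fmini = FLT_MIN;
  // FE_TONEAREST: both sides round to 1.0f, so they compare equal.
  // FE_UPWARD: the left side rounds up past 1.0f; unequal.
  // FE_DOWNWARD / FE_TOWARDZERO: the right side rounds below 1.0f; unequal.
  return fmini + 1.0f == 1.0f - fmini;
}

int main() {
  std::fesetround(FE_TONEAREST);
  std::printf("FE_TONEAREST -> %d\n", probe_rounds_to_nearest()); // 1
  std::fesetround(FE_DOWNWARD);
  std::printf("FE_DOWNWARD  -> %d\n", probe_rounds_to_nearest()); // 0
  std::fesetround(FE_TONEAREST); // restore the default mode
  return 0;
}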
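The from_chars_caller indirection added above lets the public fast_float::from_chars accept std::float32_t and std::float64_t in C++23 builds while still funnelling everything through the float/double parsing paths. A hedged usage sketch follows; the include path is upstream fast_float's, and GDAL's vendored copy may expose the header differently.

// Illustrative usage of the public API shown in this hunk; not patch code.
#include "fast_float/fast_float.h"

#include <cstdio>
#include <system_error>

int main() {
  const char buf[] = "3.1416 leftover";
  double value = 0.0;
  // Dispatches through from_chars_caller<double>, i.e. straight to
  // from_chars_advanced() with parse_options_t<char>(chars_format::general).
  auto res = fast_float::from_chars(buf, buf + sizeof(buf) - 1, value);
  if (res.ec == std::errc()) {
    std::printf("value=%.4f, consumed %td chars\n", value, res.ptr - buf);
  }
  return 0;
}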
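Why the Clinger fast path reformatted above is exact: when the decimal mantissa fits in the target type's binary mantissa (below 2^53 for double) and 10^|e| is itself exactly representable (|e| <= 22 for double, since 5^22 < 2^53), a single multiply or divide is one correctly rounded operation. A small worked sketch under those assumptions, with a local table standing in for binary_format<double>::exact_power_of_ten():

// Sketch of the arithmetic behind the fast path; not patch code.
#include <cstdint>
#include <cstdio>

int main() {
  // "3.14159" parses to mantissa 314159 and decimal exponent -5.
  const uint64_t mantissa = 314159; // fits in double's 53-bit mantissa
  const int exponent = -5;          // |exponent| <= 22, so 1e5 is exact
  static const double exact_powers_of_ten[] = {
      1e0,  1e1,  1e2,  1e3,  1e4,  1e5,  1e6,  1e7,
      1e8,  1e9,  1e10, 1e11, 1e12, 1e13, 1e14, 1e15,
      1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22};
  double value = double(mantissa); // exact: 314159 < 2^53
  value = (exponent < 0) ? value / exact_powers_of_ten[-exponent]
                         : value * exact_powers_of_ten[exponent];
  std::printf("%.17g\n", value); // one correctly rounded division: 3.14159
  return 0;
}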
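The final hunk adds an integer overload of from_chars taking an explicit base; empty input and bases outside [2, 36] are rejected up front with std::errc::invalid_argument before parse_int_string runs. An illustrative sketch, again assuming the upstream include path:

// Illustrative only; exercises the new integer overload from this hunk.
#include "fast_float/fast_float.h"

#include <cstdio>
#include <system_error>

int main() {
  const char hex[] = "ff";
  int value = 0;
  auto ok = fast_float::from_chars(hex, hex + 2, value, 16);
  if (ok.ec == std::errc()) {
    std::printf("0x%s -> %d\n", hex, value); // 0xff -> 255
  }
  // base 37 is out of range: rejected before any parsing happens.
  auto bad = fast_float::from_chars(hex, hex + 2, value, 37);
  std::printf("bad base rejected: %s\n",
              bad.ec == std::errc::invalid_argument ? "yes" : "no");
  return 0;
}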