diff --git a/.github/workflows/android_cmake/start.sh b/.github/workflows/android_cmake/start.sh index 4b275f01a3af..e7e6a090c4e1 100755 --- a/.github/workflows/android_cmake/start.sh +++ b/.github/workflows/android_cmake/start.sh @@ -88,6 +88,10 @@ PKG_CONFIG_LIBDIR=/tmp/install/lib/pkgconfig cmake .. \ -DSFCGAL_CONFIG=disabled \ -DHDF5_C_COMPILER_EXECUTABLE=disabled \ -DHDF5_CXX_COMPILER_EXECUTABLE=disabled + +echo "Check that GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS:BOOL=ON" +(grep "GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS:BOOL=ON" CMakeCache.txt > /dev/null && echo "yes") || (echo "Missing" && /bin/false) + make -j$(nproc) make install cd .. diff --git a/.github/workflows/cmake_builds.yml b/.github/workflows/cmake_builds.yml index 910c84966de4..938179c23dbd 100644 --- a/.github/workflows/cmake_builds.yml +++ b/.github/workflows/cmake_builds.yml @@ -606,7 +606,7 @@ jobs: - name: Setup xcode uses: maxim-lobanov/setup-xcode@60606e260d2fc5762a71e64e74b2174e8ea3c8bd # v1.6.0 with: - xcode-version: 14.3 + xcode-version: '15.4.0' - name: Checkout GDAL uses: actions/checkout@d632683dd7b4114ad314bca15554477dd762a938 # v4.2.0 - name: Setup cache diff --git a/.github/workflows/ubuntu_24.04/expected_ogrinfo_formats.txt b/.github/workflows/ubuntu_24.04/expected_ogrinfo_formats.txt index ef95d09f9d76..cd35f8279933 100644 --- a/.github/workflows/ubuntu_24.04/expected_ogrinfo_formats.txt +++ b/.github/workflows/ubuntu_24.04/expected_ogrinfo_formats.txt @@ -86,4 +86,5 @@ Supported Formats: (ro:read-only, rw:read-write, +:update, v:virtual-I/O s:subda TIGER -vector- (rov): U.S. Census TIGER/Line AVCBin -vector- (rov): Arc/Info Binary Coverage AVCE00 -vector- (rov): Arc/Info E00 (ASCII) Coverage (*.e00) + AIVector -vector- (ro): Artificial Intelligence powered vector driver HTTP -raster,vector- (ro): HTTP Fetching Wrapper diff --git a/.github/workflows/windows_conda_expected_ogrinfo_formats.txt b/.github/workflows/windows_conda_expected_ogrinfo_formats.txt index 908f8769c49c..c910b318a6cf 100644 --- a/.github/workflows/windows_conda_expected_ogrinfo_formats.txt +++ b/.github/workflows/windows_conda_expected_ogrinfo_formats.txt @@ -81,4 +81,5 @@ Supported Formats: (ro:read-only, rw:read-write, +:update, v:virtual-I/O s:subda TIGER -vector- (rov): U.S. Census TIGER/Line AVCBin -vector- (rov): Arc/Info Binary Coverage AVCE00 -vector- (rov): Arc/Info E00 (ASCII) Coverage (*.e00) + AIVector -vector- (ro): Artificial Intelligence powered vector driver HTTP -raster,vector- (ro): HTTP Fetching Wrapper diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f45a709ff7d0..058d13cf2877 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -56,6 +56,7 @@ repos: frmts/pcidsk/sdk| frmts/grib/degrib/degrib| frmts/grib/degrib/g2clib| + gcore/sse2neon.h| port/utf8.h| ogr/ogrsf_frmts/adbc/ogr_adbc_internal.h| ogr/ogrsf_frmts/cad/libopencad/| diff --git a/CITATION.cff b/CITATION.cff index a4b3a817404c..f55417d0e5bb 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -2,8 +2,8 @@ cff-version: 1.2.0 message: Please cite this software using these metadata or in the CITATION file. 
type: software title: GDAL -version: 3.8.3 -date-released: 2024-01-02 +version: 3.10.0 +date-released: 2024-11-01 doi: 10.5281/zenodo.5884351 abstract: GDAL is a translator library for raster and vector geospatial data formats that is released under an MIT style Open Source License by the Open diff --git a/CMakeLists.txt b/CMakeLists.txt index 07d1a09d5f2c..460537e2fcda 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,6 +94,16 @@ if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(x86|AMD64)") endif () endif () +else() + + # Check ability to use Arm Neon optimizations + include(CheckCXXSourceCompiles) + include(CMakePushCheckState) + cmake_push_check_state(RESET) + set(CMAKE_REQUIRED_INCLUDES "${CMAKE_CURRENT_SOURCE_DIR}/gcore") + check_cxx_source_compiles("#include \"include_sse2neon.h\"\nint main() { return 0; }" SSE2NEON_COMPILES) + cmake_pop_check_state() + endif () # option(CLANG_TIDY_ENABLED "Run clang-tidy with the compiler." OFF) diff --git a/Doxyfile b/Doxyfile index b22e38ec20c7..4a896612e813 100644 --- a/Doxyfile +++ b/Doxyfile @@ -415,7 +415,9 @@ RECURSIVE = NO # subdirectory from a directory tree whose root is specified with the INPUT tag. EXCLUDE = gcore/rawdataset.cpp \ - gcore/rawdataset.h + gcore/rawdataset.h \ + gcore/include_sse2neon.h \ + gcore/sse2neon.h # The EXCLUDE_SYMLINKS tag can be used select whether or not files or # directories that are symbolic links (a Unix filesystem feature) are excluded diff --git a/HOWTO-RELEASE b/HOWTO-RELEASE index 684d3b589afd..46c735f003b6 100644 --- a/HOWTO-RELEASE +++ b/HOWTO-RELEASE @@ -358,3 +358,10 @@ or your message manually approved, with an administrator of the list. 23) For bugfixes releases, forward port to master changes done in doc/source/about_no_title.rst, doc/source/download.rst and doc/source/download_past.rst + +24) For a feature release, enable a new version in the ReadTheDocs administration panel. 
+ a) Go to https://readthedocs.org/projects/gdal/versions/ + b) In the "Activate a version" tab, enter "release/X.Y" in the text entry and click on the Filter button + c) Click on the Activate button + d) Go to https://readthedocs.org/projects/gdal/, and in the "Build a version" drop-down list, + select the new "release-X.Y" label and click on the "Build version" button diff --git a/apps/argparse/argparse.hpp b/apps/argparse/argparse.hpp index a52142848192..029fa03c2c42 100644 --- a/apps/argparse/argparse.hpp +++ b/apps/argparse/argparse.hpp @@ -2086,7 +2086,7 @@ class ArgumentParser { std::stringstream stream; std::string curline("Usage: "); - curline += this->m_program_name; + curline += this->m_parser_path; const bool multiline_usage = this->m_usage_max_line_width < std::numeric_limits<std::size_t>::max(); const size_t indent_size = curline.size(); diff --git a/apps/gdalargumentparser.cpp b/apps/gdalargumentparser.cpp index 724dcd91957b..3b795325345b 100644 --- a/apps/gdalargumentparser.cpp +++ b/apps/gdalargumentparser.cpp @@ -37,7 +37,7 @@ GDALArgumentParser::GDALArgumentParser(const std::string &program_name, [this](const auto &) { std::cout << usage() << std::endl << std::endl; - std::cout << _("Note: ") << m_program_name + std::cout << _("Note: ") << m_parser_path << _(" --long-usage for full help.") << std::endl; std::exit(0); }) diff --git a/autotest/cpp/CMakeLists.txt b/autotest/cpp/CMakeLists.txt index 493f1d59f9bd..d8420177a9ff 100644 --- a/autotest/cpp/CMakeLists.txt +++ b/autotest/cpp/CMakeLists.txt @@ -77,6 +77,7 @@ add_executable( test_gdal_aaigrid.cpp test_gdal_dted.cpp test_gdal_gtiff.cpp + test_gdal_minmax_element.cpp test_gdal_pixelfn.cpp test_gdal_typetraits.cpp test_ogr.cpp diff --git a/autotest/cpp/googletest/CMakeLists.txt.in b/autotest/cpp/googletest/CMakeLists.txt.in index 1cdcf819370b..5055e6628fae 100644 --- a/autotest/cpp/googletest/CMakeLists.txt.in +++ b/autotest/cpp/googletest/CMakeLists.txt.in @@ -1,5 +1,5 @@ # Source https://github.com/google/googletest/blob/master/googletest/README.md -cmake_minimum_required(VERSION 3.9) +cmake_minimum_required(VERSION 3.16) project(googletest-download NONE) @@ -10,8 +10,8 @@ endif() include(ExternalProject) ExternalProject_Add(googletest - URL https://github.com/google/googletest/archive/release-1.12.1.zip - URL_HASH SHA1=973e464e8936d4b79bb24f27b058aaef4150b06e + URL https://github.com/google/googletest/releases/download/v1.15.2/googletest-1.15.2.tar.gz + URL_HASH SHA1=568d58e26bd4e838449ca7ab8ebc152b3cbd210d DOWNLOAD_NO_PROGRESS ON SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build" diff --git a/autotest/cpp/test_gdal.cpp b/autotest/cpp/test_gdal.cpp index 45ebfb4ab870..9be606d82c6b 100644 --- a/autotest/cpp/test_gdal.cpp +++ b/autotest/cpp/test_gdal.cpp @@ -4777,4 +4777,143 @@ TEST_F(test_gdal, ReadRaster) } } +// Test GDALComputeRasterMinMaxLocation +TEST_F(test_gdal, GDALComputeRasterMinMaxLocation) +{ + GDALDatasetH hDS = GDALOpen(GCORE_DATA_DIR "byte.tif", GA_ReadOnly); + ASSERT_NE(hDS, nullptr); + GDALRasterBandH hBand = GDALGetRasterBand(hDS, 1); + { + double dfMin = 0; + double dfMax = 0; + int nMinX = -1; + int nMinY = -1; + int nMaxX = -1; + int nMaxY = -1; + EXPECT_EQ(GDALComputeRasterMinMaxLocation(hBand, &dfMin, &dfMax, &nMinX, + &nMinY, &nMaxX, &nMaxY), + CE_None); + EXPECT_EQ(dfMin, 74.0); + EXPECT_EQ(dfMax, 255.0); + EXPECT_EQ(nMinX, 9); + EXPECT_EQ(nMinY, 17); + EXPECT_EQ(nMaxX, 2); + EXPECT_EQ(nMaxY, 18); + GByte val = 0; +
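+ // Cross-check: re-read the single pixel at the reported minimum location
+ // with a 1x1 GDALRasterIO request and verify it holds the value returned
+ // in dfMin; the maximum location is verified the same way just below.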
EXPECT_EQ(GDALRasterIO(hBand, GF_Read, nMinX, nMinY, 1, 1, &val, 1, 1, + GDT_Byte, 0, 0), + CE_None); + EXPECT_EQ(val, 74); + EXPECT_EQ(GDALRasterIO(hBand, GF_Read, nMaxX, nMaxY, 1, 1, &val, 1, 1, + GDT_Byte, 0, 0), + CE_None); + EXPECT_EQ(val, 255); + } + { + int nMinX = -1; + int nMinY = -1; + EXPECT_EQ(GDALComputeRasterMinMaxLocation(hBand, nullptr, nullptr, + &nMinX, &nMinY, nullptr, + nullptr), + CE_None); + EXPECT_EQ(nMinX, 9); + EXPECT_EQ(nMinY, 17); + } + { + int nMaxX = -1; + int nMaxY = -1; + EXPECT_EQ(GDALComputeRasterMinMaxLocation(hBand, nullptr, nullptr, + nullptr, nullptr, &nMaxX, + &nMaxY), + CE_None); + EXPECT_EQ(nMaxX, 2); + EXPECT_EQ(nMaxY, 18); + } + { + EXPECT_EQ(GDALComputeRasterMinMaxLocation(hBand, nullptr, nullptr, + nullptr, nullptr, nullptr, + nullptr), + CE_None); + } + GDALClose(hDS); +} + +// Test GDALComputeRasterMinMaxLocation +TEST_F(test_gdal, GDALComputeRasterMinMaxLocation_byte_min_max_optim) +{ + GDALDatasetUniquePtr poDS(GDALDriver::FromHandle(GDALGetDriverByName("MEM")) + ->Create("", 1, 4, 1, GDT_Byte, nullptr)); + std::array buffer = { + 1, ////////////////////////////////////////////////////////// + 0, ////////////////////////////////////////////////////////// + 255, ////////////////////////////////////////////////////////// + 1, ////////////////////////////////////////////////////////// + }; + GDALRasterIOExtraArg sExtraArg; + INIT_RASTERIO_EXTRA_ARG(sExtraArg); + EXPECT_EQ(poDS->GetRasterBand(1)->RasterIO( + GF_Write, 0, 0, 1, 4, buffer.data(), 1, 4, GDT_Byte, + sizeof(uint8_t), 1 * sizeof(uint8_t), &sExtraArg), + CE_None); + + double dfMin = 0; + double dfMax = 0; + int nMinX = -1; + int nMinY = -1; + int nMaxX = -1; + int nMaxY = -1; + EXPECT_EQ(poDS->GetRasterBand(1)->ComputeRasterMinMaxLocation( + &dfMin, &dfMax, &nMinX, &nMinY, &nMaxX, &nMaxY), + CE_None); + EXPECT_EQ(dfMin, 0); + EXPECT_EQ(dfMax, 255); + EXPECT_EQ(nMinX, 0); + EXPECT_EQ(nMinY, 1); + EXPECT_EQ(nMaxX, 0); + EXPECT_EQ(nMaxY, 2); +} + +// Test GDALComputeRasterMinMaxLocation +TEST_F(test_gdal, GDALComputeRasterMinMaxLocation_with_mask) +{ + GDALDatasetUniquePtr poDS(GDALDriver::FromHandle(GDALGetDriverByName("MEM")) + ->Create("", 2, 2, 1, GDT_Byte, nullptr)); + std::array buffer = { + 2, 10, ////////////////////////////////////////////////////////// + 4, 20, ////////////////////////////////////////////////////////// + }; + GDALRasterIOExtraArg sExtraArg; + INIT_RASTERIO_EXTRA_ARG(sExtraArg); + EXPECT_EQ(poDS->GetRasterBand(1)->RasterIO( + GF_Write, 0, 0, 2, 2, buffer.data(), 2, 2, GDT_Byte, + sizeof(uint8_t), 2 * sizeof(uint8_t), &sExtraArg), + CE_None); + + poDS->GetRasterBand(1)->CreateMaskBand(0); + std::array buffer_mask = { + 0, 255, ////////////////////////////////////////////////////////// + 255, 0, ////////////////////////////////////////////////////////// + }; + EXPECT_EQ(poDS->GetRasterBand(1)->GetMaskBand()->RasterIO( + GF_Write, 0, 0, 2, 2, buffer_mask.data(), 2, 2, GDT_Byte, + sizeof(uint8_t), 2 * sizeof(uint8_t), &sExtraArg), + CE_None); + + double dfMin = 0; + double dfMax = 0; + int nMinX = -1; + int nMinY = -1; + int nMaxX = -1; + int nMaxY = -1; + EXPECT_EQ(poDS->GetRasterBand(1)->ComputeRasterMinMaxLocation( + &dfMin, &dfMax, &nMinX, &nMinY, &nMaxX, &nMaxY), + CE_None); + EXPECT_EQ(dfMin, 4); + EXPECT_EQ(dfMax, 10); + EXPECT_EQ(nMinX, 0); + EXPECT_EQ(nMinY, 1); + EXPECT_EQ(nMaxX, 1); + EXPECT_EQ(nMaxY, 0); +} + } // namespace diff --git a/autotest/cpp/test_gdal_minmax_element.cpp b/autotest/cpp/test_gdal_minmax_element.cpp new file mode 100644 index 
000000000000..b6d681dda76d --- /dev/null +++ b/autotest/cpp/test_gdal_minmax_element.cpp @@ -0,0 +1,900 @@ +/////////////////////////////////////////////////////////////////////////////// +// +// Project: C++ Test Suite for GDAL/OGR +// Purpose: Test gdal_minmax_element.hpp +// Author: Even Rouault +// +/////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2023, Even Rouault +/* + * SPDX-License-Identifier: MIT + ****************************************************************************/ + +#include "gdal_unit_test.h" + +#include "gdal_minmax_element.hpp" + +#include "gtest_include.h" + +#include + +namespace +{ + +struct test_gdal_minmax_element : public ::testing::Test +{ +}; + +TEST_F(test_gdal_minmax_element, uint8) +{ + using T = uint8_t; + constexpr GDALDataType eDT = GDT_Byte; + T min_v = 3; + T max_v = 7; + { + T nodata = 0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), + static_cast(max_v - 1), + max_v, + static_cast(max_v - 1), + static_cast(min_v + 1), + min_v, + static_cast(min_v + 1)}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[125] = static_cast(min_v + 1); + v[126] = min_v; + v[127] = static_cast(min_v + 1); + v[128] = static_cast(max_v - 1); + v[129] = max_v; + v[130] = static_cast(max_v - 1); + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } + { + std::vector v(257, 0); + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, 0); + EXPECT_TRUE(idx_min == 0 || idx_min == 256) << idx_min; + } + { + std::vector v(257, 0); + v[127] = static_cast(min_v + 1); + v[255] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, 0); + EXPECT_EQ(v[idx_min], min_v); + } + { + std::vector v(259, static_cast((min_v + max_v) / 2)); + v[0] = min_v; + v[256] = static_cast(max_v - 1); + v[257] = max_v; + v[258] = static_cast(max_v - 1); + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + 
EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[0] = min_v; + v[127] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[127] = min_v; + v[0] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[0] = min_v; + v[129] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[129] = min_v; + v[0] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[129] = min_v; + v[256] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[256] = min_v; + v[129] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, 0); + v[65] = static_cast(max_v - 2); + v[66] = static_cast(max_v - 1); + v[129] = max_v; + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, 0); + EXPECT_EQ(v[idx_max], max_v); + } +} + +TEST_F(test_gdal_minmax_element, int8) +{ + using T = int8_t; + T min_v = -1; + T max_v = 3; + constexpr GDALDataType eDT = GDT_Int8; + { + T nodata = 0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = 
gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, uint16) +{ + using T = uint16_t; + constexpr GDALDataType eDT = GDT_UInt16; + T min_v = 1000; + T max_v = 2000; + { + T nodata = 0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, int16) +{ + using T = int16_t; + constexpr GDALDataType eDT = GDT_Int16; + T min_v = -1000; + T max_v = 2000; + { + T nodata = 0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + 
EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, uint32) +{ + using T = uint32_t; + constexpr GDALDataType eDT = GDT_UInt32; + T min_v = 10000000; + T max_v = 20000000; + { + T nodata = 0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, int32) +{ + using T = int32_t; + constexpr GDALDataType eDT = GDT_Int32; + T min_v = -10000000; + T max_v = 20000000; + { + T nodata = 0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto 
idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 1)); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, uint64) +{ + using T = uint64_t; + constexpr GDALDataType eDT = GDT_UInt64; + T min_v = 100000000000000; + T max_v = 200000000000000; + { + double nodata = 0; + std::vector v{max_v, static_cast(nodata), min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + double nodata = 0; + std::vector v{static_cast(nodata), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(static_cast(min_v + 1))); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, int64) +{ + using T = int64_t; + constexpr GDALDataType eDT = GDT_Int64; + T min_v = -100000000000000; + T max_v = 200000000000000; + { + double nodata = 0; + std::vector v{max_v, static_cast(nodata), min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } 
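+ // gdal::minmax_element() locates both extremes in one call and returns a
+ // pair of indices, minimum first and maximum second, which structured
+ // bindings can unpack directly.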
+ { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + double nodata = 0; + std::vector v{static_cast(nodata), max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{static_cast((min_v + max_v) / 2), + max_v - 1, + max_v, + max_v - 1, + min_v + 1, + min_v, + min_v + 1}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast((min_v + max_v) / 2)); + v[5] = min_v; + v[31] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 2)); + v[128] = static_cast(min_v + 1); + v[256] = min_v; + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(static_cast(min_v + 1))); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, float32) +{ + using T = float; + constexpr GDALDataType eDT = GDT_Float32; + T min_v = 1.0f; + T max_v = 1.5f; + { + T nodata = 2.0f; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 2.0f; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + T nodata = 2.0f; + std::vector v{std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), nodata, max_v, + min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + T nodata = std::numeric_limits::quiet_NaN(); + std::vector v{std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), nodata, max_v, + min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + max_v, + std::numeric_limits::quiet_NaN(), + min_v, + std::numeric_limits::quiet_NaN()}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + 
{ + std::vector v{max_v, std::numeric_limits::quiet_NaN(), min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, std::numeric_limits::quiet_NaN()); + v[125] = static_cast(min_v + 0.1f); + v[126] = min_v; + v[127] = static_cast(min_v + 0.1f); + v[128] = static_cast(max_v - 0.1f); + v[129] = max_v; + v[130] = static_cast(max_v - 0.1f); + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(33, 1.2f); + v[5] = min_v; + v[15] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(255, std::numeric_limits::quiet_NaN()); + v[v.size() - 2] = min_v; + v.back() = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 0.2f)); + v[128] = static_cast(min_v + 0.1f); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 0.1f)); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, float64) +{ + using T = double; + constexpr GDALDataType eDT = GDT_Float64; + T min_v = 1.0; + T max_v = 1.5; + { + T nodata = 2.0; + std::vector v{max_v, nodata, min_v}; + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, true, nodata); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = gdal::min_element(v.data(), 0, eDT, false, 0); + EXPECT_EQ(idx_min, 0); + } + { + auto idx_min = + gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = + gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + auto [idx_min, idx_max] = + gdal::minmax_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + EXPECT_EQ(v[idx_max], max_v); + } + } + { + T nodata = 2.0; + std::vector v{nodata, max_v, min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + T nodata = 2.0; + std::vector v{std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), nodata, max_v, + min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, true, nodata); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{max_v, std::numeric_limits::quiet_NaN(), min_v}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v{std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + max_v, + std::numeric_limits::quiet_NaN(), + min_v, + std::numeric_limits::quiet_NaN()}; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], 
min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(33, std::numeric_limits::quiet_NaN()); + v[5] = min_v; + v[15] = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(255, std::numeric_limits::quiet_NaN()); + v[v.size() - 2] = min_v; + v.back() = max_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_min], min_v); + auto idx_max = gdal::max_element(v.data(), v.size(), eDT, false, 0); + EXPECT_EQ(v[idx_max], max_v); + } + { + std::vector v(257, static_cast(min_v + 0.2)); + v[128] = static_cast(min_v + 0.1); + v[256] = min_v; + auto idx_min = gdal::min_element(v.data(), v.size(), eDT, true, + static_cast(min_v + 0.1)); + EXPECT_EQ(v[idx_min], min_v); + } +} + +TEST_F(test_gdal_minmax_element, unsupported) +{ + float v[2] = {0, 0}; + CPLErrorHandlerPusher oErrorHandler(CPLQuietErrorHandler); + { + CPLErrorReset(); + EXPECT_EQ(gdal::min_element(v, 1, GDT_CFloat32, false, 0), 0); + EXPECT_EQ(CPLGetLastErrorNo(), CPLE_NotSupported); + } + { + CPLErrorReset(); + EXPECT_EQ(gdal::max_element(v, 1, GDT_CFloat32, false, 0), 0); + EXPECT_EQ(CPLGetLastErrorNo(), CPLE_NotSupported); + } + { + CPLErrorReset(); + auto [idx_min, idx_max] = + gdal::minmax_element(v, 1, GDT_CFloat32, false, 0); + EXPECT_EQ(idx_min, 0); + EXPECT_EQ(idx_max, 0); + EXPECT_EQ(CPLGetLastErrorNo(), CPLE_NotSupported); + } +} + +} // namespace diff --git a/autotest/cpp/test_ogr.cpp b/autotest/cpp/test_ogr.cpp index b43f40d1cd65..8a91c026f57b 100644 --- a/autotest/cpp/test_ogr.cpp +++ b/autotest/cpp/test_ogr.cpp @@ -69,6 +69,7 @@ void testSpatialReferenceLeakOnCopy(OGRSpatialReference *poSRS) ASSERT_GT(nCurCount, nLastCount); nLastCount = nCurCount; + // coverity[copy_assignment_call] value3 = value; ASSERT_EQ(nLastCount, poSRS->GetReferenceCount()); } @@ -327,6 +328,7 @@ TEST_F(test_ogr, OGRGeometryCollection_copy_constructor_illegal_use) CPLErrorReset(); { CPLErrorHandlerPusher oPusher(CPLQuietErrorHandler); + // coverity[copy_assignment_call] *mp_as_gc = gc; } EXPECT_STREQ(CPLGetLastErrorMsg(), @@ -360,6 +362,7 @@ TEST_F(test_ogr, OGRCurvePolygon_copy_constructor_illegal_use) CPLErrorReset(); { CPLErrorHandlerPusher oPusher(CPLQuietErrorHandler); + // coverity[copy_assignment_call] *poly_as_cp = cp; } EXPECT_STREQ(CPLGetLastErrorMsg(), @@ -368,6 +371,60 @@ TEST_F(test_ogr, OGRCurvePolygon_copy_constructor_illegal_use) EXPECT_TRUE(poly.IsEmpty()); } +template void testMove() +{ + auto poSRS = new OGRSpatialReference(); + { + auto poOrigin = std::unique_ptr(make()); + ASSERT_TRUE(nullptr != poOrigin); + poOrigin->assignSpatialReference(poSRS); + + T valueCopy(*poOrigin); + const int refCountBefore = poSRS->GetReferenceCount(); + T fromMoved(std::move(*poOrigin)); + EXPECT_EQ(poSRS->GetReferenceCount(), refCountBefore); + + ASSERT_TRUE(CPL_TO_BOOL(fromMoved.Equals(&valueCopy))) + << valueCopy.getGeometryName() + << ": move constructor changed a value"; + EXPECT_EQ(fromMoved.getSpatialReference(), poSRS); + + T valueCopy2(valueCopy); + EXPECT_EQ(valueCopy.getSpatialReference(), poSRS); + T value3; + const int refCountBefore2 = poSRS->GetReferenceCount(); + value3 = std::move(valueCopy); + EXPECT_EQ(poSRS->GetReferenceCount(), refCountBefore2); + + ASSERT_TRUE(CPL_TO_BOOL(value3.Equals(&valueCopy2))) + << 
valueCopy2.getGeometryName() + << ": move assignment operator changed a value"; + EXPECT_EQ(value3.getSpatialReference(), poSRS); + } + EXPECT_EQ(poSRS->GetReferenceCount(), 1); + poSRS->Release(); +} + +TEST_F(test_ogr, geometry_move) +{ + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); + testMove(); +} + TEST_F(test_ogr, geometry_get_point) { { @@ -4241,9 +4298,8 @@ TEST_F(test_ogr, OGRCurve_reversePoints) TEST_F(test_ogr, transformWithOptions) { // Projected CRS to national geographic CRS (not including poles or antimeridian) - OGRGeometry *poGeom = nullptr; - OGRGeometryFactory::createFromWkt( - "LINESTRING(700000 6600000, 700001 6600001)", nullptr, &poGeom); + auto [poGeom, err] = OGRGeometryFactory::createFromWkt( + "LINESTRING(700000 6600000, 700001 6600001)"); ASSERT_NE(poGeom, nullptr); OGRSpatialReference oEPSG_2154; @@ -4254,12 +4310,12 @@ TEST_F(test_ogr, transformWithOptions) auto poCT = std::unique_ptr( OGRCreateCoordinateTransformation(&oEPSG_2154, &oEPSG_4171)); OGRGeometryFactory::TransformWithOptionsCache oCache; - poGeom = OGRGeometryFactory::transformWithOptions(poGeom, poCT.get(), - nullptr, oCache); - EXPECT_NEAR(poGeom->toLineString()->getX(0), 3, 1e-8); - EXPECT_NEAR(poGeom->toLineString()->getY(0), 46.5, 1e-8); - - delete poGeom; + auto poNewGeom = + std::unique_ptr(OGRGeometryFactory::transformWithOptions( + poGeom.get(), poCT.get(), nullptr, oCache)); + ASSERT_NE(poNewGeom, nullptr); + EXPECT_NEAR(poNewGeom->toLineString()->getX(0), 3, 1e-8); + EXPECT_NEAR(poNewGeom->toLineString()->getY(0), 46.5, 1e-8); } #ifdef HAVE_GEOS @@ -4268,10 +4324,8 @@ TEST_F(test_ogr, transformWithOptions) TEST_F(test_ogr, transformWithOptions_GEOS) { // Projected CRS to national geographic CRS including antimeridian - OGRGeometry *poGeom = nullptr; - OGRGeometryFactory::createFromWkt( - "LINESTRING(657630.64 4984896.17,815261.43 4990738.26)", nullptr, - &poGeom); + auto [poGeom, err] = OGRGeometryFactory::createFromWkt( + "LINESTRING(657630.64 4984896.17,815261.43 4990738.26)"); ASSERT_NE(poGeom, nullptr); OGRSpatialReference oEPSG_6329; @@ -4282,12 +4336,14 @@ TEST_F(test_ogr, transformWithOptions_GEOS) auto poCT = std::unique_ptr( OGRCreateCoordinateTransformation(&oEPSG_6329, &oEPSG_6318)); OGRGeometryFactory::TransformWithOptionsCache oCache; - poGeom = OGRGeometryFactory::transformWithOptions(poGeom, poCT.get(), - nullptr, oCache); - EXPECT_EQ(poGeom->getGeometryType(), wkbMultiLineString); - if (poGeom->getGeometryType() == wkbMultiLineString) - { - const auto poMLS = poGeom->toMultiLineString(); + auto poNewGeom = + std::unique_ptr(OGRGeometryFactory::transformWithOptions( + poGeom.get(), poCT.get(), nullptr, oCache)); + ASSERT_NE(poNewGeom, nullptr); + EXPECT_EQ(poNewGeom->getGeometryType(), wkbMultiLineString); + if (poNewGeom->getGeometryType() == wkbMultiLineString) + { + const auto poMLS = poNewGeom->toMultiLineString(); EXPECT_EQ(poMLS->getNumGeometries(), 2); if (poMLS->getNumGeometries() == 2) { @@ -4302,8 +4358,6 @@ TEST_F(test_ogr, transformWithOptions_GEOS) } } } - - delete poGeom; } #endif diff --git a/autotest/gcore/basic_test.py b/autotest/gcore/basic_test.py index 3b399d820925..33f9e58a36ab 100755 --- a/autotest/gcore/basic_test.py +++ b/autotest/gcore/basic_test.py @@ -987,3 +987,22 @@ def test_colorinterp(): assert name not in d d[name] = c assert gdal.GetColorInterpretationByName(name) == c + + +def 
test_ComputeMinMaxLocation(): + + ds = gdal.Open("data/byte.tif") + ret = ds.GetRasterBand(1).ComputeMinMaxLocation() + assert ( + ret.min == 74 + and ret.max == 255 + and ret.minX == 9 + and ret.minY == 17 + and ret.maxX == 2 + and ret.maxY == 18 + ) + + ds = gdal.GetDriverByName("MEM").Create("", 1, 1, 1, gdal.GDT_Float64) + ds.GetRasterBand(1).Fill(float("nan")) + ret = ds.GetRasterBand(1).ComputeMinMaxLocation() + assert ret is None diff --git a/autotest/gcore/cog.py b/autotest/gcore/cog.py index 3ef850bb019c..0e58f0e18af7 100755 --- a/autotest/gcore/cog.py +++ b/autotest/gcore/cog.py @@ -13,6 +13,7 @@ # SPDX-License-Identifier: MIT ############################################################################### +import os import struct import sys @@ -1935,3 +1936,29 @@ def test_cog_mask_band_overviews(tmp_vsimem): assert ds.GetRasterBand(1).IsMaskBand() assert ds.GetRasterBand(1).GetOverview(0).IsMaskBand() assert ds.GetRasterBand(1).GetOverview(1).IsMaskBand() + + +############################################################################### +# Verify that we can generate an output that is byte-identical to the expected golden file. + + +@pytest.mark.parametrize( + "src_filename,creation_options", + [ + ("data/cog/byte_little_endian_golden.tif", []), + ( + "data/cog/byte_little_endian_blocksize_16_predictor_standard_golden.tif", + ["BLOCKSIZE=16", "PREDICTOR=STANDARD"], + ), + ], +) +def test_cog_write_check_golden_file(tmp_path, src_filename, creation_options): + + out_filename = str(tmp_path / "test.tif") + with gdal.config_option("GDAL_TIFF_ENDIANNESS", "LITTLE"): + with gdal.Open(src_filename) as src_ds: + gdal.GetDriverByName("COG").CreateCopy( + out_filename, src_ds, options=creation_options + ) + assert os.stat(src_filename).st_size == os.stat(out_filename).st_size + assert open(src_filename, "rb").read() == open(out_filename, "rb").read() diff --git a/autotest/gcore/data/cog/byte_little_endian_blocksize_16_predictor_standard_golden.tif b/autotest/gcore/data/cog/byte_little_endian_blocksize_16_predictor_standard_golden.tif new file mode 100644 index 000000000000..aaade76e71f7 Binary files /dev/null and b/autotest/gcore/data/cog/byte_little_endian_blocksize_16_predictor_standard_golden.tif differ diff --git a/autotest/gcore/data/cog/byte_little_endian_golden.tif b/autotest/gcore/data/cog/byte_little_endian_golden.tif new file mode 100644 index 000000000000..4ace013ced02 Binary files /dev/null and b/autotest/gcore/data/cog/byte_little_endian_golden.tif differ diff --git a/autotest/gcore/data/gtiff/byte_little_endian_golden.tif b/autotest/gcore/data/gtiff/byte_little_endian_golden.tif new file mode 100644 index 000000000000..a4cbc947432c Binary files /dev/null and b/autotest/gcore/data/gtiff/byte_little_endian_golden.tif differ diff --git a/autotest/gcore/data/gtiff/byte_little_endian_tiled_lzw_golden.tif b/autotest/gcore/data/gtiff/byte_little_endian_tiled_lzw_golden.tif new file mode 100644 index 000000000000..b43311f0e16a Binary files /dev/null and b/autotest/gcore/data/gtiff/byte_little_endian_tiled_lzw_golden.tif differ diff --git a/autotest/gcore/data/gtiff/float32_little_endian_golden.tif b/autotest/gcore/data/gtiff/float32_little_endian_golden.tif new file mode 100644 index 000000000000..96e3dfd31339 Binary files /dev/null and b/autotest/gcore/data/gtiff/float32_little_endian_golden.tif differ diff --git a/autotest/gcore/data/gtiff/uint16_little_endian_golden.tif b/autotest/gcore/data/gtiff/uint16_little_endian_golden.tif new file mode 100644 index 
000000000000..2ea26292221b Binary files /dev/null and b/autotest/gcore/data/gtiff/uint16_little_endian_golden.tif differ diff --git a/autotest/gcore/misc.py b/autotest/gcore/misc.py index 18d1f4c806c0..62f8b29a5663 100755 --- a/autotest/gcore/misc.py +++ b/autotest/gcore/misc.py @@ -13,6 +13,7 @@ # SPDX-License-Identifier: MIT ############################################################################### +import datetime import os import shutil @@ -721,6 +722,70 @@ def test_misc_13(): assert out_ds is None +############################################################################### +# Test parsing of CPL_DEBUG and CPL_TIMESTAMP + + +@pytest.fixture +def debug_output(): + + messages = [] + + def handle(ecls, ecode, emsg): + messages.append(emsg) + + def log_message(category, message): + messages.clear() + gdal.Debug(category, message) + return messages[0] if messages else None + + log_message.handle = handle + + with gdaltest.error_handler(handle): + yield log_message + + +@pytest.mark.parametrize( + "booleans", + [("YES", "NO"), ("TRUE", "FALSE"), ("ON", "OFF"), ("1", "0")], + ids="_".join, +) +def test_misc_cpl_debug(debug_output, booleans): + + on, off = booleans + + assert debug_output("GDAL", "msg") is None + + with gdal.config_option("CPL_DEBUG", off): + assert debug_output("GDAL", "msg") is None + + with gdal.config_option("CPL_DEBUG", on): + assert debug_output("GDAL", "message") == "GDAL: message" + + with gdal.config_option("CPL_TIMESTAMP", off): + assert debug_output("GDAL", "message") == "GDAL: message" + + with gdal.config_option("CPL_TIMESTAMP", on): + output = debug_output("GDAL", "message") + assert str(datetime.datetime.now().year) in output + assert output.endswith("GDAL: message") + + +def test_misc_cpl_debug_filtering(debug_output): + + with gdal.config_option("CPL_DEBUG", "GDAL"): + assert debug_output("GDAL", "msg") == "GDAL: msg" + assert debug_output("GDAL_WARP", "msg") is None + assert debug_output("", "msg") == ": msg" + + with gdal.config_option("CPL_DEBUG", "GDAL_WARP_TRANSLATE_ETC"): + assert debug_output("GDAL", "msg") == "GDAL: msg" + assert debug_output("TRANSLATE", "msg") == "TRANSLATE: msg" + + with gdal.config_option("CPL_DEBUG", ""): + assert debug_output("GDAL", "msg") == "GDAL: msg" + + ############################################################################### # Test ConfigureLogging() diff --git a/autotest/gcore/tiff_write.py b/autotest/gcore/tiff_write.py index f803b95d24b8..3e2c8d9de67b 100755 --- a/autotest/gcore/tiff_write.py +++ b/autotest/gcore/tiff_write.py @@ -11901,3 +11901,30 @@ def test_tiff_write_band_IMAGERY(tmp_vsimem): ) with gdal.Open(filename2) as ds: assert ds.GetRasterBand(1).GetMetadata_Dict("IMAGERY") == {"foo": "bar"} + + +############################################################################### +# Verify that we can generate an output that is byte-identical to the expected golden file. 
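The golden-file tests that follow (GTiff below, then JPEG2000 and netCDF) all share the same shape, distilled in this minimal sketch; the helper name and its parameters are illustrative, not part of the patch:

import os

from osgeo import gdal


def check_identical_to_golden(golden_path, out_path, driver_name, creation_options):
    # Regenerate the file from the golden copy itself, then require that the
    # size matches (a cheap first check) before comparing every byte.
    with gdal.Open(golden_path) as src_ds:
        gdal.GetDriverByName(driver_name).CreateCopy(
            out_path, src_ds, options=creation_options
        )
    assert os.stat(golden_path).st_size == os.stat(out_path).st_size
    assert open(golden_path, "rb").read() == open(out_path, "rb").read()

For GTiff, the test below additionally prepends ENDIANNESS=LITTLE to the creation options so that the byte-level comparison does not depend on the host byte order.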
+ + +@pytest.mark.parametrize( + "src_filename,creation_options", + [ + ("data/gtiff/byte_little_endian_golden.tif", []), + ("data/gtiff/uint16_little_endian_golden.tif", []), + ("data/gtiff/float32_little_endian_golden.tif", []), + ( + "data/gtiff/byte_little_endian_tiled_lzw_golden.tif", + ["TILED=YES", "BLOCKXSIZE=16", "BLOCKYSIZE=16", "COMPRESS=LZW"], + ), + ], +) +def test_tiff_write_check_golden_file(tmp_path, src_filename, creation_options): + + out_filename = str(tmp_path / "test.tif") + with gdal.Open(src_filename) as src_ds: + gdal.GetDriverByName("GTiff").CreateCopy( + out_filename, src_ds, options=["ENDIANNESS=LITTLE"] + creation_options + ) + assert os.stat(src_filename).st_size == os.stat(out_filename).st_size + assert open(src_filename, "rb").read() == open(out_filename, "rb").read() diff --git a/autotest/gdrivers/data/jpeg2000/byte_lossless_openjp2_golden.jp2 b/autotest/gdrivers/data/jpeg2000/byte_lossless_openjp2_golden.jp2 new file mode 100644 index 000000000000..049871c52ec8 Binary files /dev/null and b/autotest/gdrivers/data/jpeg2000/byte_lossless_openjp2_golden.jp2 differ diff --git a/autotest/gdrivers/data/netcdf/byte_nc3_golden.nc b/autotest/gdrivers/data/netcdf/byte_nc3_golden.nc new file mode 100644 index 000000000000..09b43e1a71fb Binary files /dev/null and b/autotest/gdrivers/data/netcdf/byte_nc3_golden.nc differ diff --git a/autotest/gdrivers/jp2openjpeg.py b/autotest/gdrivers/jp2openjpeg.py index 387a15cc0756..1aa3439bba66 100755 --- a/autotest/gdrivers/jp2openjpeg.py +++ b/autotest/gdrivers/jp2openjpeg.py @@ -3920,3 +3920,29 @@ def test_jp2openjpeg_unsupported_srs_for_gmljp2(tmp_vsimem): assert ds.GetSpatialRef().IsSame(ref_srs) # Check that we do *not* have a GMLJP2 box assert "xml:gml.root-instance" not in ds.GetMetadataDomainList() + + +############################################################################### +# Verify that we can generate an output that is byte-identical to the expected golden file. +# (might be risky depending on libopenjp2...) + + +@pytest.mark.parametrize( + "src_filename,creation_options", + [ + # Created with gdal_translate autotest/gcore/data/byte.tif autotest/gdrivers/data/jpeg2000/byte_lossless_openjp2_golden.jp2 -of jp2openjpeg -co QUALITY=100 -co REVERSIBLE=YES -co COMMENT= + ( + "data/jpeg2000/byte_lossless_openjp2_golden.jp2", + ["QUALITY=100", "REVERSIBLE=YES", "COMMENT="], + ), + ], +) +def test_jp2openjpeg_write_check_golden_file(tmp_path, src_filename, creation_options): + + out_filename = str(tmp_path / "test.jp2") + with gdal.Open(src_filename) as src_ds: + gdal.GetDriverByName("JP2OpenJPEG").CreateCopy( + out_filename, src_ds, options=creation_options + ) + assert os.stat(src_filename).st_size == os.stat(out_filename).st_size + assert open(src_filename, "rb").read() == open(out_filename, "rb").read() diff --git a/autotest/gdrivers/netcdf.py b/autotest/gdrivers/netcdf.py index 21c0b3026ca6..6622974f49b4 100755 --- a/autotest/gdrivers/netcdf.py +++ b/autotest/gdrivers/netcdf.py @@ -6583,3 +6583,36 @@ def test_netcdf_extra_dim_no_georef(tmp_path): ds = gdal.Open(fname) assert ds.RasterCount == 4 assert ds.ReadRaster() == src_ds.ReadRaster() + + +############################################################################### +# Verify that we can generate an output that is byte-identical to the expected golden file. +# (might be risky depending on libnetcdf...)
+ + +@pytest.mark.parametrize( + "src_filename,golden_file,creation_options", + [ + # Created with gdal_translate autotest/gcore/data/byte.tif autotest/gdrivers/data/netcdf/byte_nc3_golden.nc -co WRITE_GDAL_VERSION=NO -co WRITE_GDAL_HISTORY=NO -co FORMAT=NC + ( + "../gcore/data/byte.tif", + "data/netcdf/byte_nc3_golden.nc", + ["WRITE_GDAL_VERSION=NO", "WRITE_GDAL_HISTORY=NO", "FORMAT=NC"], + ), + ], +) +# I have a feeling that netCDF output might be host-endianness dependent... +@pytest.mark.skipif( + sys.byteorder != "little", reason="only supported on little-endian hosts" +) +def test_netcdf_write_check_golden_file( + tmp_path, src_filename, golden_file, creation_options +): + + out_filename = str(tmp_path / "test.nc") + with gdal.Open(src_filename) as src_ds: + gdal.GetDriverByName("netCDF").CreateCopy( + out_filename, src_ds, options=creation_options + ) + assert os.stat(golden_file).st_size == os.stat(out_filename).st_size + assert open(golden_file, "rb").read() == open(out_filename, "rb").read() diff --git a/autotest/gnm/gnm_test.py b/autotest/gnm/gnm_test.py index 530d2db8fdfd..f82cbaf71270 100755 --- a/autotest/gnm/gnm_test.py +++ b/autotest/gnm/gnm_test.py @@ -36,7 +36,7 @@ def test_gnm_filenetwork_create(): pass drv = gdal.GetDriverByName("GNMFile") - ds = drv.Create( + with drv.Create( "tmp/", 0, 0, @@ -47,17 +47,15 @@ "net_description=Test file based GNM", "net_srs=EPSG:4326", ], - ) - # cast to GNM - dn = gnm.CastToNetwork(ds) - assert dn is not None - assert dn.GetVersion() == 100, "GNM: Check GNM version failed" - assert dn.GetName() == "test_gnm", "GNM: Check GNM name failed" - assert ( - dn.GetDescription() == "Test file based GNM" - ), "GNM: Check GNM description failed" - - dn = None + ) as ds: + # cast to GNM + dn = gnm.CastToNetwork(ds) + assert dn is not None + assert dn.GetVersion() == 100, "GNM: Check GNM version failed" + assert dn.GetName() == "test_gnm", "GNM: Check GNM name failed" + assert ( + dn.GetDescription() == "Test file based GNM" + ), "GNM: Check GNM description failed" ############################################################################### diff --git a/autotest/ogr/data/gpkg/poly_golden.gpkg b/autotest/ogr/data/gpkg/poly_golden.gpkg new file mode 100644 index 000000000000..ab45c9747eee Binary files /dev/null and b/autotest/ogr/data/gpkg/poly_golden.gpkg differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtable new file mode 100644 index 000000000000..ca18a72cc503 Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtable differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtablx new file mode 100644 index 000000000000..d3c0d9e20c02 Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000001.gdbtablx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000002.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000002.gdbtable new file mode 100644 index 000000000000..b4ed54fbfb33 Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000002.gdbtable differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000002.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000002.gdbtablx new file mode 100644 index 000000000000..29323984e52a Binary files /dev/null and
b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000002.gdbtablx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtable new file mode 100644 index 000000000000..29b5907954ae Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtable differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtablx new file mode 100644 index 000000000000..4cf22da58b1f Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000003.gdbtablx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000004.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000004.gdbtable new file mode 100644 index 000000000000..e731d3f0c724 Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000004.gdbtable differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000004.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000004.gdbtablx new file mode 100644 index 000000000000..0209415ef5ba Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000004.gdbtablx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000005.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000005.gdbtable new file mode 100644 index 000000000000..d717f9fa1a7c Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000005.gdbtable differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000005.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000005.gdbtablx new file mode 100644 index 000000000000..259a71f00d2c Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000005.gdbtablx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000006.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000006.gdbtable new file mode 100644 index 000000000000..7c8b607cb27b Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000006.gdbtable differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000006.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000006.gdbtablx new file mode 100644 index 000000000000..92567c81d8c1 Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000006.gdbtablx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtable new file mode 100644 index 000000000000..3e11ac8a531f Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtable differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtablx new file mode 100644 index 000000000000..8a2df70bfbfb Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000007.gdbtablx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbindexes b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbindexes new file mode 100644 index 000000000000..cc24e2a06b9b Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbindexes differ diff --git 
a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtable b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtable new file mode 100644 index 000000000000..5cde63fc1c8e Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtable differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtablx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtablx new file mode 100644 index 000000000000..c8318b15c834 Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.gdbtablx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.spx b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.spx new file mode 100644 index 000000000000..44769469081c Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/a00000009.spx differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/gdb b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/gdb new file mode 100644 index 000000000000..506f9c628294 Binary files /dev/null and b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/gdb differ diff --git a/autotest/ogr/data/openfilegdb/polygon_golden.gdb/timestamps b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/timestamps new file mode 100644 index 000000000000..05d2b9440ec0 --- /dev/null +++ b/autotest/ogr/data/openfilegdb/polygon_golden.gdb/timestamps @@ -0,0 +1 @@ +ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿ \ No newline at end of file diff --git a/autotest/ogr/data/shp/poly_golden/poly.dbf b/autotest/ogr/data/shp/poly_golden/poly.dbf new file mode 100644 index 000000000000..ad76f9f42a5b Binary files /dev/null and b/autotest/ogr/data/shp/poly_golden/poly.dbf differ diff --git a/autotest/ogr/data/shp/poly_golden/poly.prj b/autotest/ogr/data/shp/poly_golden/poly.prj new file mode 100644 index 000000000000..fec0ee28909b --- /dev/null +++ b/autotest/ogr/data/shp/poly_golden/poly.prj @@ -0,0 +1 @@ +PROJCS["British_National_Grid",GEOGCS["GCS_OSGB_1936",DATUM["D_OSGB_1936",SPHEROID["Airy_1830",6377563.396,299.3249646]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Transverse_Mercator"],PARAMETER["False_Easting",400000.0],PARAMETER["False_Northing",-100000.0],PARAMETER["Central_Meridian",-2.0],PARAMETER["Scale_Factor",0.9996012717],PARAMETER["Latitude_Of_Origin",49.0],UNIT["Meter",1.0]] \ No newline at end of file diff --git a/autotest/ogr/data/shp/poly_golden/poly.shp b/autotest/ogr/data/shp/poly_golden/poly.shp new file mode 100644 index 000000000000..98951531b18c Binary files /dev/null and b/autotest/ogr/data/shp/poly_golden/poly.shp differ diff --git a/autotest/ogr/data/shp/poly_golden/poly.shx b/autotest/ogr/data/shp/poly_golden/poly.shx new file mode 100644 index 000000000000..134898b3895e Binary files /dev/null and b/autotest/ogr/data/shp/poly_golden/poly.shx differ diff --git a/autotest/ogr/ogr_aivector.py b/autotest/ogr/ogr_aivector.py new file mode 100755 index 000000000000..32c194666cc9 --- /dev/null +++ b/autotest/ogr/ogr_aivector.py @@ -0,0 +1,33 @@ +#!/usr/bin/env pytest 
+###############################################################################
+# $Id$
+#
+# Project:  GDAL/OGR Test Suite
+# Purpose:  Test read functionality for OGR AIVector driver.
+# Author:   Even Rouault
+#
+###############################################################################
+# Copyright (c) 2024, Even Rouault
+#
+# SPDX-License-Identifier: MIT
+###############################################################################
+
+import gdaltest
+import pytest
+
+pytestmark = pytest.mark.require_driver("AIVector")
+
+
+def test_ogr_aivector_test_ogrsf():
+
+    import test_cli_utilities
+
+    if test_cli_utilities.get_test_ogrsf_path() is None:
+        pytest.skip()
+
+    ret = gdaltest.runexternal(
+        test_cli_utilities.get_test_ogrsf_path() + " -ro AIVector:foo.bin"
+    )
+
+    assert "INFO" in ret
+    assert "ERROR" not in ret
diff --git a/autotest/ogr/ogr_gpkg.py b/autotest/ogr/ogr_gpkg.py
index 78e882d94892..a14fc22c98f6 100755
--- a/autotest/ogr/ogr_gpkg.py
+++ b/autotest/ogr/ogr_gpkg.py
@@ -10766,3 +10766,45 @@ def test_gpkg_secure_delete(tmp_vsimem):
         with ds.ExecuteSQL("PRAGMA secure_delete") as sql_lyr:
             f = sql_lyr.GetNextFeature()
             assert f.GetField(0) == 0
+
+
+###############################################################################
+# Verify that we can generate an output that is byte-identical to the expected golden file.
+
+
+@pytest.mark.parametrize(
+    "src_filename",
+    [
+        # Generated with: ogr2ogr autotest/ogr/data/gpkg/poly_golden.gpkg autotest/ogr/data/poly.shp --config OGR_CURRENT_DATE="2000-01-01T:00:00:00.000Z" -nomd
+        "data/gpkg/poly_golden.gpkg",
+    ],
+)
+def test_ogr_gpkg_write_check_golden_file(tmp_path, src_filename):
+
+    out_filename = str(tmp_path / "test.gpkg")
+    with gdal.config_option("OGR_CURRENT_DATE", "2000-01-01T:00:00:00.000Z"):
+        gdal.VectorTranslate(out_filename, src_filename)
+
+    # First compare sqlite3 dumps, if the sqlite3 binary is available
+    import subprocess
+
+    try:
+        golden_dump = subprocess.check_output(
+            ["sqlite3", src_filename, ".dump"]
+        ).decode("utf-8")
+        got_dump = subprocess.check_output(["sqlite3", out_filename, ".dump"]).decode(
+            "utf-8"
+        )
+        assert got_dump == golden_dump
+        # print("Identical sqlite3 dump")
+    except Exception:
+        pass
+
+    if get_sqlite_version() >= (3, 46, 0):
+        assert os.stat(src_filename).st_size == os.stat(out_filename).st_size
+        golden_data = bytearray(open(src_filename, "rb").read())
+        got_data = bytearray(open(out_filename, "rb").read())
+        # Zero out the SQLite version number at bytes 96-99. Cf https://www.sqlite.org/fileformat.html
+        golden_data[96] = golden_data[97] = golden_data[98] = golden_data[99] = 0
+        got_data[96] = got_data[97] = got_data[98] = got_data[99] = 0
+        assert got_data == golden_data
diff --git a/autotest/ogr/ogr_mem.py b/autotest/ogr/ogr_mem.py
index 3d26ac31a00b..e763eb031d7c 100755
--- a/autotest/ogr/ogr_mem.py
+++ b/autotest/ogr/ogr_mem.py
@@ -2872,9 +2872,12 @@ def test_ogr_mem_arrow_json():
     lyr.CreateField(field_def)
 
     stream = lyr.GetArrowStreamAsPyArrow()
-    md = stream.schema["field_json"].metadata
-    assert b"ARROW:extension:name" in md
-    assert md[b"ARROW:extension:name"] == b"arrow.json"
+    field_schema = stream.schema["field_json"]
+    # Since pyarrow 18, the field type is reported as "extension"
+    if str(field_schema.type) != "extension":
+        md = field_schema.metadata
+        assert b"ARROW:extension:name" in md
+        assert md[b"ARROW:extension:name"] == b"arrow.json"
 
 
 ###############################################################################
diff --git a/autotest/ogr/ogr_openfilegdb_write.py b/autotest/ogr/ogr_openfilegdb_write.py
index ad2ccf036bb6..d2b0486267a4 100755
--- a/autotest/ogr/ogr_openfilegdb_write.py
+++ b/autotest/ogr/ogr_openfilegdb_write.py
@@ -13,6 +13,7 @@
 # SPDX-License-Identifier: MIT
 ###############################################################################
 
+import os
 import struct
 import sys
 
@@ -4571,3 +4572,32 @@ def test_ogr_openfilegdb_write_OGRUnsetMarker(tmp_vsimem):
     lyr = ds.GetLayer(0)
     f = lyr.GetNextFeature()
     assert f["i32"] == -21121
+
+
+###############################################################################
+# Verify that we can generate an output that is byte-identical to the expected golden file.
+
+
+@pytest.mark.parametrize(
+    "src_directory",
+    [
+        # Generated with:
+        # ogr2ogr autotest/ogr/data/openfilegdb/polygon_golden.gdb '{"type":"Feature","properties":{"foo":"bar"},"geometry":{"type":"Polygon","coordinates":[[[0,0],[0,1],[1,0],[0,0]]]}}' --config OPENFILEGDB_CREATOR GDAL --config OPENFILEGDB_REPRODUCIBLE_UUID YES -f openfilegdb
+        "data/openfilegdb/polygon_golden.gdb",
+    ],
+)
+def test_ogr_openfilegdb_write_check_golden_file(tmp_path, src_directory):
+
+    out_directory = str(tmp_path / "test.gdb")
+    with gdaltest.config_options(
+        {"OPENFILEGDB_CREATOR": "GDAL", "OPENFILEGDB_REPRODUCIBLE_UUID": "YES"}
+    ):
+        gdal.VectorTranslate(out_directory, src_directory, format="OpenFileGDB")
+    for filename in os.listdir(src_directory):
+        src_filename = os.path.join(src_directory, filename)
+        out_filename = os.path.join(out_directory, filename)
+
+        assert os.stat(src_filename).st_size == os.stat(out_filename).st_size, filename
+        assert (
+            open(src_filename, "rb").read() == open(out_filename, "rb").read()
+        ), filename
diff --git a/autotest/ogr/ogr_shape.py b/autotest/ogr/ogr_shape.py
index 889d0c688abd..d777a11e41bc 100755
--- a/autotest/ogr/ogr_shape.py
+++ b/autotest/ogr/ogr_shape.py
@@ -6134,3 +6134,34 @@ def test_ogr_shape_read_date_empty_string():
     lyr = ds.GetLayer(0)
     f = lyr.GetNextFeature()
     assert f["date"] is None
+
+
+###############################################################################
+# Verify that we can generate an output that is byte-identical to the expected golden file.
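# Note that byte-identity sometimes only holds after masking fields that
# legitimately vary: the GeoPackage test above zeroes the SQLite library
# version stamp stored at offsets 96-99 of the database header before
# comparing. A minimal sketch of that masking step (hypothetical helper,
# not part of the test suite):
#
#   def mask_sqlite_version(raw: bytes) -> bytes:
#       data = bytearray(raw)
#       data[96:100] = b"\x00\x00\x00\x00"  # sqlite version number field
#       return bytes(data)
#
# For multi-file formats such as shapefiles, determinism instead comes from
# pinning metadata like DBF_DATE_LAST_UPDATE, as done below.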
+ + +@pytest.mark.parametrize( + "src_directory", + [ + # Generated with: + # ogr2ogr autotest/ogr/data/shp/poly_golden autotest/ogr/data/poly.shp -lco DBF_DATE_LAST_UPDATE=2000-01-01 + "data/shp/poly_golden", + ], +) +def test_ogr_shape_write_check_golden_file(tmp_path, src_directory): + + out_directory = str(tmp_path / "test") + gdal.VectorTranslate( + out_directory, + src_directory, + format="ESRI Shapefile", + layerCreationOptions=["DBF_DATE_LAST_UPDATE=2000-01-01"], + ) + for filename in os.listdir(src_directory): + src_filename = os.path.join(src_directory, filename) + out_filename = os.path.join(out_directory, filename) + + assert os.stat(src_filename).st_size == os.stat(out_filename).st_size, filename + assert ( + open(src_filename, "rb").read() == open(out_filename, "rb").read() + ), filename diff --git a/autotest/postinstall/test_gdal-config.sh b/autotest/postinstall/test_gdal-config.sh index 19a656ee04ae..e77088e12e7b 100755 --- a/autotest/postinstall/test_gdal-config.sh +++ b/autotest/postinstall/test_gdal-config.sh @@ -102,7 +102,7 @@ set -eu CXX="${CXX:-c++}" echo "Test that we can compile all headers with C++11 using ${CXX}" for i in $prefix/include/*.h; do - ${CXX} -std=c++11 -c $(${GDAL_CONFIG} --cflags) $i; + ${CXX} -Wall -Wpedantic -std=c++11 -c $(${GDAL_CONFIG} --cflags) $i; done echo "$ERRORS tests failed out of $NTESTS" diff --git a/autotest/pyscripts/test_pct.py b/autotest/pyscripts/test_pct.py index 88212f866c43..c549685ea5ff 100755 --- a/autotest/pyscripts/test_pct.py +++ b/autotest/pyscripts/test_pct.py @@ -40,6 +40,9 @@ def script_path(): def test_rgb2pct_help(script_path): + if gdaltest.is_travis_branch("sanitize"): + pytest.skip("fails on sanitize for unknown reason") + assert "ERROR" not in test_py_scripts.run_py_script( script_path, "rgb2pct", "--help" ) @@ -51,6 +54,9 @@ def test_rgb2pct_help(script_path): def test_rgb2pct_version(script_path): + if gdaltest.is_travis_branch("sanitize"): + pytest.skip("fails on sanitize for unknown reason") + assert "ERROR" not in test_py_scripts.run_py_script( script_path, "rgb2pct", "--version" ) diff --git a/ci/travis/osx/install.sh b/ci/travis/osx/install.sh index 00e392ae0633..95f435e7e052 100755 --- a/ci/travis/osx/install.sh +++ b/ci/travis/osx/install.sh @@ -27,6 +27,9 @@ CFLAGS="-Wextra -Werror" CXXFLAGS="-Wextra -Werror" cmake .. \ -DBUILD_CSHARP_BINDINGS=OFF \ -DCMAKE_UNITY_BUILD=ON +echo "Check that GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS:BOOL=ON" +(grep "GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS:BOOL=ON" CMakeCache.txt > /dev/null && echo "yes") || (echo "Missing" && /bin/false) + NPROC=$(sysctl -n hw.ncpu) echo "NPROC=${NPROC}" make -j${NPROC} diff --git a/ci/travis/osx/script.sh b/ci/travis/osx/script.sh index a28b9eeac6be..7b212dfc9deb 100755 --- a/ci/travis/osx/script.sh +++ b/ci/travis/osx/script.sh @@ -7,6 +7,9 @@ export PROJ_NETWORK=ON echo 'Running CPP unit tests' (cd build && make quicktest) +echo 'Running CPP perftests' +(cd build && ctest -V -R perf) + echo 'Running Python unit tests' # install test dependencies sudo -H pip3 install -r autotest/requirements.txt diff --git a/cmake/modules/thirdparty/FindDotnet.cmake b/cmake/modules/thirdparty/FindDotnet.cmake index 945688b6eb47..499634228d63 100644 --- a/cmake/modules/thirdparty/FindDotnet.cmake +++ b/cmake/modules/thirdparty/FindDotnet.cmake @@ -5,24 +5,24 @@ # # FindDotnet # ---------- -# +# # Find DotNet executable, and initialize functions for adding dotnet projects. 
-# +# # Results are reported in the following variables:: -# +# # DOTNET_FOUND - True if dotnet executable is found # DOTNET_EXE - Dotnet executable # DOTNET_VERSION - Dotnet version as reported by dotnet executable # DOTNET_SDKS - Dotnet SDKs loaded as reported by dotnet executable # NUGET_EXE - Nuget executable (WIN32 only) # NUGET_CACHE_PATH - Nuget package cache path -# +# # The following functions are defined to add dotnet/msbuild projects: -# +# # ADD_DOTNET -- add a project to be built by dotnet. -# +# # ``` -# ADD_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] +# ADD_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] # [CONFIG configuration] # [PLATFORM platform] # [PACKAGE nuget_package_dependencies... ] @@ -34,12 +34,12 @@ # [ARGUMENTS additional_build_args...] # [PACK_ARGUMENTS additional_pack_args...]) # ``` -# -# RUN_DOTNET -- Run a project with `dotnet run`. The `OUTPUT` argument represents artifacts +# +# RUN_DOTNET -- Run a project with `dotnet run`. The `OUTPUT` argument represents artifacts # produced by running the .NET program, and can be consumed from other build steps. -# +# # ``` -# RUN_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] +# RUN_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] # [ARGUMENTS program_args...] # [OUTPUT outputs...] # [CONFIG configuration] @@ -49,11 +49,11 @@ # [CUSTOM_BUILDPROPS value....] # [SOURCES additional_file_dependencies... ]) # ``` -# +# # ADD_MSBUILD -- add a project to be built by msbuild. Windows-only. When building in Unix systems, msbuild targets are skipped. -# +# # ``` -# ADD_MSBUILD( [RELEASE|DEBUG] [X86|X64|ANYCPU] +# ADD_MSBUILD( [RELEASE|DEBUG] [X86|X64|ANYCPU] # [CONFIG configuration] # [PLATFORM platform] # [PACKAGE output_nuget_packages... ] @@ -68,7 +68,7 @@ # and if the program fails to build or run, the build fails. Currently only .NET Core App framework is supported. # Multiple smoke tests will be run one-by-one to avoid global resource conflicts. # -# SMOKETEST_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] +# SMOKETEST_DOTNET( [RELEASE|DEBUG] [X86|X64|ANYCPU] # [ARGUMENTS program_args...] # [CONFIG configuration] # [PLATFORM platform] @@ -76,12 +76,12 @@ # [OUTPUT_PATH output_path relative to cmake binary output dir] # [CUSTOM_BUILDPROPS value....] # [SOURCES additional_file_dependencies... ]) -# +# # For all the above functions, `RELEASE|DEBUG` overrides `CONFIG`, `X86|X64|ANYCPU` overrides PLATFORM. # # # DOTNET_REGISTER_LOCAL_REPOSITORY -- register a local NuGet package repository. -# +# # ``` # DOTNET_REGISTER_LOCAL_REPOSITORY(repo_name repo_path) # ``` @@ -97,7 +97,7 @@ # [ARGUMENTS additional_dotnet_test_args...] # [OUTPUT_PATH output_path relative to cmake binary output dir]) # ``` -# +# # GEN_DOTNET_PROPS -- Generates a Directory.Build.props file. The created file is populated with MSBuild properties: # - DOTNET_PACKAGE_VERSION: a version string that can be referenced in the actual project file as $(DOTNET_PACKAGE_VERSION). # The version string value can be set with PACKAGE_VERSION argument, and defaults to '1.0.0'. 
@@ -111,10 +111,7 @@ # [PACKAGE_VERSION version] # [XML_INJECT xml_injection]) # ``` -# -# Require 3.5 for batch copy multiple files - -cmake_minimum_required(VERSION 3.5.0) +# IF(DOTNET_FOUND) RETURN() @@ -184,11 +181,11 @@ ENDFUNCTION() FUNCTION(DOTNET_GET_DEPS _DN_PROJECT arguments) CMAKE_PARSE_ARGUMENTS( # prefix - _DN + _DN # options (flags) - "RELEASE;DEBUG;X86;X64;ANYCPU;NETCOREAPP" + "RELEASE;DEBUG;X86;X64;ANYCPU;NETCOREAPP" # oneValueArgs - "NAME;CONFIG;PLATFORM;VERSION;OUTPUT_PATH" + "NAME;CONFIG;PLATFORM;VERSION;OUTPUT_PATH" # multiValueArgs "PACKAGE;DEPENDS;ARGUMENTS;PACK_ARGUMENTS;OUTPUT;SOURCES;CUSTOM_BUILDPROPS,BUILD_OPTIONS" # the input arguments @@ -199,7 +196,7 @@ FUNCTION(DOTNET_GET_DEPS _DN_PROJECT arguments) GET_FILENAME_COMPONENT(_DN_projname "${_DN_PROJECT}" NAME) STRING(REGEX REPLACE "\\.[^.]*$" "" _DN_projname_noext ${_DN_projname}) - FILE(GLOB_RECURSE DOTNET_deps + FILE(GLOB_RECURSE DOTNET_deps ${_DN_proj_dir}/*.cs ${_DN_proj_dir}/*.fs ${_DN_proj_dir}/*.vb @@ -328,14 +325,14 @@ ENDMACRO() MACRO(DOTNET_BUILD_COMMANDS) IF(${DOTNET_IS_MSBUILD}) - SET(build_dotnet_cmds + SET(build_dotnet_cmds COMMAND ${CMAKE_COMMAND} -E echo "======= Building msbuild project ${DOTNET_PROJNAME} [${DOTNET_CONFIG} ${DOTNET_PLATFORM}]" COMMAND ${NUGET_EXE} restore -Force ${DOTNET_PROJPATH} COMMAND ${DOTNET_EXE} msbuild ${DOTNET_PROJPATH} /t:Clean ${DOTNET_BUILD_PROPERTIES} /p:Configuration="${DOTNET_CONFIG}" COMMAND ${DOTNET_EXE} msbuild ${DOTNET_PROJPATH} /t:Build ${DOTNET_BUILD_PROPERTIES} /p:Configuration="${DOTNET_CONFIG}" ${DOTNET_ARGUMENTS}) SET(build_dotnet_type "msbuild") ELSE() - SET(build_dotnet_cmds + SET(build_dotnet_cmds COMMAND ${CMAKE_COMMAND} -E echo "======= Building .NET project ${DOTNET_PROJNAME} [${DOTNET_CONFIG} ${DOTNET_PLATFORM}]") foreach (_src ${DOTNET_SOURCES} ) LIST(APPEND build_dotnet_cmds COMMAND ${DOTNET_EXE} add ${DOTNET_PROJPATH} reference ${_src}) @@ -362,10 +359,10 @@ MACRO(DOTNET_BUILD_COMMANDS) MESSAGE("-- Adding ${build_dotnet_type} project ${DOTNET_PROJPATH} (no nupkg)") ENDIF() endif() - - LIST(APPEND build_dotnet_cmds COMMAND ${DOTNET_EXE} pack - --no-build --no-restore ${DOTNET_PROJPATH} - -c ${DOTNET_CONFIG} ${DOTNET_BUILD_PROPERTIES} ${DOTNET_PACK_OPTIONS} + + LIST(APPEND build_dotnet_cmds COMMAND ${DOTNET_EXE} pack + --no-build --no-restore ${DOTNET_PROJPATH} + -c ${DOTNET_CONFIG} ${DOTNET_BUILD_PROPERTIES} ${DOTNET_PACK_OPTIONS} --output ${CMAKE_CURRENT_BINARY_DIR} -p:PackageVersion=${DOTNET_PACKAGE_VERSION} ) LIST(APPEND DOTNET_OUTPUTS ${CMAKE_CURRENT_BINARY_DIR}/${DOTNET_PROJNAME}.buildtimestamp) LIST(APPEND build_dotnet_cmds COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/${DOTNET_PROJNAME}.buildtimestamp) @@ -416,7 +413,7 @@ FUNCTION(RUN_DOTNET DOTNET_PROJECT) COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/${DOTNET_PROJNAME}.runtimestamp WORKING_DIRECTORY ${DOTNET_OUTPUT_PATH}) ADD_CUSTOM_TARGET( - ${DOTNET_PROJNAME} + ${DOTNET_PROJNAME} DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${DOTNET_PROJNAME}.runtimestamp ${DOTNET_RUN_OUTPUT}) ADD_DOTNET_DEPENDENCY_TARGETS() ENDFUNCTION() @@ -469,9 +466,9 @@ FUNCTION(GEN_DOTNET_PROPS target_props_file) # prefix _DNP # options (flags) - "" + "" # oneValueArgs - "PACKAGE_VERSION;XML_INJECT" + "PACKAGE_VERSION;XML_INJECT" # multiValueArgs "" # the input arguments @@ -496,4 +493,4 @@ ENDFUNCTION() MESSAGE("-- Found .NET toolchain: ${DOTNET_EXE} (version ${DOTNET_VERSION})") -SET(DOTNET_FOUND TRUE) \ No newline at end of file +SET(DOTNET_FOUND TRUE) diff --git 
a/doc/source/about_no_title.rst b/doc/source/about_no_title.rst
index c60c858cce3e..819c4aca9f4a 100644
--- a/doc/source/about_no_title.rst
+++ b/doc/source/about_no_title.rst
@@ -1,4 +1,4 @@
-GDAL is a translator library for raster and vector geospatial data formats that is released under an MIT style Open Source :ref:`license` by the `Open Source Geospatial Foundation`_. As a library, it presents a single raster abstract data model and single vector abstract data model to the calling application for all supported formats. It also comes with a variety of useful command line utilities for data translation and processing. The `NEWS`_ page describes the October 2024 GDAL/OGR 3.9.3 release.
+GDAL is a translator library for raster and vector geospatial data formats that is released under an MIT style Open Source :ref:`license` by the `Open Source Geospatial Foundation`_. As a library, it presents a single raster abstract data model and single vector abstract data model to the calling application for all supported formats. It also comes with a variety of useful command line utilities for data translation and processing. The `NEWS`_ page describes the November 2024 GDAL/OGR 3.10.0 release.
 
 .. note::
@@ -17,7 +17,7 @@ GDAL is a translator library for raster and vector geospatial data formats that
     :target: `Open Source Geospatial Foundation`_
 
 .. _`Open Source Geospatial Foundation`: http://www.osgeo.org/
-.. _`NEWS`: https://github.com/OSGeo/gdal/blob/v3.9.3/NEWS.md
+.. _`NEWS`: https://github.com/OSGeo/gdal/blob/v3.10.0/NEWS.md
 
 See :ref:`software_using_gdal`
diff --git a/doc/source/api/python_samples.rst b/doc/source/api/python_samples.rst
index a2831f63ffaa..294a2bce48c4 100644
--- a/doc/source/api/python_samples.rst
+++ b/doc/source/api/python_samples.rst
@@ -47,6 +47,7 @@ Python Raster Sample scripts
    - hsv_merge: Merge greyscale image into RGB image as intensity in HSV space.
    - gdal_ls: Display the list of files in a virtual directory, like /vsicurl or /vsizip
    - gdal_cp: Copy a virtual file
+   - gdal_minmax_location: Report the locations where the minimum/maximum values of a raster are reached.
 
 Python Vector Sample scripts
 ------------------------------
diff --git a/doc/source/community/code_of_conduct.rst b/doc/source/community/code_of_conduct.rst
index 9dc38d42ac5b..b151edb5daf9 100644
--- a/doc/source/community/code_of_conduct.rst
+++ b/doc/source/community/code_of_conduct.rst
@@ -19,7 +19,7 @@ claims any affiliation with the GDAL project.
 
 It applies to in-person events (such as conferences and related social
 events), IRC, public and private mailing lists, the issue tracker, the wiki, blogs,
-Twitter, and any other forums which the community uses for communication and
+social media, and any other forums which the community uses for communication and
 interactions.
 
 This code is not exhaustive or complete. It serves to distill our common
diff --git a/doc/source/download.rst b/doc/source/download.rst
index ca336557fdc3..847f3a8e3b1d 100644
--- a/doc/source/download.rst
+++ b/doc/source/download.rst
@@ -18,11 +18,11 @@ Source Code
 Current Release
 ...............
 
-* **2024-10-14** `gdal-3.9.3.tar.gz`_ `3.9.3 Release Notes`_ (`3.9.3 md5`_)
+* **2024-11-01** `gdal-3.10.0.tar.gz`_ `3.10.0 Release Notes`_ (`3.10.0 md5`_)
 
-.. _`3.9.3 Release Notes`: https://github.com/OSGeo/gdal/blob/v3.9.3/NEWS.md
-.. _`gdal-3.9.3.tar.gz`: https://github.com/OSGeo/gdal/releases/download/v3.9.3/gdal-3.9.3.tar.gz
-.. _`3.9.3 md5`: https://github.com/OSGeo/gdal/releases/download/v3.9.3/gdal-3.9.3.tar.gz.md5
+.. _`3.10.0 Release Notes`: https://github.com/OSGeo/gdal/blob/v3.10.0/NEWS.md
+.. _`gdal-3.10.0.tar.gz`: https://github.com/OSGeo/gdal/releases/download/v3.10.0/gdal-3.10.0.tar.gz
+.. _`3.10.0 md5`: https://github.com/OSGeo/gdal/releases/download/v3.10.0/gdal-3.10.0.tar.gz.md5
 
 Past Releases
 .............
diff --git a/doc/source/download_past.rst b/doc/source/download_past.rst
index f732485793ef..afff6413df04 100644
--- a/doc/source/download_past.rst
+++ b/doc/source/download_past.rst
@@ -5,6 +5,12 @@
 Past Releases
 =============
 
+* **2024-10-14** `gdal-3.9.3.tar.gz`_ `3.9.3 Release Notes`_ (`3.9.3 md5`_)
+
+.. _`3.9.3 Release Notes`: https://github.com/OSGeo/gdal/blob/v3.9.3/NEWS.md
+.. _`gdal-3.9.3.tar.gz`: https://github.com/OSGeo/gdal/releases/download/v3.9.3/gdal-3.9.3.tar.gz
+.. _`3.9.3 md5`: https://github.com/OSGeo/gdal/releases/download/v3.9.3/gdal-3.9.3.tar.gz.md5
+
 * **2024-08-16** `gdal-3.9.2.tar.gz`_ `3.9.2 Release Notes`_ (`3.9.2 md5`_)
 
 .. _`3.9.2 Release Notes`: https://github.com/OSGeo/gdal/blob/v3.9.2/NEWS.md
diff --git a/doc/source/drivers/vector/aivector.rst b/doc/source/drivers/vector/aivector.rst
new file mode 100644
index 000000000000..41c230f45dd2
--- /dev/null
+++ b/doc/source/drivers/vector/aivector.rst
@@ -0,0 +1,32 @@
+.. _vector.aivector:
+
+Artificial intelligence powered vector driver
+=============================================
+
+.. versionadded:: 3.11
+
+.. shortname:: AIVector
+
+.. built_in_by_default::
+
+This driver builds on many years of self-funded investment by the GDAL team
+in AI technologies to bring you the ultimate driver that can read any vector
+format. After that one, no need for any new vector driver!
+
+The open syntax is ``AIVector:{filename}``. Alternatively, specify the filename
+directly and force the use of the AIVector driver with the ``-if`` flag of
+ogrinfo or ogr2ogr. There are no options at all. Just enjoy the true power of AI.
+
+.. note:: We are open to external investors to develop the write side of the driver.
+
+Examples
+--------
+
+::
+
+    ogrinfo -if AIVector undocumented_proprietary_format.bin -al
+
+.. note::
+
+    The above works even if you make a typo in the filename. The driver will
+    automatically figure out the filename you meant.
diff --git a/doc/source/drivers/vector/index.rst b/doc/source/drivers/vector/index.rst
index 859bba292217..411189072692 100644
--- a/doc/source/drivers/vector/index.rst
+++ b/doc/source/drivers/vector/index.rst
@@ -23,6 +23,7 @@ Vector drivers
    :hidden:
 
    adbc
+   aivector
    amigocloud
    arrow
    avcbin
diff --git a/doc/source/software_using_gdal.rst b/doc/source/software_using_gdal.rst
index 7886d4df8b77..e5ae5e161eab 100644
--- a/doc/source/software_using_gdal.rst
+++ b/doc/source/software_using_gdal.rst
@@ -44,6 +44,7 @@ Free and open source
 - `NextGIS Web `_ Server-side Web GIS and a framework for storage, visualization and permissions management of all kinds
 - `Ogr2 GUI `_ Graphical user interface for ogr2ogr
 - `OpenCPN `_ A concise ChartPlotter/Navigator. A cross-platform ship-borne GUI application.
+- `OpenDataCube `_ FOSS software package that simplifies the management and analysis of large amounts of satellite imagery and other Earth observation data.
 - `OpenEV `_ An OpenGL/GTK/Python based graphical viewer which exclusively uses GDAL for raster access.
 - `OFGT `_ a collection of utilities for multipurpose forest monitoring under the `Open Foris Initiative `_ Open Foris Initiative.
- `OpenFLUID `_ a software platform for spatial modelling of landscape dynamics
diff --git a/docker/README.md b/docker/README.md
index 390cea55b7f7..74d3a1d865a3 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -47,7 +47,7 @@ See [alpine-normal/Dockerfile](alpine-normal/Dockerfile)
 # Ubuntu based
 
 Ubuntu version:
-* 24.04 for GDAL 3.9
+* 24.04 for GDAL 3.9 and 3.10
 * 22.04 for GDAL 3.6, 3.7 and 3.8
 * 20.04 for GDAL 3.4 and 3.5
 
@@ -100,11 +100,11 @@ If you are getting a ``<jemalloc>: arena 0 background thread creation failed (1)
 # Images of releases
 
-Tagged images of recent past releases are available. The last ones (at time of writing) are for GDAL 3.9.3 and PROJ 9.5.0, for linux/amd64 and linux/arm64:
-* ghcr.io/osgeo/gdal:alpine-small-3.9.3
-* ghcr.io/osgeo/gdal:alpine-normal-3.9.3
-* ghcr.io/osgeo/gdal:ubuntu-small-3.9.3
-* ghcr.io/osgeo/gdal:ubuntu-full-3.9.3
+Tagged images of recent past releases are available. The most recent ones (at the time of writing) are for GDAL 3.10.0 and PROJ 9.5.0, for linux/amd64 and linux/arm64:
+* ghcr.io/osgeo/gdal:alpine-small-3.10.0
+* ghcr.io/osgeo/gdal:alpine-normal-3.10.0
+* ghcr.io/osgeo/gdal:ubuntu-small-3.10.0
+* ghcr.io/osgeo/gdal:ubuntu-full-3.10.0
 
 ## Multi-arch Images
diff --git a/docker/ubuntu-full/bh-gdal.sh b/docker/ubuntu-full/bh-gdal.sh
index 3c66862a6c73..b4a7659463c6 100755
--- a/docker/ubuntu-full/bh-gdal.sh
+++ b/docker/ubuntu-full/bh-gdal.sh
@@ -46,7 +46,9 @@ wget -q "https://github.com/${GDAL_REPOSITORY}/archive/${GDAL_VERSION}.tar.gz" \
     cd build
     # GDAL_USE_TIFF_INTERNAL=ON to use JXL
     export GDAL_CMAKE_EXTRA_OPTS=""
-    if test "${GCC_ARCH}" != "x86_64"; then
+    if test "${GCC_ARCH}" = "x86_64"; then
+        export GDAL_CMAKE_EXTRA_OPTS="${GDAL_CMAKE_EXTRA_OPTS} -DENABLE_IPO=ON"
+    else
         export GDAL_CMAKE_EXTRA_OPTS="${GDAL_CMAKE_EXTRA_OPTS} -DPDFIUM_INCLUDE_DIR="
     fi
     export JAVA_ARCH=""
diff --git a/docker/ubuntu-small/Dockerfile b/docker/ubuntu-small/Dockerfile
index 524f52b35488..65cf4fd17b3a 100644
--- a/docker/ubuntu-small/Dockerfile
+++ b/docker/ubuntu-small/Dockerfile
@@ -153,6 +153,11 @@ RUN --mount=type=cache,id=ubuntu-small-gdal,target=$HOME/.cache \
     && if test "x${GDAL_BUILD_IS_RELEASE:-}" = "x"; then \
         export GDAL_SHA1SUM=${GDAL_VERSION}; \
     fi \
+    && if test "${GCC_ARCH}" = "x86_64"; then \
+        export GDAL_CMAKE_EXTRA_OPTS="-DENABLE_IPO=ON"; \
+    else \
+        export GDAL_CMAKE_EXTRA_OPTS=""; \
+    fi \
     && mkdir gdal \
     && wget -q https://github.com/${GDAL_REPOSITORY}/archive/${GDAL_VERSION}.tar.gz -O - \
         | tar xz -C gdal --strip-components=1 \
@@ -183,7 +188,7 @@ RUN --mount=type=cache,id=ubuntu-small-gdal,target=$HOME/.cache \
     -DPROJ_INCLUDE_DIR="/build${PROJ_INSTALL_PREFIX-/usr/local}/include" \
     -DPROJ_LIBRARY="/build${PROJ_INSTALL_PREFIX-/usr/local}/lib/libinternalproj.so" \
     -DGDAL_USE_TIFF_INTERNAL=ON \
-    -DGDAL_USE_GEOTIFF_INTERNAL=ON \
+    -DGDAL_USE_GEOTIFF_INTERNAL=ON ${GDAL_CMAKE_EXTRA_OPTS} \
     -DBUILD_TESTING=OFF \
     && ninja \
     && DESTDIR="/build" ninja install \
diff --git a/frmts/drivers.ini b/frmts/drivers.ini
index 65b3a6666dbd..1e59d34eea35 100644
--- a/frmts/drivers.ini
+++ b/frmts/drivers.ini
@@ -273,6 +273,9 @@ Tiger
 AVCBin
 AVCE00
 
+# Last but not least
+AIVector
+
 # End of OGR drivers
 
 # Put here drivers that absolutely need to look for side car
diff --git a/frmts/gtiff/tifvsi.cpp b/frmts/gtiff/tifvsi.cpp
index afc77e171e2e..a889f3cad534 100644
--- a/frmts/gtiff/tifvsi.cpp
+++ b/frmts/gtiff/tifvsi.cpp
@@ -439,7 +439,10 @@ static void VSI_TIFFSetOpenOptions(TIFFOpenOptions *opts)
 {
     const auto nUsableRAM = CPLGetUsablePhysicalRAM();
     if (nUsableRAM > 0)
+    {
+        // coverity[return_overflow]
         return nUsableRAM / 10 * 9;
+    }
     else
         return 0;
 }
diff --git a/frmts/rcm/rcmdataset.cpp b/frmts/rcm/rcmdataset.cpp
index 087521b98345..46248c14c7c1 100644
--- a/frmts/rcm/rcmdataset.cpp
+++ b/frmts/rcm/rcmdataset.cpp
@@ -79,7 +79,10 @@ static double *InterpolateValues(CSLConstList papszList, int tableSize,
                                  int pixelFirstLutValue)
 {
     /* Allocate the right LUT size according to the product range pixel */
-    double *table = static_cast<double *>(CPLCalloc(sizeof(double), tableSize));
+    double *table =
+        static_cast<double *>(VSI_CALLOC_VERBOSE(sizeof(double), tableSize));
+    if (!table)
+        return nullptr;
 
     if (stepSize <= 0)
     {
@@ -301,38 +304,16 @@ RCMRasterBand::~RCMRasterBand()
 
 CPLErr RCMRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
 {
-    int nRequestYSize;
-    int nRequestXSize;
-
-    /* -------------------------------------------------------------------- */
-    /*      If the last strip is partial, we need to avoid                  */
-    /*      over-requesting.  We also need to initialize the extra part     */
-    /*      of the block to zero.                                           */
-    /* -------------------------------------------------------------------- */
-    if ((nBlockYOff + 1) * nBlockYSize > nRasterYSize)
-    {
-        nRequestYSize = nRasterYSize - nBlockYOff * nBlockYSize;
-        memset(pImage, 0,
-               GDALGetDataTypeSizeBytes(eDataType) * nBlockXSize * nBlockYSize);
-    }
-    else
-    {
-        nRequestYSize = nBlockYSize;
-    }
+    int nRequestXSize = 0;
+    int nRequestYSize = 0;
+    GetActualBlockSize(nBlockXOff, nBlockYOff, &nRequestXSize, &nRequestYSize);
 
-    /*-------------------------------------------------------------------- */
-    /*      If the input imagery is tiled, also need to avoid over-        */
-    /*      requesting in the X-direction.                                 */
-    /* ------------------------------------------------------------------- */
-    if ((nBlockXOff + 1) * nBlockXSize > nRasterXSize)
+    // Zero-initialize partial right-most and bottom-most blocks
+    if (nRequestXSize < nBlockXSize || nRequestYSize < nBlockYSize)
     {
-        nRequestXSize = nRasterXSize - nBlockXOff * nBlockXSize;
         memset(pImage, 0,
-               GDALGetDataTypeSizeBytes(eDataType) * nBlockXSize * nBlockYSize);
-    }
-    else
-    {
-        nRequestXSize = nBlockXSize;
+               static_cast<size_t>(GDALGetDataTypeSizeBytes(eDataType)) *
+                   nBlockXSize * nBlockYSize);
     }
 
     int dataTypeSize = GDALGetDataTypeSizeBytes(eDataType);
@@ -355,14 +336,16 @@ CPLErr RCMRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
             GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
             nRequestXSize, nRequestYSize, pImage, nRequestXSize,
             nRequestYSize, bandFileType, 2, nullptr, dataTypeSize,
-            dataTypeSize * nBlockXSize, bandFileSize, nullptr);
+            static_cast<GSpacing>(dataTypeSize) * nBlockXSize, bandFileSize,
+            nullptr);
     }
     else if (twoBandComplex && this->isNITF)
     {
         return poBand->RasterIO(
             GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
             nRequestXSize, nRequestYSize, pImage, nRequestXSize, nRequestYSize,
-            eDataType, 0, dataTypeSize * nBlockXSize, nullptr);
+            eDataType, 0, static_cast<GSpacing>(dataTypeSize) * nBlockXSize,
+            nullptr);
     }
 
     if (poRCMDataset->IsComplexData())
@@ -377,7 +360,8 @@ CPLErr RCMRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
             GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
             nRequestXSize, nRequestYSize, pImage, nRequestXSize,
             nRequestYSize, bandFileType, 2, nullptr, dataTypeSize,
-            nBlockXSize * dataTypeSize, bandFileSize, nullptr);
+            static_cast<GSpacing>(dataTypeSize) * nBlockXSize, bandFileSize,
+            nullptr);
     }
 
     // case: band file == this band
@@ -388,7 +372,8 @@ CPLErr RCMRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
         return poBand->RasterIO(
             GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
             nRequestXSize, nRequestYSize, pImage, nRequestXSize, nRequestYSize,
-            eDataType, 0, dataTypeSize * nBlockXSize, nullptr);
+            eDataType, 0, static_cast<GSpacing>(dataTypeSize) * nBlockXSize,
+            nullptr);
     }
     else
     {
@@ -457,7 +442,8 @@ void RCMCalibRasterBand::ReadLUT()
     }
 
     /* Get the Pixel Per range */
-    if (this->stepSize == INT_MIN || this->numberOfValues == INT_MIN ||
+    if (this->stepSize == 0 || this->stepSize == INT_MIN ||
+        this->numberOfValues == INT_MIN ||
         abs(this->stepSize) > INT_MAX / abs(this->numberOfValues))
     {
         CPLError(CE_Failure, CPLE_AppDefined,
@@ -476,15 +462,26 @@ void RCMCalibRasterBand::ReadLUT()
         return;
     }
 
+    // Avoid excessive memory allocation
+    if (this->m_nTableSize > 1000 * 1000)
+    {
+        CPLError(CE_Failure, CPLE_NotSupported, "Too many elements in LUT: %d",
+                 this->m_nTableSize);
+        return;
+    }
+
     /* Allocate the right LUT size according to the product range pixel */
     this->m_nfTable =
         InterpolateValues(aosLUTList.List(), this->m_nTableSize,
                           this->stepSize, this->numberOfValues,
                           this->pixelFirstLutValue);
+    if (!this->m_nfTable)
+        return;
 
-    const size_t nLen =
-        this->m_nTableSize * max_space_for_string;  // 32 max + space
-    char *lut_gains = static_cast<char *>(CPLMalloc(nLen));
-    memset(lut_gains, 0, nLen);
+    // 32 max + space
+    char *lut_gains = static_cast<char *>(
+        VSI_CALLOC_VERBOSE(this->m_nTableSize, max_space_for_string));
+    if (!lut_gains)
+        return;
 
     for (int i = 0; i < this->m_nTableSize; i++)
     {
@@ -588,29 +585,35 @@ void RCMCalibRasterBand::ReadNoiseLevels()
                     atoi(CPLGetXMLValue(psNumberOfValues, "", "0"));
                 const char *noiseLevelValues =
                     CPLGetXMLValue(psNoiseLevelValues, "", "");
-                char **papszNoiseLevelList = CSLTokenizeString2(
-                    noiseLevelValues, " ", CSLT_HONOURSTRINGS);
-                /* Get the Pixel Per range */
-                this->m_nTableNoiseLevelsSize =
-                    abs(this->stepSizeNoiseLevels) *
-                    abs(this->numberOfValuesNoiseLevels);
-
-                if ((EQUAL(calibType, "Beta Nought") &&
-                     this->m_eCalib == Beta0) ||
-                    (EQUAL(calibType, "Sigma Nought") &&
-                     this->m_eCalib == Sigma0) ||
-                    (EQUAL(calibType, "Gamma") && this->m_eCalib == Gamma))
+                if (this->stepSizeNoiseLevels > 0 &&
+                    this->numberOfValuesNoiseLevels != INT_MIN &&
+                    abs(this->numberOfValuesNoiseLevels) <
+                        INT_MAX / this->stepSizeNoiseLevels)
                 {
-                    /* Allocate the right Noise Levels size according to the
-                     * product range pixel */
-                    this->m_nfTableNoiseLevels = InterpolateValues(
-                        papszNoiseLevelList, this->m_nTableNoiseLevelsSize,
-                        this->stepSizeNoiseLevels,
-                        this->numberOfValuesNoiseLevels,
-                        this->pixelFirstLutValueNoiseLevels);
-                }
+                    char **papszNoiseLevelList = CSLTokenizeString2(
+                        noiseLevelValues, " ", CSLT_HONOURSTRINGS);
+                    /* Get the Pixel Per range */
+                    this->m_nTableNoiseLevelsSize =
+                        abs(this->stepSizeNoiseLevels) *
+                        abs(this->numberOfValuesNoiseLevels);
+
+                    if ((EQUAL(calibType, "Beta Nought") &&
+                         this->m_eCalib == Beta0) ||
+                        (EQUAL(calibType, "Sigma Nought") &&
+                         this->m_eCalib == Sigma0) ||
+                        (EQUAL(calibType, "Gamma") && this->m_eCalib == Gamma))
+                    {
+                        /* Allocate the right Noise Levels size according to the
+                         * product range pixel */
+                        this->m_nfTableNoiseLevels = InterpolateValues(
+                            papszNoiseLevelList, this->m_nTableNoiseLevelsSize,
+                            this->stepSizeNoiseLevels,
+                            this->numberOfValuesNoiseLevels,
+                            this->pixelFirstLutValueNoiseLevels);
+                    }
 
-                CSLDestroy(papszNoiseLevelList);
+                    CSLDestroy(papszNoiseLevelList);
+                }
 
                 if (this->m_nfTableNoiseLevels != nullptr)
                 {
@@ -673,53 +676,32 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
 {
     CPLErr eErr;
-    int nRequestYSize;
-    int nRequestXSize;
+    int nRequestXSize = 0;
+    int nRequestYSize = 0;
+    GetActualBlockSize(nBlockXOff, nBlockYOff, &nRequestXSize, &nRequestYSize);
 
-    /* -------------------------------------------------------------------- */
-    /*      If the last strip is partial, we need to avoid                  */
-    /*      over-requesting.  We also need to initialize the extra part     */
-    /*      of the block to zero.                                           */
-    /* -------------------------------------------------------------------- */
-    if ((nBlockYOff + 1) * nBlockYSize > nRasterYSize)
+    // Zero-initialize partial right-most and bottom-most blocks
+    if (nRequestXSize < nBlockXSize || nRequestYSize < nBlockYSize)
     {
-        nRequestYSize = nRasterYSize - nBlockYOff * nBlockYSize;
         memset(pImage, 0,
-               GDALGetDataTypeSizeBytes(eDataType) * nBlockXSize * nBlockYSize);
-    }
-    else
-    {
-        nRequestYSize = nBlockYSize;
-    }
-
-    /*-------------------------------------------------------------------- */
-    /*      If the input imagery is tiled, also need to avoid over-        */
-    /*      requesting in the X-direction.                                 */
-    /* ------------------------------------------------------------------- */
-    if ((nBlockXOff + 1) * nBlockXSize > nRasterXSize)
-    {
-        nRequestXSize = nRasterXSize - nBlockXOff * nBlockXSize;
-        memset(pImage, 0,
-               GDALGetDataTypeSizeBytes(eDataType) * nBlockXSize * nBlockYSize);
-    }
-    else
-    {
-        nRequestXSize = nBlockXSize;
+               static_cast<size_t>(GDALGetDataTypeSizeBytes(eDataType)) *
+                   nBlockXSize * nBlockYSize);
     }
 
     if (this->m_eOriginalType == GDT_CInt16)
     {
-        GInt16 *pnImageTmp;
         /* read in complex values */
-        pnImageTmp = static_cast<GInt16 *>(
-            CPLMalloc(nBlockXSize * nBlockYSize *
-                      GDALGetDataTypeSizeBytes(m_eOriginalType)));
+        GInt16 *panImageTmp = static_cast<GInt16 *>(
+            VSI_MALLOC3_VERBOSE(nBlockXSize, nBlockYSize,
+                                GDALGetDataTypeSizeBytes(m_eOriginalType)));
+        if (!panImageTmp)
+            return CE_Failure;
         if (m_poBandDataset->GetRasterCount() == 2)
         {
             eErr = m_poBandDataset->RasterIO(
                 GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
-                nRequestXSize, nRequestYSize, pnImageTmp, nRequestXSize,
+                nRequestXSize, nRequestYSize, panImageTmp, nRequestXSize,
                 nRequestYSize, this->m_eOriginalType, 2, nullptr, 4,
                 nBlockXSize * 4, 4, nullptr);
@@ -728,7 +710,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
                                 nBlockXOff * nBlockXSize,
                                 nBlockYOff * nBlockYSize,
                                 nRequestXSize, nRequestYSize,
-                                pnImageTmp, nRequestXSize, nRequestYSize,
+                                panImageTmp, nRequestXSize, nRequestYSize,
                                 GDT_Int32, 2, nullptr, 4, nBlockXSize * 4, 2,
                                 nullptr);
             */
@@ -737,7 +719,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
        {
            eErr = m_poBandDataset->RasterIO(
                GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
-                nRequestXSize, nRequestYSize, pnImageTmp, nRequestXSize,
+                nRequestXSize, nRequestYSize, panImageTmp, nRequestXSize,
                nRequestYSize, this->m_eOriginalType, 1, nullptr, 4,
                nBlockXSize * 4, 0, nullptr);
@@ -747,7 +729,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
                                 nBlockXOff * nBlockXSize,
                                 nBlockYOff * nBlockYSize,
                                 nRequestXSize, nRequestYSize,
-                                pnImageTmp, nRequestXSize, nRequestYSize,
+                                panImageTmp, nRequestXSize, nRequestYSize,
                                 GDT_UInt32, 1, nullptr, 4, nBlockXSize * 4, 0,
                                 nullptr);
            */
@@ -771,8 +753,8 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
                const int nTruePixOff = (i * nBlockXSize) + j;
 
                // Formula for Complex Q+J
-                const float real = static_cast<float>(pnImageTmp[nPixOff]);
-                const float img = static_cast<float>(pnImageTmp[nPixOff + 1]);
+                const float real = static_cast<float>(panImageTmp[nPixOff]);
+                const float img = static_cast<float>(panImageTmp[nPixOff + 1]);
                const float digitalValue = (real * real) + (img * img);
                const float lutValue =
                    static_cast<float>(m_nfTable[nBlockXOff * nBlockXSize + j]);
@@ -782,7 +764,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
            }
        }
 
-        CPLFree(pnImageTmp);
+        CPLFree(panImageTmp);
    }
 
    // If the underlying file is NITF CFloat32
@@ -790,25 +772,26 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
             this->m_eOriginalType == GDT_CFloat64)
    {
        /* read in complex values */
-        float *pnImageTmp;
-
        const int dataTypeSize =
            GDALGetDataTypeSizeBytes(this->m_eOriginalType);
        const GDALDataType bandFileType = this->m_eOriginalType;
-        const int bandFileSize = GDALGetDataTypeSizeBytes(bandFileType);
+        const int bandFileDataTypeSize = GDALGetDataTypeSizeBytes(bandFileType);
 
        /* read the original image complex values in a temporary image space */
-        pnImageTmp = static_cast<float *>(
-            CPLMalloc(2 * nBlockXSize * nBlockYSize * bandFileSize));
+        float *pafImageTmp = static_cast<float *>(VSI_MALLOC3_VERBOSE(
+            nBlockXSize, nBlockYSize, 2 * bandFileDataTypeSize));
+        if (!pafImageTmp)
+            return CE_Failure;
 
        eErr =
            // I and Q from each band are pixel-interleaved into this complex
            // band
            m_poBandDataset->RasterIO(
                GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
-                nRequestXSize, nRequestYSize, pnImageTmp, nRequestXSize,
+                nRequestXSize, nRequestYSize, pafImageTmp, nRequestXSize,
                nRequestYSize, bandFileType, 2, nullptr, dataTypeSize,
-                nBlockXSize * dataTypeSize, bandFileSize, nullptr);
+                static_cast<GSpacing>(dataTypeSize) * nBlockXSize,
+                bandFileDataTypeSize, nullptr);
 
        /* calibrate the complex values */
        for (int i = 0; i < nRequestYSize; i++)
@@ -820,8 +803,8 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
                const int nTruePixOff = (i * nBlockXSize) + j;
 
                // Formula for Complex Q+J
-                const float real = static_cast<float>(pnImageTmp[nPixOff]);
-                const float img = static_cast<float>(pnImageTmp[nPixOff + 1]);
+                const float real = pafImageTmp[nPixOff];
+                const float img = pafImageTmp[nPixOff + 1];
                const float digitalValue = (real * real) + (img * img);
                const float lutValue =
                    static_cast<float>(m_nfTable[nBlockXOff * nBlockXSize + j]);
@@ -831,7 +814,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
            }
        }
 
-        CPLFree(pnImageTmp);
+        CPLFree(pafImageTmp);
    }
 
    else if (this->m_eOriginalType == GDT_Float32)
@@ -900,13 +883,14 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
    else if (this->m_eOriginalType == GDT_UInt16)
    {
-        GUInt16 *pnImageTmp;
        /* read in detected values */
-        pnImageTmp = static_cast<GUInt16 *>(CPLMalloc(
-            nBlockXSize * nBlockYSize * GDALGetDataTypeSizeBytes(GDT_UInt16)));
+        GUInt16 *panImageTmp = static_cast<GUInt16 *>(VSI_MALLOC3_VERBOSE(
+            nBlockXSize, nBlockYSize, GDALGetDataTypeSizeBytes(GDT_UInt16)));
+        if (!panImageTmp)
+            return CE_Failure;
        eErr = m_poBandDataset->RasterIO(
            GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
-            nRequestXSize, nRequestYSize, pnImageTmp, nRequestXSize,
+            nRequestXSize, nRequestYSize, panImageTmp, nRequestXSize,
            nRequestYSize, GDT_UInt16, 1, nullptr, 2, nBlockXSize * 2, 0,
            nullptr);
@@ -918,7 +902,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
                const int nPixOff = (i * nBlockXSize) + j;
 
                const float digitalValue =
-                    static_cast<float>(pnImageTmp[nPixOff]);
+                    static_cast<float>(panImageTmp[nPixOff]);
                const float A =
                    static_cast<float>(m_nfTable[nBlockXOff * nBlockXSize + j]);
                reinterpret_cast<float *>(pImage)[nPixOff] =
@@ -927,16 +911,18 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
                    A;
            }
        }
-        CPLFree(pnImageTmp);
+        CPLFree(panImageTmp);
    }
 
    /* Ticket #2104: Support for ScanSAR products */
    else if (this->m_eOriginalType == GDT_Byte)
    {
-        GByte *pnImageTmp;
-        pnImageTmp = static_cast<GByte *>(CPLMalloc(nBlockXSize * nBlockYSize));
+        GByte *pabyImageTmp =
+            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nBlockXSize, nBlockYSize));
+        if (!pabyImageTmp)
+            return CE_Failure;
        eErr = m_poBandDataset->RasterIO(
            GF_Read, nBlockXOff * nBlockXSize, nBlockYOff * nBlockYSize,
-            nRequestXSize, nRequestYSize, pnImageTmp, nRequestXSize,
+            nRequestXSize, nRequestYSize, pabyImageTmp, nRequestXSize,
            nRequestYSize, GDT_Byte, 1, nullptr, 1, nBlockXSize, 0, nullptr);
 
        /* iterate over detected values */
@@ -947,7 +933,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
                const int nPixOff = (i * nBlockXSize) + j;
 
                const float digitalValue =
-                    static_cast<float>(pnImageTmp[nPixOff]);
+                    static_cast<float>(pabyImageTmp[nPixOff]);
                const float A =
                    static_cast<float>(m_nfTable[nBlockXOff * nBlockXSize + j]);
                reinterpret_cast<float *>(pImage)[nPixOff] =
@@ -956,7 +942,7 @@ CPLErr RCMCalibRasterBand::IReadBlock(int nBlockXOff, int nBlockYOff, void *pImage)
                    A;
            }
        }
-        CPLFree(pnImageTmp);
+        CPLFree(pabyImageTmp);
    }
    else
    {
@@ -1213,14 +1199,8 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo)
        psSceneAttributes, "imageAttributes.samplesPerLine", "-1"));
    poDS->nRasterYSize = atoi(
        CPLGetXMLValue(psSceneAttributes, "imageAttributes.numLines", "-1"));
-    if (poDS->nRasterXSize <= 1 || poDS->nRasterYSize <= 1)
+    if (!GDALCheckDatasetDimensions(poDS->nRasterXSize, poDS->nRasterYSize))
    {
-        CPLError(
-            CE_Failure, CPLE_OpenFailed,
-            "ERROR: Non-sane raster dimensions provided in product.xml. If "
-            "this is "
-            "a valid RCM scene, please contact your data provider for "
-            "a corrected dataset.");
        return nullptr;
    }
@@ -1582,47 +1562,52 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo)
                CPLGetXMLValue(psIncidenceAngle.get(),
                               "=incidenceAngles.pixelFirstAnglesValue", "0"));
 
-            int stepSize = atoi(CPLGetXMLValue(
+            const int stepSize = atoi(CPLGetXMLValue(
                psIncidenceAngle.get(), "=incidenceAngles.stepSize", "0"));
-
-            int numberOfValues =
+            const int numberOfValues =
                atoi(CPLGetXMLValue(psIncidenceAngle.get(),
                                    "=incidenceAngles.numberOfValues", "0"));
 
-            /* Get the Pixel Per range */
-            int tableSize = abs(stepSize) * abs(numberOfValues);
+            if (!(stepSize == 0 || stepSize == INT_MIN ||
+                  numberOfValues == INT_MIN ||
+                  abs(numberOfValues) > INT_MAX / abs(stepSize)))
+            {
+                /* Get the Pixel Per range */
+                const int tableSize = abs(stepSize) * abs(numberOfValues);
 
-            CPLString angles;
-            // Loop through all nodes with spaces
-            CPLXMLNode *psNextNode =
-                CPLGetXMLNode(psIncidenceAngle.get(), "=incidenceAngles");
+                CPLString angles;
+                // Loop through all nodes with spaces
+                CPLXMLNode *psNextNode =
+                    CPLGetXMLNode(psIncidenceAngle.get(), "=incidenceAngles");
 
-            CPLXMLNode *psNodeInc;
-            for (psNodeInc = psNextNode->psChild; psNodeInc != nullptr;
-                 psNodeInc = psNodeInc->psNext)
-            {
-                if (EQUAL(psNodeInc->pszValue, "angles"))
+                CPLXMLNode *psNodeInc;
+                for (psNodeInc = psNextNode->psChild; psNodeInc != nullptr;
+                     psNodeInc = psNodeInc->psNext)
                {
-                    if (angles.length() > 0)
+                    if (EQUAL(psNodeInc->pszValue, "angles"))
                    {
-                        angles.append(" "); /* separator */
+                        if (angles.length() > 0)
+                        {
+                            angles.append(" "); /* separator */
+                        }
+                        const char *valAngle =
+                            CPLGetXMLValue(psNodeInc, "", "");
+                        angles.append(valAngle);
                    }
-                    const char *valAngle = CPLGetXMLValue(psNodeInc, "", "");
-                    angles.append(valAngle);
                }
-            }
 
-            char **papszAngleList =
-                CSLTokenizeString2(angles, " ", CSLT_HONOURSTRINGS);
+                char **papszAngleList =
+                    CSLTokenizeString2(angles, " ", CSLT_HONOURSTRINGS);
 
-            /* Allocate the right LUT size according to the product range pixel
-             */
-            poDS->m_IncidenceAngleTableSize = tableSize;
-            poDS->m_nfIncidenceAngleTable =
-                InterpolateValues(papszAngleList, tableSize, stepSize,
-                                  numberOfValues, pixelFirstLutValue);
+                /* Allocate the right LUT size according to the product range pixel
+                 */
+                poDS->m_IncidenceAngleTableSize = tableSize;
+                poDS->m_nfIncidenceAngleTable =
+                    InterpolateValues(papszAngleList, tableSize, stepSize,
+                                      numberOfValues, pixelFirstLutValue);
 
-            CSLDestroy(papszAngleList);
+                CSLDestroy(papszAngleList);
+            }
        }
    }
@@ -1962,6 +1947,12 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo)
                    /* we should bomb gracefully... */
                    pszLUT = pszSigma0LUT;
                }
+                if (!pszLUT)
+                {
+                    CPLFree(pszFullname);
+                    CPLError(CE_Failure, CPLE_AppDefined, "LUT missing.");
+                    return nullptr;
+                }
 
                // The variable 'osNoiseLevelsValues' is always the same for a band
                // name, except that the XML contains a different calibration name
@@ -1970,10 +1961,7 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo)
                    // If Complex, always 32 bits
                    RCMCalibRasterBand *poBand = new RCMCalibRasterBand(
                        poDS.get(), pszPole, GDT_Float32, poBandFile.release(),
-                        eCalib, CPLFormFilename(osPath, pszLUT, nullptr),
-                        CPLFormFilename(osPath, osNoiseLevelsValues.c_str(),
-                                        nullptr),
-                        eDataType);
+                        eCalib, pszLUT, osNoiseLevelsValues.c_str(), eDataType);
                    poDS->SetBand(poDS->GetRasterCount() + 1, poBand);
                }
                else
@@ -1981,10 +1969,7 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo)
                    // Whatever the datatype was previously set
                    RCMCalibRasterBand *poBand = new RCMCalibRasterBand(
                        poDS.get(), pszPole, eDataType, poBandFile.release(),
-                        eCalib, CPLFormFilename(osPath, pszLUT, nullptr),
-                        CPLFormFilename(osPath, osNoiseLevelsValues.c_str(),
-                                        nullptr),
-                        eDataType);
+                        eCalib, pszLUT, osNoiseLevelsValues.c_str(), eDataType);
                    poDS->SetBand(poDS->GetRasterCount() + 1, poBand);
                }
            }
@@ -2310,7 +2295,7 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo)
        if (bUseProjInfo)
        {
-            poDS->m_oSRS = oPrj;
+            poDS->m_oSRS = std::move(oPrj);
        }
        else
        {
@@ -2320,7 +2305,7 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo)
        }
    }
 
-    poDS->m_oGCPSRS = oLL;
+    poDS->m_oGCPSRS = std::move(oLL);
 }
 
 /* -------------------------------------------------------------------- */
@@ -2448,33 +2433,25 @@ GDALDataset *RCMDataset::Open(GDALOpenInfo *poOpenInfo)
        case Sigma0:
        {
            osSubdatasetName = szSIGMA0;
-            CPLString pszDescriptionSigma =
-                FormatCalibration(szSIGMA0, osMDFilename.c_str());
-            osDescription = pszDescriptionSigma;
+            osDescription = FormatCalibration(szSIGMA0, osMDFilename.c_str());
        }
        break;
        case Beta0:
        {
            osSubdatasetName = szBETA0;
-            CPLString pszDescriptionBeta =
-                FormatCalibration(szBETA0, osMDFilename.c_str());
-            osDescription = pszDescriptionBeta;
+            osDescription = FormatCalibration(szBETA0, osMDFilename.c_str());
        }
        break;
        case Gamma:
        {
            osSubdatasetName = szGAMMA;
-            CPLString pszDescriptionGamma =
-                FormatCalibration(szGAMMA, osMDFilename.c_str());
-            osDescription = pszDescriptionGamma;
+            osDescription = FormatCalibration(szGAMMA, osMDFilename.c_str());
        }
        break;
        case Uncalib:
        {
            osSubdatasetName = szUNCALIB;
-            CPLString pszDescriptionUncalib =
-                FormatCalibration(szUNCALIB, osMDFilename.c_str());
-            osDescription = pszDescriptionUncalib;
+            osDescription = FormatCalibration(szUNCALIB, osMDFilename.c_str());
        }
        break;
        default:
diff --git
a/gcore/CMakeLists.txt b/gcore/CMakeLists.txt
index 57a114bc3205..0923511caada 100644
--- a/gcore/CMakeLists.txt
+++ b/gcore/CMakeLists.txt
@@ -104,7 +104,10 @@ if (NOT GDAL_AUTOLOAD_PLUGINS)
               PROPERTY COMPILE_DEFINITIONS GDAL_NO_AUTOLOAD)
 endif ()
 
-if (HAVE_SSSE3_AT_COMPILE_TIME)
+if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS)
+  target_compile_definitions(gcore PRIVATE -DHAVE_SSSE3_AT_COMPILE_TIME -DUSE_NEON_OPTIMIZATIONS)
+  target_sources(gcore PRIVATE rasterio_ssse3.cpp)
+elseif (HAVE_SSSE3_AT_COMPILE_TIME)
   target_compile_definitions(gcore PRIVATE -DHAVE_SSSE3_AT_COMPILE_TIME)
   target_sources(gcore PRIVATE rasterio_ssse3.cpp)
   set_property(
@@ -209,6 +212,8 @@ target_public_header(
   gdalsubdatasetinfo.h
   gdal_typetraits.h
   gdal_adbc.h
+  gdal_minmax_element.hpp
+  gdal_priv_templates.hpp # Required by gdal_minmax_element.hpp
 )
 
 set(GDAL_DATA_FILES
diff --git a/gcore/gdal.h b/gcore/gdal.h
index f7a411c393cb..8638f8501b27 100644
--- a/gcore/gdal.h
+++ b/gcore/gdal.h
@@ -1670,6 +1670,10 @@ CPLErr CPL_DLL CPL_STDCALL GDALSetRasterScale(GDALRasterBandH hBand,
 CPLErr CPL_DLL CPL_STDCALL GDALComputeRasterMinMax(GDALRasterBandH hBand,
                                                    int bApproxOK,
                                                    double adfMinMax[2]);
+CPLErr CPL_DLL GDALComputeRasterMinMaxLocation(GDALRasterBandH hBand,
+                                               double *pdfMin, double *pdfMax,
+                                               int *pnMinX, int *pnMinY,
+                                               int *pnMaxX, int *pnMaxY);
 CPLErr CPL_DLL CPL_STDCALL GDALFlushRasterCache(GDALRasterBandH hBand);
 CPLErr CPL_DLL CPL_STDCALL GDALDropRasterCache(GDALRasterBandH hBand);
 CPLErr CPL_DLL CPL_STDCALL GDALGetRasterHistogram(
diff --git a/gcore/gdal_minmax_element.hpp b/gcore/gdal_minmax_element.hpp
new file mode 100644
index 000000000000..9ceb304553b3
--- /dev/null
+++ b/gcore/gdal_minmax_element.hpp
@@ -0,0 +1,1411 @@
+/******************************************************************************
+ * Project:  GDAL Core
+ * Purpose:  Utility functions to find minimum and maximum values in a buffer
+ * Author:   Even Rouault,
+ *
+ ******************************************************************************
+ * Copyright (c) 2024, Even Rouault
+ *
+ * SPDX-License-Identifier: MIT
+ ****************************************************************************/
+
+#ifndef GDAL_MINMAX_ELEMENT_INCLUDED
+#define GDAL_MINMAX_ELEMENT_INCLUDED
+
+// NOTE: This header requires C++17
+
+// This file may be vendored by other applications than GDAL
+// WARNING: if modifying this file, please also update the upstream GDAL version
+// at https://github.com/OSGeo/gdal/blob/master/gcore/gdal_minmax_element.hpp
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+
+#include "gdal.h"
+
+#ifdef GDAL_COMPILATION
+#define GDAL_MINMAXELT_NS gdal
+#elif !defined(GDAL_MINMAXELT_NS)
+#error "Please define the GDAL_MINMAXELT_NS macro to define the namespace"
+#endif
+
+#ifdef USE_NEON_OPTIMIZATIONS
+#include "include_sse2neon.h"
+#define GDAL_MINMAX_ELEMENT_USE_SSE2
+#else
+#if defined(__x86_64) || defined(_M_X64)
+#define GDAL_MINMAX_ELEMENT_USE_SSE2
+#endif
+#ifdef GDAL_MINMAX_ELEMENT_USE_SSE2
+// SSE2 header
+#include <emmintrin.h>
+#ifdef __SSE4_1__
+#include <smmintrin.h>
+#endif
+#endif
+#endif
+
+#include "gdal_priv_templates.hpp"
+
+namespace GDAL_MINMAXELT_NS
+{
+namespace detail
+{
+
+#ifdef GDAL_MINMAX_ELEMENT_USE_SSE2
+/************************************************************************/
+/*                           compScalar()                               */
+/************************************************************************/
+
+template <bool IS_MAX, class T> inline static bool compScalar(T x, T y)
+{
+    if constexpr (IS_MAX)
+        return x > y;
+    else
+        return x < y;
+}
+
+/************************************************************************/
+/*                        extremum_element()                            */
+/************************************************************************/
+
+template <class T, bool IS_MAX>
+size_t extremum_element(const T *v, size_t size, T noDataValue)
+{
+    static_assert(!(std::is_floating_point_v<T>));
+    if (size == 0)
+        return 0;
+    size_t idx_of_extremum = 0;
+    T extremum = v[0];
+    bool extremum_is_nodata = extremum == noDataValue;
+    size_t i = 1;
+    for (; i < size; ++i)
+    {
+        if (v[i] != noDataValue &&
+            (compScalar<IS_MAX>(v[i], extremum) || extremum_is_nodata))
+        {
+            extremum = v[i];
+            idx_of_extremum = i;
+            extremum_is_nodata = false;
+        }
+    }
+    return idx_of_extremum;
+}
+
+/************************************************************************/
+/*                        extremum_element()                            */
+/************************************************************************/
+
+template <class T, bool IS_MAX> size_t extremum_element(const T *v, size_t size)
+{
+    static_assert(!(std::is_floating_point_v<T>));
+    if (size == 0)
+        return 0;
+    size_t idx_of_extremum = 0;
+    T extremum = v[0];
+    size_t i = 1;
+    for (; i < size; ++i)
+    {
+        if (compScalar<IS_MAX>(v[i], extremum))
+        {
+            extremum = v[i];
+            idx_of_extremum = i;
+        }
+    }
+    return idx_of_extremum;
+}
+
+#ifdef GDAL_MINMAX_ELEMENT_USE_SSE2
+
+/************************************************************************/
+/*                    extremum_element_with_nan()                       */
+/************************************************************************/
+
+static inline int8_t Shift8(uint8_t x)
+{
+    return static_cast<int8_t>(x + std::numeric_limits<int8_t>::min());
+}
+
+static inline int16_t Shift16(uint16_t x)
+{
+    return static_cast<int16_t>(x + std::numeric_limits<int16_t>::min());
+}
+
+CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW
+static inline int32_t Shift32(uint32_t x)
+{
+    x += static_cast<uint32_t>(std::numeric_limits<int32_t>::min());
+    int32_t ret;
+    memcpy(&ret, &x, sizeof(x));
+    return ret;
+}
+
+// Return a __m128[i|d] register with all its elements set to x
+template <class T> static inline auto set1(T x)
+{
+    if constexpr (std::is_same_v<T, uint8_t>)
+        return _mm_set1_epi8(Shift8(x));
+    else if constexpr (std::is_same_v<T, int8_t>)
+        return _mm_set1_epi8(x);
+    else if constexpr (std::is_same_v<T, uint16_t>)
+        return _mm_set1_epi16(Shift16(x));
+    else if constexpr (std::is_same_v<T, int16_t>)
+        return _mm_set1_epi16(x);
+    else if constexpr (std::is_same_v<T, uint32_t>)
+        return _mm_set1_epi32(Shift32(x));
+    else if constexpr (std::is_same_v<T, int32_t>)
+        return _mm_set1_epi32(x);
+    else if constexpr (std::is_same_v<T, float>)
+        return _mm_set1_ps(x);
+    else
+        return _mm_set1_pd(x);
+}
+
+// Return a __m128[i|d] register with all its elements set to x
+template <class T> static inline auto set1_unshifted(T x)
+{
+    if constexpr (std::is_same_v<T, uint8_t>)
+    {
+        int8_t xSigned;
+        memcpy(&xSigned, &x, sizeof(xSigned));
+        return _mm_set1_epi8(xSigned);
+    }
+    else if constexpr (std::is_same_v<T, int8_t>)
+        return _mm_set1_epi8(x);
+    else if constexpr (std::is_same_v<T, uint16_t>)
+    {
+        int16_t xSigned;
+        memcpy(&xSigned, &x, sizeof(xSigned));
+        return _mm_set1_epi16(xSigned);
+    }
+    else if constexpr (std::is_same_v<T, int16_t>)
+        return _mm_set1_epi16(x);
+    else if constexpr (std::is_same_v<T, uint32_t>)
+    {
+        int32_t xSigned;
+        memcpy(&xSigned, &x, sizeof(xSigned));
+        return _mm_set1_epi32(xSigned);
+    }
+    else if constexpr (std::is_same_v<T, int32_t>)
+        return _mm_set1_epi32(x);
+    else if constexpr (std::is_same_v<T, float>)
+        return _mm_set1_ps(x);
+    else
+        return _mm_set1_pd(x);
+}
+
+// Load as many values of type T as a __m128[i|d] register can contain from x
+template <class T> static inline auto loadv(const T *x)
+{
+    if constexpr (std::is_same_v<T, float>)
+        return _mm_loadu_ps(x);
+    else if constexpr (std::is_same_v<T, double>)
+        return _mm_loadu_pd(x);
+    else
+        return _mm_loadu_si128(reinterpret_cast<const __m128i *>(x));
+}
+
+// Return a __m128i register with bits set when x[i] < y[i] when !IS_MAX
+// or x[i] > y[i] when IS_MAX
+template <class T, bool IS_MAX, class SSE_T>
+static inline __m128i comp(SSE_T x, SSE_T y)
+{
+    if constexpr (IS_MAX)
+    {
+        if constexpr (std::is_same_v<T, uint8_t>)
+            return _mm_cmpgt_epi8(
+                _mm_add_epi8(x,
+                             _mm_set1_epi8(std::numeric_limits<int8_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int8_t>)
+            return _mm_cmpgt_epi8(x, y);
+        else if constexpr (std::is_same_v<T, uint16_t>)
+            return _mm_cmpgt_epi16(
+                _mm_add_epi16(
+                    x, _mm_set1_epi16(std::numeric_limits<int16_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int16_t>)
+            return _mm_cmpgt_epi16(x, y);
+        else if constexpr (std::is_same_v<T, uint32_t>)
+            return _mm_cmpgt_epi32(
+                _mm_add_epi32(
+                    x, _mm_set1_epi32(std::numeric_limits<int32_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int32_t>)
+            return _mm_cmpgt_epi32(x, y);
+        // We could use _mm_cmpgt_pX() if there were no NaN values
+        else if constexpr (std::is_same_v<T, float>)
+            return _mm_castps_si128(_mm_cmpnle_ps(x, y));
+        else
+            return _mm_castpd_si128(_mm_cmpnle_pd(x, y));
+    }
+    else
+    {
+        if constexpr (std::is_same_v<T, uint8_t>)
+            return _mm_cmplt_epi8(
+                _mm_add_epi8(x,
+                             _mm_set1_epi8(std::numeric_limits<int8_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int8_t>)
+            return _mm_cmplt_epi8(x, y);
+        else if constexpr (std::is_same_v<T, uint16_t>)
+            return _mm_cmplt_epi16(
+                _mm_add_epi16(
+                    x, _mm_set1_epi16(std::numeric_limits<int16_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int16_t>)
+            return _mm_cmplt_epi16(x, y);
+        else if constexpr (std::is_same_v<T, uint32_t>)
+            return _mm_cmplt_epi32(
+                _mm_add_epi32(
+                    x, _mm_set1_epi32(std::numeric_limits<int32_t>::min())),
+                y);
+        else if constexpr (std::is_same_v<T, int32_t>)
+            return _mm_cmplt_epi32(x, y);
+        // We could use _mm_cmplt_pX() if there were no NaN values
+        else if constexpr (std::is_same_v<T, float>)
+            return _mm_castps_si128(_mm_cmpnge_ps(x, y));
+        else
+            return _mm_castpd_si128(_mm_cmpnge_pd(x, y));
+    }
+}
+
+template <class T, class SSE_T> static inline SSE_T compeq(SSE_T a, SSE_T b);
+
+template <> __m128i compeq<uint8_t>(__m128i a, __m128i b)
+{
+    return _mm_cmpeq_epi8(a, b);
+}
+
+template <> __m128i compeq<int8_t>(__m128i a, __m128i b)
+{
+    return _mm_cmpeq_epi8(a, b);
+}
+
+template <> __m128i compeq<uint16_t>(__m128i a, __m128i b)
+{
+    return _mm_cmpeq_epi16(a, b);
+}
+
+template <> __m128i compeq<int16_t>(__m128i a, __m128i b)
+{
+    return _mm_cmpeq_epi16(a, b);
+}
+
+template <> __m128i compeq<uint32_t>(__m128i a, __m128i b)
+{
+    return _mm_cmpeq_epi32(a, b);
+}
+
+template <> __m128i compeq<int32_t>(__m128i a, __m128i b)
+{
+    return _mm_cmpeq_epi32(a, b);
+}
+
+template <> __m128 compeq<float>(__m128 a, __m128 b)
+{
+    return _mm_cmpeq_ps(a, b);
+}
+
+template <> __m128d compeq<double>(__m128d a, __m128d b)
+{
+    return _mm_cmpeq_pd(a, b);
+}
+
+template <class T> static inline T blendv(T a, T b, T mask);
+
+template <> __m128i blendv(__m128i a, __m128i b, __m128i mask)
+{
+#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
+    return _mm_blendv_epi8(a, b, mask);
+#else
+    return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
+#endif
+}
+
+template <> __m128 blendv(__m128 a, __m128 b, __m128 mask)
+{
+#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
+    return _mm_blendv_ps(a, b, mask);
+#else
+    return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b));
+#endif
+}
+
+template <> __m128d blendv(__m128d a, __m128d b, __m128d mask)
+{
+#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS)
+    return _mm_blendv_pd(a, b, mask);
+#else
+    return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, b));
+#endif
+}
+
+// Using SSE2
+template <class T, bool IS_MAX>
+inline size_t extremum_element_with_nan(const T *v, size_t size, T noDataValue)
+{
static_assert(std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_floating_point_v); + if (size == 0) + return 0; + size_t idx_of_extremum = 0; + T extremum = v[0]; + [[maybe_unused]] bool extremum_is_invalid = false; + if constexpr (std::is_floating_point_v) + { + extremum_is_invalid = std::isnan(extremum); + } + if constexpr (HAS_NODATA) + { + if (extremum == noDataValue) + extremum_is_invalid = true; + } + size_t i = 1; + + constexpr size_t VALS_PER_REG = sizeof(set1(extremum)) / sizeof(extremum); + constexpr int LOOP_UNROLLING = 4; + // If changing the value, then we need to adjust the number of sse_valX + // loading in the loop. + static_assert(LOOP_UNROLLING == 4); + constexpr size_t VALS_PER_ITER = VALS_PER_REG * LOOP_UNROLLING; + + const auto update = [v, noDataValue, &extremum, &idx_of_extremum, + &extremum_is_invalid](size_t idx) + { + if constexpr (HAS_NODATA) + { + if (v[idx] == noDataValue) + return; + if (extremum_is_invalid) + { + if constexpr (std::is_floating_point_v) + { + if (std::isnan(v[idx])) + return; + } + extremum = v[idx]; + idx_of_extremum = idx; + extremum_is_invalid = false; + return; + } + } + else + { + CPL_IGNORE_RET_VAL(noDataValue); + } + if (compScalar(v[idx], extremum)) + { + extremum = v[idx]; + idx_of_extremum = idx; + extremum_is_invalid = false; + } + else if constexpr (std::is_floating_point_v) + { + if (extremum_is_invalid && !std::isnan(v[idx])) + { + extremum = v[idx]; + idx_of_extremum = idx; + extremum_is_invalid = false; + } + } + }; + + for (; i < VALS_PER_ITER && i < size; ++i) + { + update(i); + } + + [[maybe_unused]] auto sse_neutral = set1_unshifted(static_cast(0)); + [[maybe_unused]] auto sse_nodata = set1_unshifted(noDataValue); + if constexpr (HAS_NODATA) + { + for (; i < size && extremum_is_invalid; ++i) + { + update(i); + } + if (!extremum_is_invalid) + { + for (; i < size && (i % VALS_PER_ITER) != 0; ++i) + { + update(i); + } + sse_neutral = set1_unshifted(extremum); + } + } + + auto sse_extremum = set1(extremum); + + [[maybe_unused]] size_t hits = 0; + const auto sse_iter_count = (size / VALS_PER_ITER) * VALS_PER_ITER; + for (; i < sse_iter_count; i += VALS_PER_ITER) + { + // A bit of loop unrolling to save 3/4 of slow movemask operations. + auto sse_val0 = loadv(v + i + 0 * VALS_PER_REG); + auto sse_val1 = loadv(v + i + 1 * VALS_PER_REG); + auto sse_val2 = loadv(v + i + 2 * VALS_PER_REG); + auto sse_val3 = loadv(v + i + 3 * VALS_PER_REG); + + if constexpr (HAS_NODATA) + { + // Replace all components that are at the nodata value by a + // neutral value (current minimum) + const auto replaceNoDataByNeutral = + [sse_neutral, sse_nodata](auto sse_val) + { + const auto eq_nodata = compeq(sse_val, sse_nodata); + return blendv(sse_val, sse_neutral, eq_nodata); + }; + + sse_val0 = replaceNoDataByNeutral(sse_val0); + sse_val1 = replaceNoDataByNeutral(sse_val1); + sse_val2 = replaceNoDataByNeutral(sse_val2); + sse_val3 = replaceNoDataByNeutral(sse_val3); + } + + if (_mm_movemask_epi8(_mm_or_si128( + _mm_or_si128(comp(sse_val0, sse_extremum), + comp(sse_val1, sse_extremum)), + _mm_or_si128(comp(sse_val2, sse_extremum), + comp(sse_val3, sse_extremum)))) != 0) + { + if constexpr (!std::is_same_v && + !std::is_same_v) + { + // The above tests excluding int8_t/uint8_t is due to the fact + // with those small ranges of values we will quickly converge + // to the minimum, so no need to do the below "smart" test. 
+ + if (++hits == size / 16) + { + // If we have an almost sorted array, then using this code path + // will hurt performance. Arbitrary give up if we get here + // more than 1. / 16 of the size of the array. + // fprintf(stderr, "going to non-vector path\n"); + break; + } + } + for (size_t j = 0; j < VALS_PER_ITER; j++) + { + update(i + j); + } + sse_extremum = set1(extremum); + if constexpr (HAS_NODATA) + { + sse_neutral = set1_unshifted(extremum); + } + } + } + for (; i < size; ++i) + { + update(i); + } + return idx_of_extremum; +} + +#else + +/************************************************************************/ +/* extremum_element_with_nan() */ +/************************************************************************/ + +template +inline size_t extremum_element_with_nan(const T *v, size_t size, T /* nodata */) +{ + static_assert(!HAS_NODATA); + if (size == 0) + return 0; + size_t idx_of_extremum = 0; + auto extremum = v[0]; + bool extremum_is_nan = std::isnan(extremum); + size_t i = 1; + for (; i < size; ++i) + { + if (compScalar(v[i], extremum) || + (extremum_is_nan && !std::isnan(v[i]))) + { + extremum = v[i]; + idx_of_extremum = i; + extremum_is_nan = false; + } + } + return idx_of_extremum; +} +#endif + +/************************************************************************/ +/* extremum_element() */ +/************************************************************************/ + +#ifdef GDAL_MINMAX_ELEMENT_USE_SSE2 + +template <> +size_t extremum_element(const uint8_t *v, size_t size, + uint8_t noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const uint8_t *v, size_t size, + uint8_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); +} + +template <> +size_t extremum_element(const uint8_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const uint8_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const int8_t *v, size_t size, + int8_t noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const int8_t *v, size_t size, + int8_t noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> size_t extremum_element(const int8_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> size_t extremum_element(const int8_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const uint16_t *v, size_t size, + uint16_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); +} + +template <> +size_t extremum_element(const uint16_t *v, size_t size, + uint16_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); +} + +template <> +size_t extremum_element(const uint16_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const uint16_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const int16_t *v, size_t size, + int16_t noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const int16_t *v, size_t size, + int16_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); +} + +template <> +size_t extremum_element(const int16_t *v, size_t size) +{ + 
return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const int16_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const uint32_t *v, size_t size, + uint32_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); +} + +template <> +size_t extremum_element(const uint32_t *v, size_t size, + uint32_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); +} + +template <> +size_t extremum_element(const uint32_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const uint32_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const int32_t *v, size_t size, + int32_t noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const int32_t *v, size_t size, + int32_t noDataValue) +{ + return extremum_element_with_nan(v, size, + noDataValue); +} + +template <> +size_t extremum_element(const int32_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const int32_t *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> +size_t extremum_element(const float *v, size_t size, + float noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const float *v, size_t size, + float noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const double *v, size_t size, + double noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const double *v, size_t size, + double noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +#endif + +template <> size_t extremum_element(const float *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> size_t extremum_element(const double *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> size_t extremum_element(const float *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +template <> size_t extremum_element(const double *v, size_t size) +{ + return extremum_element_with_nan(v, size, 0); +} + +/************************************************************************/ +/* extremum_element_with_nan() */ +/************************************************************************/ + +template +inline size_t extremum_element_with_nan(const T *v, size_t size, T noDataValue) +{ + if (std::isnan(noDataValue)) + return extremum_element_with_nan(v, size, 0); + if (size == 0) + return 0; + size_t idx_of_extremum = 0; + auto extremum = v[0]; + bool extremum_is_nan_or_nodata = + std::isnan(extremum) || (extremum == noDataValue); + size_t i = 1; + for (; i < size; ++i) + { + if (v[i] != noDataValue && + (compScalar(v[i], extremum) || + (extremum_is_nan_or_nodata && !std::isnan(v[i])))) + { + extremum = v[i]; + idx_of_extremum = i; + extremum_is_nan_or_nodata = false; + } + } + return idx_of_extremum; +} + +/************************************************************************/ +/* extremum_element() */ +/************************************************************************/ + +#if !defined(GDAL_MINMAX_ELEMENT_USE_SSE2) + +template <> +size_t extremum_element(const float *v, size_t size, + float 
noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const float *v, size_t size, + float noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const double *v, size_t size, + double noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +template <> +size_t extremum_element(const double *v, size_t size, + double noDataValue) +{ + return extremum_element_with_nan(v, size, noDataValue); +} + +#endif + +template +inline size_t extremum_element(const T *buffer, size_t size, bool bHasNoData, + T noDataValue) +{ + if (bHasNoData) + return extremum_element(buffer, size, noDataValue); + else + return extremum_element(buffer, size); +} + +#else + +template +inline size_t extremum_element(const T *buffer, size_t size, bool bHasNoData, + T noDataValue) +{ + if (bHasNoData) + { + if constexpr (std::is_floating_point_v) + { + if (std::isnan(noDataValue)) + { + if constexpr (IS_MAX) + { + return std::max_element(buffer, buffer + size, + [](T a, T b) { + return std::isnan(b) ? false + : std::isnan(a) ? true + : a < b; + }) - + buffer; + } + else + { + return std::min_element(buffer, buffer + size, + [](T a, T b) { + return std::isnan(b) ? true + : std::isnan(a) ? false + : a < b; + }) - + buffer; + } + } + else + { + if constexpr (IS_MAX) + { + return std::max_element(buffer, buffer + size, + [noDataValue](T a, T b) + { + return std::isnan(b) ? false + : std::isnan(a) ? true + : (b == noDataValue) + ? false + : (a == noDataValue) + ? true + : a < b; + }) - + buffer; + } + else + { + return std::min_element(buffer, buffer + size, + [noDataValue](T a, T b) + { + return std::isnan(b) ? true + : std::isnan(a) ? false + : (b == noDataValue) + ? true + : (a == noDataValue) + ? false + : a < b; + }) - + buffer; + } + } + } + else + { + if constexpr (IS_MAX) + { + return std::max_element(buffer, buffer + size, + [noDataValue](T a, T b) { + return (b == noDataValue) ? false + : (a == noDataValue) ? true + : a < b; + }) - + buffer; + } + else + { + return std::min_element(buffer, buffer + size, + [noDataValue](T a, T b) { + return (b == noDataValue) ? true + : (a == noDataValue) ? false + : a < b; + }) - + buffer; + } + } + } + else + { + if constexpr (std::is_floating_point_v) + { + if constexpr (IS_MAX) + { + return std::max_element(buffer, buffer + size, + [](T a, T b) { + return std::isnan(b) ? false + : std::isnan(a) ? true + : a < b; + }) - + buffer; + } + else + { + return std::min_element(buffer, buffer + size, + [](T a, T b) { + return std::isnan(b) ? true + : std::isnan(a) ? false + : a < b; + }) - + buffer; + } + } + else + { + if constexpr (IS_MAX) + { + return std::max_element(buffer, buffer + size) - buffer; + } + else + { + return std::min_element(buffer, buffer + size) - buffer; + } + } + } +} +#endif + +template +size_t extremum_element(const void *buffer, size_t nElts, GDALDataType eDT, + bool bHasNoData, double dfNoDataValue) +{ + switch (eDT) + { +#if GDAL_VERSION_NUM >= GDAL_COMPUTE_VERSION(3, 7, 0) + case GDT_Int8: + { + using T = int8_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } +#endif + case GDT_Byte: + { + using T = uint8_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? 
static_cast(dfNoDataValue) : 0); + } + case GDT_Int16: + { + using T = int16_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt16: + { + using T = uint16_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Int32: + { + using T = int32_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt32: + { + using T = uint32_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } +#if GDAL_VERSION_NUM >= GDAL_COMPUTE_VERSION(3, 5, 0) + case GDT_Int64: + { + using T = int64_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt64: + { + using T = uint64_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } +#endif + case GDT_Float32: + { + using T = float; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Float64: + { + using T = double; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return extremum_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + default: + break; + } + CPLError(CE_Failure, CPLE_NotSupported, + "%s not supported for this data type.", __FUNCTION__); + return 0; +} + +} // namespace detail + +/************************************************************************/ +/* max_element() */ +/************************************************************************/ + +/** Return the index of the element where the maximum value is hit. + * + * If it is hit in several locations, it is not specified which one will be + * returned. + * + * @param buffer Vector of nElts elements of type eDT. + * @param nElts Number of elements in buffer. + * @param eDT Data type of the elements of buffer. + * @param bHasNoData Whether dfNoDataValue is valid. + * @param dfNoDataValue Nodata value, only taken into account if bHasNoData == true + * + * @since GDAL 3.11 + */ +inline size_t max_element(const void *buffer, size_t nElts, GDALDataType eDT, + bool bHasNoData, double dfNoDataValue) +{ + return detail::extremum_element(buffer, nElts, eDT, bHasNoData, + dfNoDataValue); +} + +/************************************************************************/ +/* min_element() */ +/************************************************************************/ + +/** Return the index of the element where the minimum value is hit. + * + * If it is hit in several locations, it is not specified which one will be + * returned. + * + * @param buffer Vector of nElts elements of type eDT. + * @param nElts Number of elements in buffer. + * @param eDT Data type of the elements of buffer. + * @param bHasNoData Whether dfNoDataValue is valid. 
+ * @param dfNoDataValue Nodata value, only taken into account if bHasNoData == true + * + * @since GDAL 3.11 + */ +inline size_t min_element(const void *buffer, size_t nElts, GDALDataType eDT, + bool bHasNoData, double dfNoDataValue) +{ + return detail::extremum_element(buffer, nElts, eDT, bHasNoData, + dfNoDataValue); +} + +namespace detail +{ + +#ifdef NOT_EFFICIENT + +/************************************************************************/ +/* minmax_element() */ +/************************************************************************/ + +template +std::pair minmax_element(const T *v, size_t size, T noDataValue) +{ + static_assert(!(std::is_floating_point_v)); + if (size == 0) + return std::pair(0, 0); + size_t idx_of_min = 0; + size_t idx_of_max = 0; + T vmin = v[0]; + T vmax = v[0]; + bool extremum_is_nodata = vmin == noDataValue; + size_t i = 1; + for (; i < size; ++i) + { + if (v[i] != noDataValue && (v[i] < vmin || extremum_is_nodata)) + { + vmin = v[i]; + idx_of_min = i; + extremum_is_nodata = false; + } + if (v[i] != noDataValue && (v[i] > vmax || extremum_is_nodata)) + { + vmax = v[i]; + idx_of_max = i; + extremum_is_nodata = false; + } + } + return std::pair(idx_of_min, idx_of_max); +} + +template +std::pair minmax_element(const T *v, size_t size) +{ + static_assert(!(std::is_floating_point_v)); + if (size == 0) + return std::pair(0, 0); + size_t idx_of_min = 0; + size_t idx_of_max = 0; + T vmin = v[0]; + T vmax = v[0]; + size_t i = 1; + for (; i < size; ++i) + { + if (v[i] < vmin) + { + vmin = v[i]; + idx_of_min = i; + } + if (v[i] > vmax) + { + vmax = v[i]; + idx_of_max = i; + } + } + return std::pair(idx_of_min, idx_of_max); +} + +template +inline std::pair minmax_element_with_nan(const T *v, + size_t size) +{ + if (size == 0) + return std::pair(0, 0); + size_t idx_of_min = 0; + size_t idx_of_max = 0; + T vmin = v[0]; + T vmax = v[0]; + size_t i = 1; + if (std::isnan(v[0])) + { + for (; i < size; ++i) + { + if (!std::isnan(v[i])) + { + vmin = v[i]; + idx_of_min = i; + vmax = v[i]; + idx_of_max = i; + break; + } + } + } + for (; i < size; ++i) + { + if (v[i] < vmin) + { + vmin = v[i]; + idx_of_min = i; + } + if (v[i] > vmax) + { + vmax = v[i]; + idx_of_max = i; + } + } + return std::pair(idx_of_min, idx_of_max); +} + +template <> +std::pair minmax_element(const float *v, size_t size) +{ + return minmax_element_with_nan(v, size); +} + +template <> +std::pair minmax_element(const double *v, size_t size) +{ + return minmax_element_with_nan(v, size); +} + +template +inline std::pair minmax_element(const T *buffer, size_t size, + bool bHasNoData, T noDataValue) +{ + if (bHasNoData) + { + return minmax_element(buffer, size, noDataValue); + } + else + { + return minmax_element(buffer, size); + } +} +#else + +/************************************************************************/ +/* minmax_element() */ +/************************************************************************/ + +template +inline std::pair minmax_element(const T *buffer, size_t size, + bool bHasNoData, T noDataValue) +{ +#ifdef NOT_EFFICIENT + if (bHasNoData) + { + return minmax_element(buffer, size, noDataValue); + } + else + { + return minmax_element(buffer, size); + //auto [imin, imax] = std::minmax_element(buffer, buffer + size); + //return std::pair(imin - buffer, imax - buffer); + } +#else + +#if !defined(GDAL_MINMAX_ELEMENT_USE_SSE2) + if constexpr (!std::is_floating_point_v) + { + if (!bHasNoData) + { + auto [min_iter, max_iter] = + std::minmax_element(buffer, buffer + size); + return 
std::pair(min_iter - buffer, max_iter - buffer); + } + } +#endif + + // Using separately min and max is more efficient than computing them + // within the same loop + return std::pair( + extremum_element(buffer, size, bHasNoData, noDataValue), + extremum_element(buffer, size, bHasNoData, noDataValue)); + +#endif +} +#endif + +} // namespace detail + +/************************************************************************/ +/* minmax_element() */ +/************************************************************************/ + +/** Return the index of the elements where the minimum and maximum values are hit. + * + * If they are hit in several locations, it is not specified which one will be + * returned (contrary to std::minmax_element). + * + * @param buffer Vector of nElts elements of type eDT. + * @param nElts Number of elements in buffer. + * @param eDT Data type of the elements of buffer. + * @param bHasNoData Whether dfNoDataValue is valid. + * @param dfNoDataValue Nodata value, only taken into account if bHasNoData == true + * + * @since GDAL 3.11 + */ +inline std::pair minmax_element(const void *buffer, + size_t nElts, GDALDataType eDT, + bool bHasNoData, + double dfNoDataValue) +{ + switch (eDT) + { +#if GDAL_VERSION_NUM >= GDAL_COMPUTE_VERSION(3, 7, 0) + case GDT_Int8: + { + using T = int8_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } +#endif + case GDT_Byte: + { + using T = uint8_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Int16: + { + using T = int16_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt16: + { + using T = uint16_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_Int32: + { + using T = int32_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt32: + { + using T = uint32_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } +#if GDAL_VERSION_NUM >= GDAL_COMPUTE_VERSION(3, 5, 0) + case GDT_Int64: + { + using T = int64_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + case GDT_UInt64: + { + using T = uint64_t; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } +#endif + case GDT_Float32: + { + using T = float; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? 
static_cast(dfNoDataValue) : 0); + } + case GDT_Float64: + { + using T = double; + bHasNoData = bHasNoData && GDALIsValueExactAs(dfNoDataValue); + return detail::minmax_element( + static_cast(buffer), nElts, bHasNoData, + bHasNoData ? static_cast(dfNoDataValue) : 0); + } + default: + break; + } + CPLError(CE_Failure, CPLE_NotSupported, + "%s not supported for this data type.", __FUNCTION__); + return std::pair(0, 0); +} + +} // namespace GDAL_MINMAXELT_NS + +#endif // GDAL_MINMAX_ELEMENT_INCLUDED diff --git a/gcore/gdal_priv.h b/gcore/gdal_priv.h index 20dcc3734cf3..c615a3a90319 100644 --- a/gcore/gdal_priv.h +++ b/gcore/gdal_priv.h @@ -1810,6 +1810,9 @@ class CPL_DLL GDALRasterBand : public GDALMajorObject virtual CPLErr SetStatistics(double dfMin, double dfMax, double dfMean, double dfStdDev); virtual CPLErr ComputeRasterMinMax(int bApproxOK, double *adfMinMax); + virtual CPLErr ComputeRasterMinMaxLocation(double *pdfMin, double *pdfMax, + int *pnMinX, int *pnMinY, + int *pnMaxX, int *pnMaxY); // Only defined when Doxygen enabled #ifdef DOXYGEN_SKIP diff --git a/gcore/gdal_priv_templates.hpp b/gcore/gdal_priv_templates.hpp index 3c20c055687b..cb1631485cf1 100644 --- a/gcore/gdal_priv_templates.hpp +++ b/gcore/gdal_priv_templates.hpp @@ -585,9 +585,14 @@ inline void GDALCopy8Words(const Tin *pValueIn, Tout *const pValueOut) } // Needs SSE2 -#if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) +#if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) || \ + defined(USE_NEON_OPTIMIZATIONS) +#ifdef USE_NEON_OPTIMIZATIONS +#include "include_sse2neon.h" +#else #include +#endif static inline void GDALCopyXMMToInt32(const __m128i xmm, void *pDest) { diff --git a/gcore/gdalrasterband.cpp b/gcore/gdalrasterband.cpp index bf2da7a2893f..87c814efdedf 100644 --- a/gcore/gdalrasterband.cpp +++ b/gcore/gdalrasterband.cpp @@ -38,6 +38,7 @@ #include "gdal_rat.h" #include "gdal_priv_templates.hpp" #include "gdal_interpolateatpoint.h" +#include "gdal_minmax_element.hpp" /************************************************************************/ /* GDALRasterBand() */ @@ -7415,6 +7416,288 @@ CPLErr CPL_STDCALL GDALComputeRasterMinMax(GDALRasterBandH hBand, int bApproxOK, return poBand->ComputeRasterMinMax(bApproxOK, adfMinMax); } +/************************************************************************/ +/* ComputeRasterMinMaxLocation() */ +/************************************************************************/ + +/** + * \brief Compute the min/max values for a band, and their location. + * + * Pixels whose value matches the nodata value or are masked by the mask + * band are ignored. + * + * If the minimum or maximum value is hit in several locations, it is not + * specified which one will be returned. + * + * @param[out] pdfMin Pointer to the minimum value. + * @param[out] pdfMax Pointer to the maximum value. + * @param[out] pnMinX Pointer to the column where the minimum value is hit. + * @param[out] pnMinY Pointer to the line where the minimum value is hit. + * @param[out] pnMaxX Pointer to the column where the maximum value is hit. + * @param[out] pnMaxY Pointer to the line where the maximum value is hit. + * + * @return CE_None in case of success, CE_Warning if there are no valid values, + * CE_Failure in case of error. 
+ *
+ * @since GDAL 3.11
+ */
+
+CPLErr GDALRasterBand::ComputeRasterMinMaxLocation(double *pdfMin,
+                                                   double *pdfMax, int *pnMinX,
+                                                   int *pnMinY, int *pnMaxX,
+                                                   int *pnMaxY)
+{
+    int nMinX = -1;
+    int nMinY = -1;
+    int nMaxX = -1;
+    int nMaxY = -1;
+    double dfMin = std::numeric_limits<double>::infinity();
+    double dfMax = -std::numeric_limits<double>::infinity();
+    if (pdfMin)
+        *pdfMin = dfMin;
+    if (pdfMax)
+        *pdfMax = dfMax;
+    if (pnMinX)
+        *pnMinX = nMinX;
+    if (pnMinY)
+        *pnMinY = nMinY;
+    if (pnMaxX)
+        *pnMaxX = nMaxX;
+    if (pnMaxY)
+        *pnMaxY = nMaxY;
+
+    if (GDALDataTypeIsComplex(eDataType))
+    {
+        CPLError(CE_Failure, CPLE_NotSupported,
+                 "Complex data type not supported");
+        return CE_Failure;
+    }
+
+    if (!InitBlockInfo())
+        return CE_Failure;
+
+    int bGotNoDataValue = FALSE;
+    const double dfNoDataValue = GetNoDataValue(&bGotNoDataValue);
+    bGotNoDataValue = bGotNoDataValue && !std::isnan(dfNoDataValue);
+    bool bGotFloatNoDataValue = false;
+    float fNoDataValue = 0.0f;
+    ComputeFloatNoDataValue(eDataType, dfNoDataValue, bGotNoDataValue,
+                            fNoDataValue, bGotFloatNoDataValue);
+
+    GDALRasterBand *poMaskBand = nullptr;
+    if (!bGotNoDataValue)
+    {
+        const int l_nMaskFlags = GetMaskFlags();
+        if (l_nMaskFlags != GMF_ALL_VALID && l_nMaskFlags != GMF_NODATA &&
+            GetColorInterpretation() != GCI_AlphaBand)
+        {
+            poMaskBand = GetMaskBand();
+        }
+    }
+
+    bool bSignedByte = false;
+    if (eDataType == GDT_Byte)
+    {
+        EnablePixelTypeSignedByteWarning(false);
+        const char *pszPixelType =
+            GetMetadataItem("PIXELTYPE", "IMAGE_STRUCTURE");
+        EnablePixelTypeSignedByteWarning(true);
+        bSignedByte =
+            pszPixelType != nullptr && EQUAL(pszPixelType, "SIGNEDBYTE");
+    }
+
+    GByte *pabyMaskData = nullptr;
+    if (poMaskBand)
+    {
+        pabyMaskData =
+            static_cast<GByte *>(VSI_MALLOC2_VERBOSE(nBlockXSize, nBlockYSize));
+        if (!pabyMaskData)
+        {
+            return CE_Failure;
+        }
+    }
+
+    const GIntBig nTotalBlocks =
+        static_cast<GIntBig>(nBlocksPerRow) * nBlocksPerColumn;
+    bool bNeedsMin = pdfMin || pnMinX || pnMinY;
+    bool bNeedsMax = pdfMax || pnMaxX || pnMaxY;
+    for (GIntBig iBlock = 0; iBlock < nTotalBlocks; ++iBlock)
+    {
+        const int iYBlock = static_cast<int>(iBlock / nBlocksPerRow);
+        const int iXBlock = static_cast<int>(iBlock % nBlocksPerRow);
+
+        GDALRasterBlock *poBlock = GetLockedBlockRef(iXBlock, iYBlock);
+        if (poBlock == nullptr)
+        {
+            CPLFree(pabyMaskData);
+            return CE_Failure;
+        }
+
+        void *const pData = poBlock->GetDataRef();
+
+        int nXCheck = 0, nYCheck = 0;
+        GetActualBlockSize(iXBlock, iYBlock, &nXCheck, &nYCheck);
+
+        if (poMaskBand &&
+            poMaskBand->RasterIO(GF_Read, iXBlock * nBlockXSize,
+                                 iYBlock * nBlockYSize, nXCheck, nYCheck,
+                                 pabyMaskData, nXCheck, nYCheck, GDT_Byte, 0,
+                                 nBlockXSize, nullptr) != CE_None)
+        {
+            poBlock->DropLock();
+            CPLFree(pabyMaskData);
+            return CE_Failure;
+        }
+
+        if (poMaskBand || nYCheck < nBlockYSize || nXCheck < nBlockXSize)
+        {
+            for (int iY = 0; iY < nYCheck; ++iY)
+            {
+                for (int iX = 0; iX < nXCheck; ++iX)
+                {
+                    const GPtrDiff_t iOffset =
+                        iX + static_cast<GPtrDiff_t>(iY) * nBlockXSize;
+                    if (pabyMaskData && pabyMaskData[iOffset] == 0)
+                        continue;
+                    bool bValid = true;
+                    double dfValue = GetPixelValue(
+                        eDataType, bSignedByte, pData, iOffset, bGotNoDataValue,
+                        dfNoDataValue, bGotFloatNoDataValue, fNoDataValue,
+                        bValid);
+                    if (!bValid)
+                        continue;
+                    if (dfValue < dfMin)
+                    {
+                        dfMin = dfValue;
+                        nMinX = iXBlock * nBlockXSize + iX;
+                        nMinY = iYBlock * nBlockYSize + iY;
+                    }
+                    if (dfValue > dfMax)
+                    {
+                        dfMax = dfValue;
+                        nMaxX = iXBlock * nBlockXSize + iX;
+                        nMaxY = iYBlock * nBlockYSize + iY;
+                    }
+                }
+            }
+        }
+        else
+        {
+            size_t pos_min = 0;
+            size_t pos_max = 0;
+            const auto eEffectiveDT = bSignedByte ? GDT_Int8 : eDataType;
+            if (bNeedsMin && bNeedsMax)
+            {
+                std::tie(pos_min, pos_max) = gdal::minmax_element(
+                    pData, static_cast<size_t>(nBlockXSize) * nBlockYSize,
+                    eEffectiveDT, bGotNoDataValue, dfNoDataValue);
+            }
+            else if (bNeedsMin)
+            {
+                pos_min = gdal::min_element(
+                    pData, static_cast<size_t>(nBlockXSize) * nBlockYSize,
+                    eEffectiveDT, bGotNoDataValue, dfNoDataValue);
+            }
+            else if (bNeedsMax)
+            {
+                pos_max = gdal::max_element(
+                    pData, static_cast<size_t>(nBlockXSize) * nBlockYSize,
+                    eEffectiveDT, bGotNoDataValue, dfNoDataValue);
+            }
+
+            if (bNeedsMin)
+            {
+                const int nMinXBlock = static_cast<int>(pos_min % nBlockXSize);
+                const int nMinYBlock = static_cast<int>(pos_min / nBlockXSize);
+                bool bValid = true;
+                const double dfMinValueBlock = GetPixelValue(
+                    eDataType, bSignedByte, pData, pos_min, bGotNoDataValue,
+                    dfNoDataValue, bGotFloatNoDataValue, fNoDataValue, bValid);
+                if (bValid && dfMinValueBlock < dfMin)
+                {
+                    dfMin = dfMinValueBlock;
+                    nMinX = iXBlock * nBlockXSize + nMinXBlock;
+                    nMinY = iYBlock * nBlockYSize + nMinYBlock;
+                }
+            }
+
+            if (bNeedsMax)
+            {
+                const int nMaxXBlock = static_cast<int>(pos_max % nBlockXSize);
+                const int nMaxYBlock = static_cast<int>(pos_max / nBlockXSize);
+                bool bValid = true;
+                const double dfMaxValueBlock = GetPixelValue(
+                    eDataType, bSignedByte, pData, pos_max, bGotNoDataValue,
+                    dfNoDataValue, bGotFloatNoDataValue, fNoDataValue, bValid);
+                if (bValid && dfMaxValueBlock > dfMax)
+                {
+                    dfMax = dfMaxValueBlock;
+                    nMaxX = iXBlock * nBlockXSize + nMaxXBlock;
+                    nMaxY = iYBlock * nBlockYSize + nMaxYBlock;
+                }
+            }
+        }
+
+        poBlock->DropLock();
+
+        if (eDataType == GDT_Byte)
+        {
+            if (bNeedsMin && dfMin == 0)
+            {
+                bNeedsMin = false;
+            }
+            if (bNeedsMax && dfMax == 255)
+            {
+                bNeedsMax = false;
+            }
+            if (!bNeedsMin && !bNeedsMax)
+            {
+                break;
+            }
+        }
+    }
+
+    CPLFree(pabyMaskData);
+
+    if (pdfMin)
+        *pdfMin = dfMin;
+    if (pdfMax)
+        *pdfMax = dfMax;
+    if (pnMinX)
+        *pnMinX = nMinX;
+    if (pnMinY)
+        *pnMinY = nMinY;
+    if (pnMaxX)
+        *pnMaxX = nMaxX;
+    if (pnMaxY)
+        *pnMaxY = nMaxY;
+    return ((bNeedsMin && nMinX < 0) || (bNeedsMax && nMaxX < 0)) ? CE_Warning
+                                                                  : CE_None;
+}
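Full block interiors take the fast path above: the whole block is handed to the new header as one flat buffer, and only the winning index is re-read through GetPixelValue(). A minimal standalone sketch of that header API, assuming compilation within GDAL where GDAL_MINMAXELT_NS resolves to gdal (vendoring applications define their own namespace):

    #include "gdal_minmax_element.hpp"

    #include <cstdio>
    #include <vector>

    int main()
    {
        const std::vector<float> v{3.f, 1.f, 4.f, 1.f, 5.f, 9.f, 2.f, 6.f};
        // bHasNoData == false: every element takes part in the search.
        const auto [idxMin, idxMax] = gdal::minmax_element(
            v.data(), v.size(), GDT_Float32, /*bHasNoData=*/false,
            /*dfNoDataValue=*/0.0);
        std::printf("min %g at index %u, max %g at index %u\n", v[idxMin],
                    static_cast<unsigned>(idxMin), v[idxMax],
                    static_cast<unsigned>(idxMax));
        return 0;
    }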
+
+/************************************************************************/
+/*                  GDALComputeRasterMinMaxLocation()                   */
+/************************************************************************/
+
+/**
+ * \brief Compute the min/max values for a band, and their location.
+ *
+ * @see GDALRasterBand::ComputeRasterMinMaxLocation()
+ * @since GDAL 3.11
+ */
+
+CPLErr GDALComputeRasterMinMaxLocation(GDALRasterBandH hBand, double *pdfMin,
+                                       double *pdfMax, int *pnMinX, int *pnMinY,
+                                       int *pnMaxX, int *pnMaxY)
+
+{
+    VALIDATE_POINTER1(hBand, "GDALComputeRasterMinMaxLocation", CE_Failure);
+
+    GDALRasterBand *poBand = GDALRasterBand::FromHandle(hBand);
+    return poBand->ComputeRasterMinMaxLocation(pdfMin, pdfMax, pnMinX, pnMinY,
+                                               pnMaxX, pnMaxY);
+}
+
 /************************************************************************/
 /*                        SetDefaultHistogram()                         */
 /************************************************************************/
diff --git a/gcore/gdalsse_priv.h b/gcore/gdalsse_priv.h
index 3c7ec7ba8cdd..ade33367ee55 100644
--- a/gcore/gdalsse_priv.h
+++ b/gcore/gdalsse_priv.h
@@ -23,13 +23,18 @@
 #if (defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)) && \
     !defined(USE_SSE2_EMULATION)
 
+#include <string.h>
+
+#ifdef USE_NEON_OPTIMIZATIONS
+#include "include_sse2neon.h"
+#else
 /* Requires SSE2 */
 #include <emmintrin.h>
-#include <string.h>
 
 #ifdef __SSE4_1__
 #include <smmintrin.h>
 #endif
+#endif
 
 #include "gdal_priv_templates.hpp"
diff --git a/gcore/include_sse2neon.h b/gcore/include_sse2neon.h
new file mode 100644
index 000000000000..fd1dbb927cb8
--- /dev/null
+++ b/gcore/include_sse2neon.h
@@ -0,0 +1,32 @@
+/******************************************************************************
+ *
+ * Project:  GDAL
+ * Purpose:  Includes sse2neon.h headers
+ * Author:   Even Rouault <even dot rouault at spatialys.com>
+ *
+ ******************************************************************************
+ * Copyright (c) 2024, Even Rouault <even dot rouault at spatialys.com>
+ *
+ * SPDX-License-Identifier: MIT
+ *****************************************************************************/
+
+#ifndef INCLUDE_SSE2NEON_H
+#define INCLUDE_SSE2NEON_H
+
+#if defined(__GNUC__)
+#pragma GCC system_header
+#endif
+
+// This check is done in sse2neon.h just as a warning. Turn that into an
+// error, so that gdal.cmake doesn't try to use it.
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10
+#error "sse2neon.h: GCC versions earlier than 10 are not supported."
+#endif
+
+#include "sse2neon.h"
+
+#ifndef _MM_SHUFFLE2
+#define _MM_SHUFFLE2(fp1, fp0) (((fp1) << 1) | (fp0))
+#endif
+
+#endif /* INCLUDE_SSE2NEON_H */
diff --git a/gcore/overview.cpp b/gcore/overview.cpp
index 5867ac11b04f..846c89a91e4e 100644
--- a/gcore/overview.cpp
+++ b/gcore/overview.cpp
@@ -36,9 +36,15 @@
 #include "gdal_thread_pool.h"
 #include "gdalwarper.h"
 
+#ifdef USE_NEON_OPTIMIZATIONS
+#include "include_sse2neon.h"
+#define USE_SSE2
+
+#include "gdalsse_priv.h"
+
 // Restrict to 64bit processors because they are guaranteed to have SSE2,
 // or if __AVX2__ is defined.
-#if defined(__x86_64) || defined(_M_X64) || defined(__AVX2__) +#elif defined(__x86_64) || defined(_M_X64) || defined(__AVX2__) #define USE_SSE2 #include "gdalsse_priv.h" @@ -335,7 +341,7 @@ inline GUInt16 ComputeIntegerRMS_4values(double sumSquares) /* QuadraticMeanByteSSE2OrAVX2() */ /************************************************************************/ -#ifdef __SSE4_1__ +#if defined(__SSE4_1__) || defined(USE_NEON_OPTIMIZATIONS) #define sse2_packus_epi32 _mm_packus_epi32 #else inline __m128i sse2_packus_epi32(__m128i a, __m128i b) @@ -350,7 +356,7 @@ inline __m128i sse2_packus_epi32(__m128i a, __m128i b) } #endif -#ifdef __SSSE3__ +#if defined(__SSSE3__) || defined(USE_NEON_OPTIMIZATIONS) #define sse2_hadd_epi16 _mm_hadd_epi16 #else inline __m128i sse2_hadd_epi16(__m128i a, __m128i b) diff --git a/gcore/rasterio.cpp b/gcore/rasterio.cpp index 880e851204b9..c46df08b9b88 100644 --- a/gcore/rasterio.cpp +++ b/gcore/rasterio.cpp @@ -41,6 +41,18 @@ #include "memdataset.h" #include "vrtdataset.h" +#if defined(__x86_64) || defined(_M_X64) +#include +#define HAVE_SSE2 +#elif defined(USE_NEON_OPTIMIZATIONS) +#include "include_sse2neon.h" +#define HAVE_SSE2 +#endif + +#ifdef HAVE_SSSE3_AT_COMPILE_TIME +#include "rasterio_ssse3.h" +#endif + static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData, int nSrcPixelStride, GByte *CPL_RESTRICT pDstData, int nDstPixelStride, GPtrDiff_t nWordCount); @@ -2217,9 +2229,7 @@ static void inline GDALCopyWordsT_8atatime( } } -#if defined(__x86_64) || defined(_M_X64) - -#include +#ifdef HAVE_SSE2 template void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData, @@ -2630,7 +2640,7 @@ void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData, nDstPixelStride, nWordCount); } -#endif // defined(__x86_64) || defined(_M_X64) +#endif // HAVE_SSE2 template <> void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData, @@ -3068,13 +3078,7 @@ static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest, GDALUnrolledCopyGeneric(pDest, pSrc, nIters); } -#if (defined(__x86_64) || defined(_M_X64)) - -#ifdef HAVE_SSSE3_AT_COMPILE_TIME - -#include "rasterio_ssse3.h" - -#endif +#ifdef HAVE_SSE2 template <> void GDALUnrolledCopy(GByte *CPL_RESTRICT pDest, @@ -3175,7 +3179,7 @@ void GDALUnrolledCopy(GByte *CPL_RESTRICT pDest, pSrc += 4; } } -#endif // defined(__x86_64) || defined(_M_X64) +#endif // HAVE_SSE2 /************************************************************************/ /* GDALFastCopy() */ @@ -5299,13 +5303,7 @@ bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue, return false; } -#if defined(__x86_64) || defined(_M_X64) - -#include - -#ifdef HAVE_SSSE3_AT_COMPILE_TIME -#include "rasterio_ssse3.h" -#endif +#ifdef HAVE_SSE2 /************************************************************************/ /* GDALDeinterleave3Byte() */ @@ -5319,6 +5317,12 @@ GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0, GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, size_t nIters) +#ifdef USE_NEON_OPTIMIZATIONS +{ + return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2, + nIters); +} +#else { #ifdef HAVE_SSSE3_AT_COMPILE_TIME if (CPLHaveRuntimeSSSE3()) @@ -5366,6 +5370,7 @@ GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc, pabyDest2[i] = pabySrc[3 * i + 2]; } } +#endif /************************************************************************/ /* GDALDeinterleave4Byte() */ @@ -5421,6 +5426,12 @@ static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc, 
GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, GByte *CPL_RESTRICT pabyDest3, size_t nIters) +#ifdef USE_NEON_OPTIMIZATIONS +{ + return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2, + pabyDest3, nIters); +} +#else { #ifdef HAVE_SSSE3_AT_COMPILE_TIME if (CPLHaveRuntimeSSSE3()) @@ -5469,6 +5480,7 @@ static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc, pabyDest3[i] = pabySrc[4 * i + 3]; } } +#endif #else // GCC autovectorizer does an excellent job __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte( @@ -5596,8 +5608,7 @@ void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT, } #if ((defined(__GNUC__) && !defined(__clang__)) || \ defined(__INTEL_CLANG_COMPILER)) && \ - (defined(__x86_64) || defined(_M_X64)) && \ - defined(HAVE_SSSE3_AT_COMPILE_TIME) + defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME) else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) && CPLHaveRuntimeSSSE3()) { diff --git a/gcore/rasterio_ssse3.cpp b/gcore/rasterio_ssse3.cpp index 7b770f6030b5..fa9cd6ab24e4 100644 --- a/gcore/rasterio_ssse3.cpp +++ b/gcore/rasterio_ssse3.cpp @@ -12,12 +12,18 @@ #include "cpl_port.h" -#if defined(HAVE_SSSE3_AT_COMPILE_TIME) && \ - (defined(__x86_64) || defined(_M_X64)) +#if (defined(HAVE_SSSE3_AT_COMPILE_TIME) && \ + (defined(__x86_64) || defined(_M_X64))) || \ + defined(USE_NEON_OPTIMIZATIONS) #include "rasterio_ssse3.h" +#ifdef USE_NEON_OPTIMIZATIONS +#include "include_sse2neon.h" +#else #include +#endif + #include "gdal_priv_templates.hpp" void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest, diff --git a/gcore/rasterio_ssse3.h b/gcore/rasterio_ssse3.h index 57f5d556fd10..ac20e45c6c57 100644 --- a/gcore/rasterio_ssse3.h +++ b/gcore/rasterio_ssse3.h @@ -16,7 +16,7 @@ #include "cpl_port.h" #if defined(HAVE_SSSE3_AT_COMPILE_TIME) && \ - (defined(__x86_64) || defined(_M_X64)) + (defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS)) void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest, const GByte *CPL_RESTRICT pSrc, diff --git a/gcore/sse2neon.h b/gcore/sse2neon.h new file mode 100644 index 000000000000..7754a2dc574b --- /dev/null +++ b/gcore/sse2neon.h @@ -0,0 +1,9402 @@ +#ifndef SSE2NEON_H +#define SSE2NEON_H + +/* + * sse2neon is freely redistributable under the MIT License. + * + * Copyright (c) 2015-2024 SSE2NEON Contributors. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +// This header file provides a simple API translation layer +// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions +// +// Contributors to this work are: +// John W. Ratcliff +// Brandon Rowlett +// Ken Fast +// Eric van Beurden +// Alexander Potylitsin +// Hasindu Gamaarachchi +// Jim Huang +// Mark Cheng +// Malcolm James MacLeod +// Devin Hussey (easyaspi314) +// Sebastian Pop +// Developer Ecosystem Engineering +// Danila Kutenin +// François Turban (JishinMaster) +// Pei-Hsuan Hung +// Yang-Hao Yuan +// Syoyo Fujita +// Brecht Van Lommel +// Jonathan Hue +// Cuda Chen +// Aymen Qader +// Anthony Roberts + +/* Tunable configurations */ + +/* Enable precise implementation of math operations + * This would slow down the computation a bit, but gives consistent result with + * x86 SSE. (e.g. would solve a hole or NaN pixel in the rendering result) + */ +/* _mm_min|max_ps|ss|pd|sd */ +#ifndef SSE2NEON_PRECISE_MINMAX +#define SSE2NEON_PRECISE_MINMAX (0) +#endif +/* _mm_rcp_ps */ +#ifndef SSE2NEON_PRECISE_DIV +#define SSE2NEON_PRECISE_DIV (0) +#endif +/* _mm_sqrt_ps and _mm_rsqrt_ps */ +#ifndef SSE2NEON_PRECISE_SQRT +#define SSE2NEON_PRECISE_SQRT (0) +#endif +/* _mm_dp_pd */ +#ifndef SSE2NEON_PRECISE_DP +#define SSE2NEON_PRECISE_DP (0) +#endif + +/* Enable inclusion of windows.h on MSVC platforms + * This makes _mm_clflush functional on windows, as there is no builtin. + */ +#ifndef SSE2NEON_INCLUDE_WINDOWS_H +#define SSE2NEON_INCLUDE_WINDOWS_H (0) +#endif + +/* compiler specific definitions */ +#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#define _sse2neon_likely(x) __builtin_expect(!!(x), 1) +#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0) +#elif defined(_MSC_VER) +#if _MSVC_TRADITIONAL +#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead. +#endif +#ifndef FORCE_INLINE +#define FORCE_INLINE static inline +#endif +#ifndef ALIGN_STRUCT +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif +#define _sse2neon_likely(x) (x) +#define _sse2neon_unlikely(x) (x) +#else +#pragma message("Macro name collisions may happen with unsupported compilers.") +#endif + +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10 +#warning "GCC versions earlier than 10 are not supported." +#endif + +// Disabled by GDAL to avoid issues with -Werror +#if 0 +#ifdef __OPTIMIZE__ +#warning \ + "Report any potential compiler optimization issues when using SSE2NEON. See the 'Optimization' section at https://github.com/DLTcollab/sse2neon." +#endif +#endif + +/* C language does not allow initializing a variable with a function call. */ +#ifdef __cplusplus +#define _sse2neon_const static const +#else +#define _sse2neon_const const +#endif + +#include +#include +#include +#include + +FORCE_INLINE double sse2neon_recast_u64_f64(uint64_t u64) +{ + double f64; + memcpy(&f64, &u64, sizeof(uint64_t)); + return f64; +} +FORCE_INLINE int64_t sse2neon_recast_f64_s64(double f64) +{ + int64_t i64; + memcpy(&i64, &f64, sizeof(uint64_t)); + return i64; +} + +#if defined(_WIN32) +/* Definitions for _mm_{malloc,free} are provided by + * from both MinGW-w64 and MSVC. 
+ */ +#define SSE2NEON_ALLOC_DEFINED +#endif + +/* If using MSVC */ +#ifdef _MSC_VER +#include +#if SSE2NEON_INCLUDE_WINDOWS_H +#include +#include +#endif + +#if !defined(__cplusplus) +#error SSE2NEON only supports C++ compilation with this compiler +#endif + +#ifdef SSE2NEON_ALLOC_DEFINED +#include +#endif + +#if (defined(_M_AMD64) || defined(__x86_64__)) || \ + (defined(_M_ARM64) || defined(__arm64__)) +#define SSE2NEON_HAS_BITSCAN64 +#endif +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define _sse2neon_define0(type, s, body) \ + __extension__({ \ + type _a = (s); \ + body \ + }) +#define _sse2neon_define1(type, s, body) \ + __extension__({ \ + type _a = (s); \ + body \ + }) +#define _sse2neon_define2(type, a, b, body) \ + __extension__({ \ + type _a = (a), _b = (b); \ + body \ + }) +#define _sse2neon_return(ret) (ret) +#else +#define _sse2neon_define0(type, a, body) [=](type _a) { body }(a) +#define _sse2neon_define1(type, a, body) [](type _a) { body }(a) +#define _sse2neon_define2(type, a, b, body) \ + [](type _a, type _b) { body }((a), (b)) +#define _sse2neon_return(ret) return ret +#endif + +#define _sse2neon_init(...) \ + { \ + __VA_ARGS__ \ + } + +/* Compiler barrier */ +#if defined(_MSC_VER) && !defined(__clang__) +#define SSE2NEON_BARRIER() _ReadWriteBarrier() +#else +#define SSE2NEON_BARRIER() \ + do { \ + __asm__ __volatile__("" ::: "memory"); \ + (void) 0; \ + } while (0) +#endif + +/* Memory barriers + * __atomic_thread_fence does not include a compiler barrier; instead, + * the barrier is part of __atomic_load/__atomic_store's "volatile-like" + * semantics. + */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) +#include +#endif + +FORCE_INLINE void _sse2neon_smp_mb(void) +{ + SSE2NEON_BARRIER(); +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ + !defined(__STDC_NO_ATOMICS__) + atomic_thread_fence(memory_order_seq_cst); +#elif defined(__GNUC__) || defined(__clang__) + __atomic_thread_fence(__ATOMIC_SEQ_CST); +#else /* MSVC */ + __dmb(_ARM64_BARRIER_ISH); +#endif +} + +/* Architecture-specific build options */ +/* FIXME: #pragma GCC push_options is only available on GCC */ +#if defined(__GNUC__) +#if defined(__arm__) && __ARM_ARCH == 7 +/* According to ARM C Language Extensions Architecture specification, + * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) + * architecture supported. + */ +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." +#endif +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("fpu=neon") +#endif +#elif defined(__aarch64__) || defined(_M_ARM64) +#if !defined(__clang__) && !defined(_MSC_VER) +#pragma GCC push_options +#pragma GCC target("+simd") +#endif +#elif __ARM_ARCH == 8 +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error \ + "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON." +#endif +#if !defined(__clang__) && !defined(_MSC_VER) +#pragma GCC push_options +#endif +#else +#error \ + "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A \ +(you could try setting target explicitly with -march or -mcpu)" +#endif +#endif + +#include +#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8) +#if defined __has_include && __has_include() +#include +#endif +#endif + +/* Apple Silicon cache lines are double of what is commonly used by Intel, AMD + * and other Arm microarchitectures use. 
+ * From sysctl -a on Apple M1:
+ * hw.cachelinesize: 128
+ */
+#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
+#define SSE2NEON_CACHELINE_SIZE 128
+#else
+#define SSE2NEON_CACHELINE_SIZE 64
+#endif
+
+/* Rounding functions require either Aarch64 instructions or libm fallback */
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+#include <math.h>
+#endif
+
+/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
+ * or even not accessible in user mode.
+ * To write to or access these registers in user mode, we have to perform a
+ * syscall instead.
+ */
+#if (!defined(__aarch64__) && !defined(_M_ARM64))
+#include <sys/time.h>
+#endif
+
+/* "__has_builtin" can be used to query support for built-in functions
+ * provided by gcc/clang and other compilers that support it.
+ */
+#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
+/* Compatibility with gcc <= 9 */
+#if defined(__GNUC__) && (__GNUC__ <= 9)
+#define __has_builtin(x) HAS##x
+#define HAS__builtin_popcount 1
+#define HAS__builtin_popcountll 1
+
+// __builtin_shuffle introduced in GCC 4.7.0
+#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
+#define HAS__builtin_shuffle 1
+#else
+#define HAS__builtin_shuffle 0
+#endif
+
+#define HAS__builtin_shufflevector 0
+#define HAS__builtin_nontemporal_store 0
+#else
+#define __has_builtin(x) 0
+#endif
+#endif
+
+/**
+ * MACRO for the shuffle parameter of _mm_shuffle_ps().
+ * Argument fp3 is a digit[0123] that represents the fp from argument "b"
+ * of mm_shuffle_ps that will be placed in fp3 of the result. fp2 is the same
+ * for fp2 in the result. fp1 is a digit[0123] that represents the fp from
+ * argument "a" of mm_shuffle_ps that will be placed in fp1 of the result.
+ * fp0 is the same for fp0 of the result.
+ */
+#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
+#if __has_builtin(__builtin_shufflevector)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __builtin_shufflevector(a, b, __VA_ARGS__)
+#elif __has_builtin(__builtin_shuffle)
+#define _sse2neon_shuffle(type, a, b, ...) \
+    __extension__({ \
+        type tmp = {__VA_ARGS__}; \
+        __builtin_shuffle(a, b, tmp); \
+    })
+#endif
+
+#ifdef _sse2neon_shuffle
+#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
+#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
+#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
+#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
+#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
+#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
+#endif
+
+/* Rounding mode macros.
*/ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 +#define _MM_FROUND_NO_EXC 0x08 +#define _MM_FROUND_RAISE_EXC 0x00 +#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) +#define _MM_ROUND_NEAREST 0x0000 +#define _MM_ROUND_DOWN 0x2000 +#define _MM_ROUND_UP 0x4000 +#define _MM_ROUND_TOWARD_ZERO 0x6000 +/* Flush zero mode macros. */ +#define _MM_FLUSH_ZERO_MASK 0x8000 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_FLUSH_ZERO_OFF 0x0000 +/* Denormals are zeros mode macros. */ +#define _MM_DENORMALS_ZERO_MASK 0x0040 +#define _MM_DENORMALS_ZERO_ON 0x0040 +#define _MM_DENORMALS_ZERO_OFF 0x0000 + +/* indicate immediate constant argument in a given range */ +#define __constrange(a, b) const + +/* A few intrinsics accept traditional data types like ints or floats, but + * most operate on data types that are specific to SSE. + * If a vector type ends in d, it contains doubles, and if it does not have + * a suffix, it contains floats. An integer vector type can contain any type + * of integer, from chars to shorts to unsigned long longs. + */ +typedef int64x1_t __m64; +typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ +// On ARM 32-bit architecture, the float64x2_t is not supported. +// The data type __m128d should be represented in a different way for related +// intrinsic conversion. +#if defined(__aarch64__) || defined(_M_ARM64) +typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ +#else +typedef float32x4_t __m128d; +#endif +typedef int64x2_t __m128i; /* 128-bit vector containing integers */ + +// Some intrinsics operate on unaligned data types. 
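+// As an illustrative sketch (editor's example, not part of the upstream
+// header), the unaligned_int32_t typedef defined just below lets an
+// unaligned load be written as a plain dereference on GCC/clang:
+//   int32_t load_unaligned_s32(const void *p)
+//   {
+//       return *(const unaligned_int32_t *) p; /* no 4-byte alignment assumed */
+//   }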
+typedef int16_t ALIGN_STRUCT(1) unaligned_int16_t; +typedef int32_t ALIGN_STRUCT(1) unaligned_int32_t; +typedef int64_t ALIGN_STRUCT(1) unaligned_int64_t; + +// __int64 is defined in the Intrinsics Guide which maps to different datatype +// in different data model +#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) +#if (defined(__x86_64__) || defined(__i386__)) +#define __int64 long long +#else +#define __int64 int64_t +#endif +#endif + +/* type-safe casting between types */ + +#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) +#define vreinterpretq_m128_f32(x) (x) +#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) + +#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) +#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) +#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) +#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) +#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) +#define vreinterpretq_f32_m128(x) (x) +#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) + +#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) +#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) +#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) +#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) +#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) +#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) +#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) +#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) +#define vreinterpretq_m128i_s64(x) (x) + +#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) +#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) +#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) +#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) + +#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) +#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) +#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) +#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) +#define vreinterpretq_s64_m128i(x) (x) + +#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) +#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) +#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) +#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) + +#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) +#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) +#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) +#define vreinterpret_m64_s64(x) (x) + +#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) +#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) +#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) +#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) + +#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) +#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) +#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) + +#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) +#define vreinterpret_u16_m64(x) 
vreinterpret_u16_s64(x) +#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) +#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) + +#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) +#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) +#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) +#define vreinterpret_s64_m64(x) (x) + +#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) + +#if defined(__aarch64__) || defined(_M_ARM64) +#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) + +#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) +#define vreinterpretq_m128d_f64(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) + +#define vreinterpretq_f64_m128d(x) (x) +#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) +#else +#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128d_f32(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_f32_m128d(x) (x) +#endif + +// A struct is defined in this header file called 'SIMDVec' which can be used +// by applications which attempt to access the contents of an __m128 struct +// directly. It is important to note that accessing the __m128 struct directly +// is bad coding practice by Microsoft: @see: +// https://learn.microsoft.com/en-us/cpp/cpp/m128 +// +// However, some legacy source code may try to access the contents of an __m128 +// struct directly so the developer can use the SIMDVec as an alias for it. Any +// casting must be done manually by the developer, as you cannot cast or +// otherwise alias the base NEON data type for intrinsic operations. +// +// union intended to allow direct access to an __m128 variable using the names +// that the MSVC compiler provides. This union should really only be used when +// trying to access the members of the vector as integer values. GCC/clang +// allow native access to the float members through a simple array access +// operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause +// a performance hit. If it really is needed however, the original __m128 +// variable can be aliased with a pointer to this union and used to access +// individual components. The use of this union should be hidden behind a macro +// that is used throughout the codebase to access the members instead of always +// declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. 
+} SIMDVec; + +// casting using SIMDVec +#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) +#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) +#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) + +/* SSE macros */ +#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode +#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode +#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode +#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode + +// Function declaration +// SSE +FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void); +FORCE_INLINE __m128 _mm_move_ss(__m128, __m128); +FORCE_INLINE __m128 _mm_or_ps(__m128, __m128); +FORCE_INLINE __m128 _mm_set_ps1(float); +FORCE_INLINE __m128 _mm_setzero_ps(void); +// SSE2 +FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i); +FORCE_INLINE __m128i _mm_castps_si128(__m128); +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i); +FORCE_INLINE __m128i _mm_cvtps_epi32(__m128); +FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d); +FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i); +FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int); +FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t); +FORCE_INLINE __m128d _mm_set_pd(double, double); +FORCE_INLINE __m128i _mm_set1_epi32(int); +FORCE_INLINE __m128i _mm_setzero_si128(void); +// SSE4.1 +FORCE_INLINE __m128d _mm_ceil_pd(__m128d); +FORCE_INLINE __m128 _mm_ceil_ps(__m128); +FORCE_INLINE __m128d _mm_floor_pd(__m128d); +FORCE_INLINE __m128 _mm_floor_ps(__m128); +FORCE_INLINE __m128d _mm_round_pd(__m128d, int); +FORCE_INLINE __m128 _mm_round_ps(__m128, int); +// SSE4.2 +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t); + +/* Backwards compatibility for compilers with lack of specific type support */ + +// Older gcc does not define vld1q_u8_x4 type +#if defined(__GNUC__) && !defined(__clang__) && \ + ((__GNUC__ <= 13 && defined(__arm__)) || \ + (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \ + (__GNUC__ <= 9 && defined(__aarch64__))) +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + uint8x16x4_t ret; + ret.val[0] = vld1q_u8(p + 0); + ret.val[1] = vld1q_u8(p + 16); + ret.val[2] = vld1q_u8(p + 32); + ret.val[3] = vld1q_u8(p + 48); + return ret; +} +#else +// Wraps vld1q_u8_x4 +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + return vld1q_u8_x4(p); +} +#endif + +#if !defined(__aarch64__) && !defined(_M_ARM64) +/* emulate vaddv u8 variant */ +FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8) +{ + const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8))); + return vget_lane_u8(vreinterpret_u8_u64(v1), 0); +} +#else +// Wraps vaddv_u8 +FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8) +{ + return vaddv_u8(v8); +} +#endif + +#if !defined(__aarch64__) && !defined(_M_ARM64) +/* emulate vaddvq u8 variant */ +FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a) +{ + uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a)); + uint8_t res = 0; + for (int i = 0; i < 8; ++i) + res += tmp[i]; + return res; +} +#else +// Wraps vaddvq_u8 +FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a) +{ + return vaddvq_u8(a); +} +#endif + +#if !defined(__aarch64__) && !defined(_M_ARM64) +/* emulate vaddvq u16 variant */ +FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a) +{ + uint32x4_t m = vpaddlq_u16(a); + uint64x2_t n = vpaddlq_u32(m); + uint64x1_t o = vget_low_u64(n) + vget_high_u64(n); + + return 
vget_lane_u32((uint32x2_t) o, 0);
+}
+#else
+// Wraps vaddvq_u16
+FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
+{
+    return vaddvq_u16(a);
+}
+#endif
+
+/* Function Naming Conventions
+ * The naming convention of SSE intrinsics is straightforward. A generic SSE
+ * intrinsic function is given as follows:
+ *   _mm_<name>_<type>
+ *
+ * The parts of this format are given as follows:
+ * 1. <name> describes the operation performed by the intrinsic
+ * 2. <type> identifies the data type of the function's primary arguments
+ *
+ * This last part, <type>, is a little complicated. It identifies the
+ * content of the input values, and can be set to any of the following values:
+ * + ps - vectors contain floats (ps stands for packed single-precision)
+ * + pd - vectors contain doubles (pd stands for packed double-precision)
+ * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   signed integers
+ * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
+ *   unsigned integers
+ * + si128 - unspecified 128-bit integer vector
+ * + m128/m128i/m128d - identifies input vector types when they are different
+ *   than the type of the returned vector
+ *
+ * For example, _mm_setzero_ps. The _mm implies that the function returns
+ * a 128-bit vector. The _ps at the end implies that the argument vectors
+ * contain floats.
+ *
+ * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
+ *   // Set packed 16-bit integers. 128 bits, 8 shorts, 16 bits each
+ *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ *   // Set packed 8-bit integers
+ *   // 128 bits, 16 chars, 8 bits each
+ *   __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
+ *                                  4, 5, 12, 13, 6, 7, 14, 15);
+ *   // Shuffle packed 8-bit integers
+ *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
+ */
+
+/* Constants for use with _mm_prefetch. */
+enum _mm_hint {
+    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
+    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
+    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
+    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
+};
+
+// The bit field mapping to the FPCR (floating-point control register)
+typedef struct {
+    uint16_t res0;
+    uint8_t res1 : 6;
+    uint8_t bit22 : 1;
+    uint8_t bit23 : 1;
+    uint8_t bit24 : 1;
+    uint8_t res2 : 7;
+#if defined(__aarch64__) || defined(_M_ARM64)
+    uint32_t res3;
+#endif
+} fpcr_bitfield;
+
+// Takes the upper 64 bits of a and places them in the low end of the result;
+// takes the lower 64 bits of b and places them in the high end of the result.
+FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
+{
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
+}
+
+// Takes the lower two 32-bit values from a, swaps them, and places them in
+// the low end of the result; takes the higher two 32-bit values from b,
+// swaps them, and places them in the high end of the result.
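+// (Editor's note: each _mm_shuffle_ps_WXYZ helper in this file hard-codes one
+// shuffle immediate; _mm_shuffle_ps_2301, for instance, computes the result of
+// _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 3, 0, 1)).)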
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) +{ + float32x2_t a21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) +{ + float32x2_t a03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); +} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the +// high +FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +{ + float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +{ + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +{ + float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +{ + float32x2_t a33 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); + 
return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); +} + +// For MSVC, we check only if it is ARM64, as every single ARM64 processor +// supported by WoA has crypto extensions. If this changes in the future, +// this can be verified via the runtime-only method of: +// IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) +#if (defined(_M_ARM64) && !defined(__clang__)) || \ + (defined(__ARM_FEATURE_CRYPTO) && \ + (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))) +// Wraps vmull_p64 +FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); + poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); +#if defined(_MSC_VER) && !defined(__clang__) + __n64 a1 = {a}, b1 = {b}; + return vreinterpretq_u64_p128(vmull_p64(a1, b1)); +#else + return vreinterpretq_u64_p128(vmull_p64(a, b)); +#endif +} +#else // ARMv7 polyfill +// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. +// +// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a +// 64-bit->128-bit polynomial multiply. +// +// It needs some work and is somewhat slow, but it is still faster than all +// known scalar methods. 
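+//
+// (Editor's summary: the polyfill below assembles the 64x64 -> 128-bit
+// carry-less product from eight 8x8 vmull_p8 partial products, which are then
+// aligned with vext shifts and folded together with XORs.)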
+// +// Algorithm adapted to C from +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted +// from "Fast Software Polynomial Multiplication on ARM Processors Using the +// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab +// (https://hal.inria.fr/hal-01506572) +static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly8x8_t a = vreinterpret_p8_u64(_a); + poly8x8_t b = vreinterpret_p8_u64(_b); + + // Masks + uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), + vcreate_u8(0x00000000ffffffff)); + uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), + vcreate_u8(0x0000000000000000)); + + // Do the multiplies, rotating with vext to get all combinations + uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 + uint8x16_t e = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 + uint8x16_t f = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 + uint8x16_t g = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 + uint8x16_t h = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 + uint8x16_t i = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 + uint8x16_t j = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 + uint8x16_t k = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 + + // Add cross products + uint8x16_t l = veorq_u8(e, f); // L = E + F + uint8x16_t m = veorq_u8(g, h); // M = G + H + uint8x16_t n = veorq_u8(i, j); // N = I + J + + // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL + // instructions. +#if defined(__aarch64__) + uint8x16_t lm_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t lm_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t nk_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); + uint8x16_t nk_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); +#else + uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); + uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); + uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); + uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); +#endif + // t0 = (L) (P0 + P1) << 8 + // t1 = (M) (P2 + P3) << 16 + uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); + uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); + uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); + + // t2 = (N) (P4 + P5) << 24 + // t3 = (K) (P6 + P7) << 32 + uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); + uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); + uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); + + // De-interleave +#if defined(__aarch64__) + uint8x16_t t0 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t1 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t2 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); + uint8x16_t t3 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); +#else + uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); + uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); + uint8x16_t t3 = 
vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); + uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); +#endif + // Shift the cross products + uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8 + uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 + uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 + uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 + + // Accumulate the products + uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); + uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); + uint8x16_t mix = veorq_u8(d, cross1); + uint8x16_t r = veorq_u8(mix, cross2); + return vreinterpretq_u64_u8(r); +} +#endif // ARMv7 polyfill + +// C equivalent: +// __m128i _mm_shuffle_epi32_default(__m128i a, +// __constrange(0, 255) int imm) { +// __m128i ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; +// return ret; +// } +#define _mm_shuffle_epi32_default(a, imm) \ + vreinterpretq_m128i_s32(vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ + vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ + vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), \ + ((imm) >> 2) & 0x3), \ + vmovq_n_s32(vgetq_lane_s32( \ + vreinterpretq_s32_m128i(a), (imm) & (0x3))), \ + 1), \ + 2), \ + 3)) + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of a and places it into the high end of the result. +FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in low end +// of result takes the higher two 32 bit values from a and swaps them and places +// in high end of result. 
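+// (Editor's illustration: with lanes a = {0, 1, 2, 3}, the helper below
+// returns {1, 0, 3, 2}, the same selection made by
+// _mm_shuffle_epi32(a, _MM_SHUFFLE(2, 3, 0, 1)).)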
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); +} + +// rotates the least significant 32 bits into the most significant 32 bits, and +// shifts the rest down +FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); +} + +// rotates the most significant 32 bits into the least significant 32 bits, and +// shifts the rest up +FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) +{ + return vreinterpretq_m128i_s32( + vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); +} + +// gets the lower 64 bits of a, and places it in the upper 64 bits +// gets the lower 64 bits of a and places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) +{ + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the +// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); +} + +// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the +// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and +// places it in the lower 64 bits +FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) +{ + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) +{ + int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) +{ + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); +} + +#if defined(__aarch64__) || defined(_M_ARM64) +#define _mm_shuffle_epi32_splat(a, imm) \ + vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))) +#else +#define _mm_shuffle_epi32_splat(a, imm) \ + vreinterpretq_m128i_s32( \ + vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))) +#endif + +// NEON does not support a general purpose permute intrinsic. +// Shuffle single-precision (32-bit) floating-point elements in a using the +// control in imm8, and store the results in dst. 
+//
+// C equivalent:
+//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
+//                                 __constrange(0, 255) int imm) {
+//       __m128 ret;
+//       ret[0] = a[imm & 0x3];         ret[1] = a[(imm >> 2) & 0x3];
+//       ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
+//       return ret;
+//   }
+//
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
+#define _mm_shuffle_ps_default(a, b, imm) \
+    vreinterpretq_m128_f32(vsetq_lane_f32( \
+        vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
+        vsetq_lane_f32( \
+            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
+            vsetq_lane_f32( \
+                vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
+                vmovq_n_f32( \
+                    vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \
+                1), \
+            2), \
+        3))
+
+// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
+// Store the results in the low 64 bits of dst, with the high 64 bits being
+// copied from a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
+#define _mm_shufflelo_epi16_function(a, imm) \
+    _sse2neon_define1( \
+        __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \
+        int16x4_t lowBits = vget_low_s16(ret); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
+                             1); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
+                             2); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
+                             3); \
+        _sse2neon_return(vreinterpretq_m128i_s16(ret));)
+
+// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
+// Store the results in the high 64 bits of dst, with the low 64 bits being
+// copied from a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
+#define _mm_shufflehi_epi16_function(a, imm) \
+    _sse2neon_define1( \
+        __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \
+        int16x4_t highBits = vget_high_s16(ret); \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
+                             5); \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
+                             6); \
+        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
+                             7); \
+        _sse2neon_return(vreinterpretq_m128i_s16(ret));)
+
+/* MMX */
+
+// _mm_empty is a no-op on ARM.
+FORCE_INLINE void _mm_empty(void) {}
+
+/* SSE */
+
+// Add packed single-precision (32-bit) floating-point elements in a and b, and
+// store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
+FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Add the lower single-precision (32-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper 3 packed
+// elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
+{
+    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+    // the upper values in the result must be the remnants of <a>.
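+    // (Editor's example: _mm_add_ss({1, 2, 3, 4}, {10, 20, 30, 40}) yields
+    // {11, 2, 3, 4}; only lane 0 is summed.)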
+ return vreinterpretq_m128_f32(vaddq_f32(a, value)); +} + +// Compute the bitwise AND of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise NOT of packed single-precision (32-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vbicq_s32(vreinterpretq_s32_m128(b), + vreinterpretq_s32_m128(a))); // *NOTE* argument swap +} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16 +FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16( + vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8 +FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss +FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps +FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss +FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpge_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. 
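+// (Editor's note: as with the other packed comparisons in this file, each
+// 32-bit lane of dst is set to 0xFFFFFFFF when the comparison holds and to
+// 0x00000000 otherwise.)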
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps +FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for greater-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss +FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than-or-equal, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss +FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmple_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps +FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss +FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmplt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss +FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. 
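+// (Editor's note: the "not-*" comparisons are bitwise inverses of their
+// positive counterparts, so unordered (NaN) lanes compare as true here,
+// matching Intel's unordered NGE predicate.)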
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps +FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper 3 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss +FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnge_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-greater-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps +FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss +FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpngt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss +FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnle_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss +FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpnlt_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. 
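+// (A common use, sketched by the editor: _mm_cmpord_ps(v, v) yields all-ones
+// lanes exactly where v is not NaN, which _mm_and_ps can then use to zero out
+// the NaN lanes of v.)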
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps +// +// See also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) +{ + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + uint32x4_t ceqaa = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t ceqbb = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss +FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpord_ps(a, b)); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps +FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) +{ + uint32x4_t f32a = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t f32b = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss +FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss +FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) +{ + uint32x4_t a_eq_b = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_eq_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss +FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) +{ + uint32x4_t a_ge_b = + vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_ge_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss +FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_gt_b = + vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_gt_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss +FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) +{ + uint32x4_t a_le_b = + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_le_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss +FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) +{ + uint32x4_t a_lt_b = + vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); + return vgetq_lane_u32(a_lt_b, 0) & 0x1; +} + +// Compare the lower single-precision (32-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss +FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) +{ + return !_mm_comieq_ss(a, b); +} + +// Convert packed signed 32-bit integers in b to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, and copy the upper 2 packed elements from a to the upper elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps +FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi +FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return vreinterpret_m64_s32( + vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))))); +#else + return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION))))); +#endif +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss +FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. 
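+// (Editor's note: unlike the truncating _mm_cvtt_ss2si further below, this
+// conversion honors the current rounding mode, round-to-nearest-even by
+// default.)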
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si +FORCE_INLINE int _mm_cvt_ss2si(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))), + 0); +#else + float32_t data = vgetq_lane_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); + return (int32_t) data; +#endif +} + +// Convert packed 16-bit integers in a to packed single-precision (32-bit) +// floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps +FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); +} + +// Convert packed 32-bit integers in b to packed single-precision (32-bit) +// floating-point elements, store the results in the lower 2 elements of dst, +// and copy the upper 2 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps +FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), + vget_high_f32(vreinterpretq_f32_m128(a)))); +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, store the results in the lower 2 elements +// of dst, then convert the packed signed 32-bit integers in b to +// single-precision (32-bit) floating-point element, and store the results in +// the upper 2 elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps +FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); +} + +// Convert the lower packed 8-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps +FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 16-bit integers, and store the results in dst. Note: this intrinsic +// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and +// 0x7FFFFFFF. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16 +FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) +{ + return vreinterpret_m64_s16( + vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a)))); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32 +#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 8-bit integers, and store the results in lower 4 elements of dst. +// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values +// between 0x7F and 0x7FFFFFFF. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8 +FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a) +{ + return vreinterpret_m64_s8(vqmovn_s16( + vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0)))); +} + +// Convert packed unsigned 16-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps +FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); +} + +// Convert the lower packed unsigned 8-bit integers in a to packed +// single-precision (32-bit) floating-point elements, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps +FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_u32( + vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); +} + +// Convert the signed 32-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss +#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) + +// Convert the signed 64-bit integer b to a single-precision (32-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper 3 packed elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss +FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); +} + +// Copy the lower single-precision (32-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32 +FORCE_INLINE float _mm_cvtss_f32(__m128 a) +{ + return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); +} + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32 +#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64 +FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0); +#else + float32_t data = vgetq_lane_f32( + vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); + return (int64_t) data; +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. 
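+// (Editor's note: NEON's float-to-signed conversion vcvtq_s32_f32 already
+// truncates toward zero, so the truncating variants below map onto it
+// directly.)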
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
+FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
+{
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
+FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
+{
+    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
+#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
+#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer with truncation, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
+FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
+{
+    return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Divide packed single-precision (32-bit) floating-point elements in a by
+// packed elements in b, and store the results in dst.
+// Due to ARMv7-A NEON's lack of a precise division intrinsic, we implement
+// division by multiplying a by b's reciprocal before using the Newton-Raphson
+// method to approximate the results.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
+FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128_f32(
+        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+    // Additional Newton-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
+#endif
+}
+
+// Divide the lower single-precision (32-bit) floating-point element in a by the
+// lower single-precision (32-bit) floating-point element in b, store the result
+// in the lower element of dst, and copy the upper 3 packed elements from a to
+// the upper elements of dst.
+// Warning: on ARMv7-A this emulated division may not produce the same result
+// as Intel hardware and is not IEEE-compliant.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
+FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
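+//
+// Illustrative example (not from the upstream sse2neon sources, and assuming
+// the _mm_set1_pi16 helper provided elsewhere in this header): the u16
+// reinterpretation below reproduces Intel's zero-extension of the selected
+// lane into the 32-bit result:
+//
+//   int r = _mm_extract_pi16(_mm_set1_pi16(-1), 0);  // 0xFFFF, not -1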
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
+#define _mm_extract_pi16(a, imm) \
+    (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))
+
+// Free aligned memory that was allocated with _mm_malloc.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
+#if !defined(SSE2NEON_ALLOC_DEFINED)
+FORCE_INLINE void _mm_free(void *addr)
+{
+    free(addr);
+}
+#endif
+
+FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
+{
+    uint64_t value;
+#if defined(_MSC_VER) && !defined(__clang__)
+    value = _ReadStatusReg(ARM64_FPCR);
+#else
+    __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */
+#endif
+    return value;
+}
+
+FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
+{
+#if defined(_MSC_VER) && !defined(__clang__)
+    _WriteStatusReg(ARM64_FPCR, value);
+#else
+    __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
+#endif
+}
+
+// Macro: Get the flush zero bits from the MXCSR control and status register.
+// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
+// _MM_FLUSH_ZERO_OFF
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
+FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
+{
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__) || defined(_M_ARM64)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+    r.value = _sse2neon_get_fpcr();
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
+}
+
+// Macro: Get the rounding mode bits from the MXCSR control and status register.
+// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
+// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
+FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
+{
+    switch (fegetround()) {
+    case FE_TONEAREST:
+        return _MM_ROUND_NEAREST;
+    case FE_DOWNWARD:
+        return _MM_ROUND_DOWN;
+    case FE_UPWARD:
+        return _MM_ROUND_UP;
+    case FE_TOWARDZERO:
+        return _MM_ROUND_TOWARD_ZERO;
+    default:
+        // fegetround() returns FE_TONEAREST, FE_DOWNWARD, FE_UPWARD or
+        // FE_TOWARDZERO on success; treat any other (error) return value as
+        // FE_TOWARDZERO (truncate).
+        return _MM_ROUND_TOWARD_ZERO;
+    }
+}
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
+#define _mm_insert_pi16(a, b, imm) \
+    vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm)))
+
+// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from memory into dst. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
+FORCE_INLINE __m128 _mm_load_ps(const float *p)
+{
+    return vreinterpretq_m128_f32(vld1q_f32(p));
+}
+
+// Load a single-precision (32-bit) floating-point element from memory into all
+// elements of dst.
+// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[63:32] := MEM[mem_addr+31:mem_addr] +// dst[95:64] := MEM[mem_addr+31:mem_addr] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1 +#define _mm_load_ps1 _mm_load1_ps + +// Load a single-precision (32-bit) floating-point element from memory into the +// lower of dst, and zero the upper 3 elements. mem_addr does not need to be +// aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss +FORCE_INLINE __m128 _mm_load_ss(const float *p) +{ + return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps +FORCE_INLINE __m128 _mm_load1_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_dup_f32(p)); +} + +// Load 2 single-precision (32-bit) floating-point elements from memory into the +// upper 2 elements of dst, and copy the lower 2 elements from a to dst. +// mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi +FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); +} + +// Load 2 single-precision (32-bit) floating-point elements from memory into the +// lower 2 elements of dst, and copy the upper 2 elements from a to dst. +// mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi +FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); +} + +// Load 4 single-precision (32-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps +FORCE_INLINE __m128 _mm_loadr_ps(const float *p) +{ + float32x4_t v = vrev64q_f32(vld1q_f32(p)); + return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); +} + +// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point +// elements) from memory into dst. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps +FORCE_INLINE __m128 _mm_loadu_ps(const float *p) +{ + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are + // equivalent for neon + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load unaligned 16-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16 +FORCE_INLINE __m128i _mm_loadu_si16(const void *p) +{ + return vreinterpretq_m128i_s16( + vsetq_lane_s16(*(const unaligned_int16_t *) p, vdupq_n_s16(0), 0)); +} + +// Load unaligned 64-bit integer from memory into the first element of dst. 
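+//
+// Illustrative note (assumption: the unaligned_int64_t typedef earlier in
+// this header carries 1-byte alignment, as in upstream sse2neon, so the
+// dereference below is a well-defined unaligned access):
+//
+//   char buf[9] = {0};
+//   __m128i v = _mm_loadu_si64(buf + 1);  // odd address is fine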
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64 +FORCE_INLINE __m128i _mm_loadu_si64(const void *p) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(*(const unaligned_int64_t *) p, vdupq_n_s64(0), 0)); +} + +// Allocate size bytes of memory, aligned to the alignment specified in align, +// and return a pointer to the allocated memory. _mm_free should be used to free +// memory that is allocated with _mm_malloc. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc +#if !defined(SSE2NEON_ALLOC_DEFINED) +FORCE_INLINE void *_mm_malloc(size_t size, size_t align) +{ + void *ptr; + if (align == 1) + return malloc(size); + if (align == 2 || (sizeof(void *) == 8 && align == 4)) + align = sizeof(void *); + if (!posix_memalign(&ptr, align, size)) + return ptr; + return NULL; +} +#endif + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64 +FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) +{ + int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7); + __m128 b = _mm_load_ps((const float *) mem_addr); + int8x8_t masked = + vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), + vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b)))); + vst1_s8((int8_t *) mem_addr, masked); +} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq +#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16 +FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed maximum values in dst. dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or +// signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps +FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b)); +#else + return vreinterpretq_m128_f32( + vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8 +FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16 +FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed single-precision (32-bit) floating-point elements in a and b, +// and store packed minimum values in dst. dst does not follow the IEEE Standard +// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or +// signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b)); +#else + return vreinterpretq_m128_f32( + vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8 +FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare the lower single-precision (32-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper 3 +// packed elements from a to the upper element of dst. dst does not follow the +// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when +// inputs are NaN or signed-zero values. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Move the lower single-precision (32-bit) floating-point element from b to the +// lower element of dst, and copy the upper 3 packed elements from a to the +// upper elements of dst. 
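+//
+// Illustrative example (not from the upstream sse2neon sources): this is the
+// merge primitive that many *_ss wrappers in this file reduce to, e.g.
+// _mm_mul_ss(a, b) below is _mm_move_ss(a, _mm_mul_ps(a, b)):
+//
+//   __m128 a = _mm_setr_ps(1, 2, 3, 4);
+//   __m128 b = _mm_setr_ps(10, 0, 0, 0);
+//   __m128 r = _mm_move_ss(a, b);  // {10, 2, 3, 4}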
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
+FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
+                       vreinterpretq_f32_m128(a), 0));
+}
+
+// Move the upper 2 single-precision (32-bit) floating-point elements from b to
+// the lower 2 elements of dst, and copy the upper 2 elements from a to the
+// upper 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
+FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128_u64(
+        vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
+#else
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
+#endif
+}
+
+// Move the lower 2 single-precision (32-bit) floating-point elements from b to
+// the upper 2 elements of dst, and copy the lower 2 elements from a to the
+// lower 2 elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
+FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
+{
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
+    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
+}
+
+// Create mask from the most significant bit of each 8-bit element in a, and
+// store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
+FORCE_INLINE int _mm_movemask_pi8(__m64 a)
+{
+    uint8x8_t input = vreinterpret_u8_m64(a);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint8x8_t tmp = vshr_n_u8(input, 7);
+    return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
+#else
+    // Refer to the implementation of `_mm_movemask_epi8`
+    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
+    uint32x2_t paired16 =
+        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
+    uint8x8_t paired32 =
+        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
+    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
+#endif
+}
+
+// Set each bit of mask dst based on the most significant bit of the
+// corresponding packed single-precision (32-bit) floating-point element in a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
+{
+    uint32x4_t input = vreinterpretq_u32_m128(a);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    static const int32_t shift[4] = {0, 1, 2, 3};
+    uint32x4_t tmp = vshrq_n_u32(input, 31);
+    return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
+#else
+    // Uses the exact same method as _mm_movemask_epi8, see that for details.
+    // Shift out everything but the sign bits with a 32-bit unsigned shift
+    // right.
+    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
+    // Merge the two pairs together with a 64-bit unsigned shift right + add.
+    uint8x16_t paired =
+        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+    // Extract the result.
+    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
+#endif
+}
+
+// Multiply packed single-precision (32-bit) floating-point elements in a and b,
+// and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Multiply the lower single-precision (32-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper 3 packed +// elements from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss +FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_mul_ps(a, b)); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16 +FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) +{ + return vreinterpret_m64_u16(vshrn_n_u32( + vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); +} + +// Compute the bitwise OR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb +#define _m_pavgb(a, b) _mm_avg_pu8(a, b) + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw +#define _m_pavgw(a, b) _mm_avg_pu16(a, b) + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw +#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw +#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) + +// Compare packed signed 16-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw +#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub +#define _m_pmaxub(a, b) _mm_max_pu8(a, b) + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. 
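+//
+// Illustrative example (not from the upstream sse2neon sources, and assuming
+// the _mm_setr_pi8 helper provided elsewhere in this header): this aliases
+// _mm_movemask_pi8 above, whose ARMv7 path gathers the sign bits with paired
+// shift-accumulate (vsra) steps:
+//
+//   __m64 v = _mm_setr_pi8(-1, 0, -1, 0, 0, 0, 0, -1);
+//   int m = _mm_movemask_pi8(v);  // 0x85 (bits 0, 2 and 7 set)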
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
+#define _m_pmovmskb(a) _mm_movemask_pi8(a)
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
+#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
+
+// Fetch the line of data from memory that contains address p to a location in
+// the cache hierarchy specified by the locality hint i.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
+FORCE_INLINE void _mm_prefetch(char const *p, int i)
+{
+    (void) i;
+#if defined(_MSC_VER) && !defined(__clang__)
+    switch (i) {
+    case _MM_HINT_NTA:
+        __prefetch2(p, 1);
+        break;
+    case _MM_HINT_T0:
+        __prefetch2(p, 0);
+        break;
+    case _MM_HINT_T1:
+        __prefetch2(p, 2);
+        break;
+    case _MM_HINT_T2:
+        __prefetch2(p, 4);
+        break;
+    }
+#else
+    switch (i) {
+    case _MM_HINT_NTA:
+        __builtin_prefetch(p, 0, 0);
+        break;
+    case _MM_HINT_T0:
+        __builtin_prefetch(p, 0, 3);
+        break;
+    case _MM_HINT_T1:
+        __builtin_prefetch(p, 0, 2);
+        break;
+    case _MM_HINT_T2:
+        __builtin_prefetch(p, 0, 1);
+        break;
+    }
+#endif
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
+#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
+
+// Shuffle 16-bit integers in a using the control in imm8, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
+#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)
+
+// Compute the approximate reciprocal of packed single-precision (32-bit)
+// floating-point elements in a, and store the results in dst. The maximum
+// relative error for this approximation is less than 1.5*2^-12.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
+FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
+{
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Newton-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#endif
+    return vreinterpretq_m128_f32(recip);
+}
+
+// Compute the approximate reciprocal of the lower single-precision (32-bit)
+// floating-point element in a, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
+FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
+{
+    return _mm_move_ss(a, _mm_rcp_ps(a));
+}
+
+// Compute the approximate reciprocal square root of packed single-precision
+// (32-bit) floating-point elements in a, and store the results in dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
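+//
+// Illustrative note (not from the upstream sse2neon sources): vrsqrteq_f32
+// alone yields only a rough (~8-bit) estimate; each vrsqrtsq_f32 factor below
+// applies one Newton-Raphson step, x' = x * (3 - d*x*x) / 2, roughly doubling
+// the number of correct bits, while the explicit +/-0.0f handling preserves
+// the SSE behaviour of returning +/-infinity for signed-zero inputs.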
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
+FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
+{
+    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+
+    // Generate masks for detecting whether input has any 0.0f/-0.0f
+    // (which becomes positive/negative infinity by IEEE-754 arithmetic rules).
+    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
+    const uint32x4_t neg_inf = vdupq_n_u32(0xFF800000);
+    const uint32x4_t has_pos_zero =
+        vceqq_u32(pos_inf, vreinterpretq_u32_f32(out));
+    const uint32x4_t has_neg_zero =
+        vceqq_u32(neg_inf, vreinterpretq_u32_f32(out));
+
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+#if SSE2NEON_PRECISE_SQRT
+    // Additional Newton-Raphson iteration for accuracy
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+#endif
+
+    // Set output vector element to infinity/negative-infinity if
+    // the corresponding input vector element is 0.0f/-0.0f.
+    out = vbslq_f32(has_pos_zero, (float32x4_t) pos_inf, out);
+    out = vbslq_f32(has_neg_zero, (float32x4_t) neg_inf, out);
+
+    return vreinterpretq_m128_f32(out);
+}
+
+// Compute the approximate reciprocal square root of the lower single-precision
+// (32-bit) floating-point element in a, store the result in the lower element
+// of dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
+FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
+{
+    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
+FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
+{
+    uint64x1_t t = vpaddl_u32(vpaddl_u16(
+        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
+    return vreinterpret_m64_u16(
+        vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
+}
+
+// Macro: Set the flush zero bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The flush zero may contain any of the
+// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
+FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
+{
+    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
+    // regardless of the value of the FZ bit.
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__) || defined(_M_ARM64)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+    r.value = _sse2neon_get_fpcr();
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+    _sse2neon_set_fpcr(r.value);
+#else
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
+#endif
+}
+
+// Set packed single-precision (32-bit) floating-point elements in dst with the
+// supplied values.
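+//
+// Illustrative example (not from the upstream sse2neon sources): arguments
+// are listed from the highest lane down, so the last argument lands in lane
+// 0, matching x86:
+//
+//   __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
+//   float lane0 = _mm_cvtss_f32(v);  // 1.0f
+//   // _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f) builds the same vector.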
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
+FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
+{
+    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Broadcast single-precision (32-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
+FORCE_INLINE __m128 _mm_set_ps1(float _w)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Macro: Set the rounding mode bits of the MXCSR control and status register to
+// the value in unsigned 32-bit integer a. The rounding mode may contain any of
+// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
+// _MM_ROUND_TOWARD_ZERO
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
+FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
+{
+    switch (rounding) {
+    case _MM_ROUND_NEAREST:
+        rounding = FE_TONEAREST;
+        break;
+    case _MM_ROUND_DOWN:
+        rounding = FE_DOWNWARD;
+        break;
+    case _MM_ROUND_UP:
+        rounding = FE_UPWARD;
+        break;
+    case _MM_ROUND_TOWARD_ZERO:
+        rounding = FE_TOWARDZERO;
+        break;
+    default:
+        // rounding must be one of _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+        // _MM_ROUND_UP or _MM_ROUND_TOWARD_ZERO; any other (invalid) value is
+        // treated as FE_TOWARDZERO (truncate).
+        rounding = FE_TOWARDZERO;
+    }
+    fesetround(rounding);
+}
+
+// Copy single-precision (32-bit) floating-point element a to the lower element
+// of dst, and zero the upper 3 elements.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
+FORCE_INLINE __m128 _mm_set_ss(float a)
+{
+    return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
+}
+
+// Broadcast single-precision (32-bit) floating-point value a to all elements of
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
+FORCE_INLINE __m128 _mm_set1_ps(float _w)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
+}
+
+// Set the MXCSR control and status register with the value in unsigned 32-bit
+// integer a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
+// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
+FORCE_INLINE void _mm_setcsr(unsigned int a)
+{
+    _MM_SET_ROUNDING_MODE(a);
+}
+
+// Get the unsigned 32-bit value of the MXCSR control and status register.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
+// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
+FORCE_INLINE unsigned int _mm_getcsr(void)
+{
+    return _MM_GET_ROUNDING_MODE();
+}
+
+// Set packed single-precision (32-bit) floating-point elements in dst with the
+// supplied values in reverse order.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
+FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
+{
+    float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
+    return vreinterpretq_m128_f32(vld1q_f32(data));
+}
+
+// Return vector of type __m128 with all elements set to zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
+FORCE_INLINE __m128 _mm_setzero_ps(void)
+{
+    return vreinterpretq_m128_f32(vdupq_n_f32(0));
+}
+
+// Shuffle 16-bit integers in a using the control in imm8, and store the results
+// in dst.
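+//
+// Illustrative example (not from the upstream sse2neon sources, and assuming
+// the _mm_setr_pi16 helper provided elsewhere in this header): imm8 packs
+// four 2-bit source-lane indices, the least significant pair selecting result
+// lane 0:
+//
+//   __m64 v = _mm_setr_pi16(10, 11, 12, 13);
+//   __m64 r = _mm_shuffle_pi16(v, _MM_SHUFFLE(0, 1, 2, 3));
+//   // r = {13, 12, 11, 10}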
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16 +#ifdef _sse2neon_shuffle +#define _mm_shuffle_pi16(a, imm) \ + vreinterpret_m64_s16(vshuffle_s16( \ + vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \ + ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))) +#else +#define _mm_shuffle_pi16(a, imm) \ + _sse2neon_define1( \ + __m64, a, int16x4_t ret; \ + ret = vmov_n_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \ + 1); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \ + 2); \ + ret = vset_lane_s16( \ + vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \ + 3); \ + _sse2neon_return(vreinterpret_m64_s16(ret));) +#endif + +// Perform a serializing operation on all store-to-memory instructions that were +// issued prior to this instruction. Guarantees that every store instruction +// that precedes, in program order, is globally visible before any store +// instruction which follows the fence in program order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence +FORCE_INLINE void _mm_sfence(void) +{ + _sse2neon_smp_mb(); +} + +// Perform a serializing operation on all load-from-memory and store-to-memory +// instructions that were issued prior to this instruction. Guarantees that +// every memory access that precedes, in program order, the memory fence +// instruction is globally visible before any memory instruction which follows +// the fence in program order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence +FORCE_INLINE void _mm_mfence(void) +{ + _sse2neon_smp_mb(); +} + +// Perform a serializing operation on all load-from-memory instructions that +// were issued prior to this instruction. Guarantees that every load instruction +// that precedes, in program order, is globally visible before any load +// instruction which follows the fence in program order. 
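+//
+// Illustrative note (not from the upstream sse2neon sources): ARM offers no
+// direct load-only or store-only equivalent of the x86 fences, so _mm_sfence,
+// _mm_mfence and _mm_lfence all lower to the same full barrier
+// (_sse2neon_smp_mb, typically a dmb ish), which is stronger than, and hence
+// a correct substitute for, each individual x86 fence.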
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence +FORCE_INLINE void _mm_lfence(void) +{ + _sse2neon_smp_mb(); +} + +// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) +// int imm) +#ifdef _sse2neon_shuffle +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + float32x4_t _input1 = vreinterpretq_f32_m128(a); \ + float32x4_t _input2 = vreinterpretq_f32_m128(b); \ + float32x4_t _shuf = \ + vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128_f32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_ps(a, b, imm) \ + _sse2neon_define2( \ + __m128, a, b, __m128 ret; switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_ps_1032(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_ps_2301(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_ps_0321(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_ps_2103(_a, _b); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_movelh_ps(_a, _b); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_ps_1001(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_ps_0101(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 1, 0): \ + ret = _mm_shuffle_ps_3210(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 0, 1, 1): \ + ret = _mm_shuffle_ps_0011(_a, _b); \ + break; \ + case _MM_SHUFFLE(0, 0, 2, 2): \ + ret = _mm_shuffle_ps_0022(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 2, 0, 0): \ + ret = _mm_shuffle_ps_2200(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 0, 2): \ + ret = _mm_shuffle_ps_3202(_a, _b); \ + break; \ + case _MM_SHUFFLE(3, 2, 3, 2): \ + ret = _mm_movehl_ps(_b, _a); \ + break; \ + case _MM_SHUFFLE(1, 1, 3, 3): \ + ret = _mm_shuffle_ps_1133(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 1, 0): \ + ret = _mm_shuffle_ps_2010(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 0, 1): \ + ret = _mm_shuffle_ps_2001(_a, _b); \ + break; \ + case _MM_SHUFFLE(2, 0, 3, 2): \ + ret = _mm_shuffle_ps_2032(_a, _b); \ + break; \ + default: \ + ret = _mm_shuffle_ps_default(_a, _b, (imm)); \ + break; \ + } _sse2neon_return(ret);) +#endif + +// Compute the square root of packed single-precision (32-bit) floating-point +// elements in a, and store the results in dst. +// Due to ARMv7-A NEON's lack of a precise square root intrinsic, we implement +// square root by multiplying input in with its reciprocal square root before +// using the Newton-Raphson method to approximate the results. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps +FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) && !SSE2NEON_PRECISE_SQRT + return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); +#else + float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in)); + + // Test for vrsqrteq_f32(0) -> positive infinity case. + // Change to zero, so that s * 1/sqrt(s) result is zero too. 
+    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
+    const uint32x4_t div_by_zero =
+        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
+    recip = vreinterpretq_f32_u32(
+        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
+
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+    // Additional Newton-Raphson iteration for accuracy
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+
+    // sqrt(s) = s * 1/sqrt(s)
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
+#endif
+}
+
+// Compute the square root of the lower single-precision (32-bit) floating-point
+// element in a, store the result in the lower element of dst, and copy the
+// upper 3 packed elements from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
+FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
+FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
+FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    vst1q_f32(p, vdupq_n_f32(a0));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// memory. mem_addr does not need to be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
+FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
+{
+    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
+#define _mm_store1_ps _mm_store_ps1
+
+// Store the upper 2 single-precision (32-bit) floating-point elements from a
+// into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
+FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
+{
+    *p = vreinterpret_m64_f32(vget_high_f32(a));
+}
+
+// Store the lower 2 single-precision (32-bit) floating-point elements from a
+// into memory.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
+FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
+{
+    *p = vreinterpret_m64_f32(vget_low_f32(a));
+}
+
+// Store 4 single-precision (32-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
+FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
+{
+    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
+    float32x4_t rev = vextq_f32(tmp, tmp, 2);
+    vst1q_f32(p, rev);
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
+// elements) from a into memory. mem_addr does not need to be aligned on any
+// particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
+FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Stores 16-bits of integer data a at the address p.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
+FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
+{
+    vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
+}
+
+// Stores 64-bits of integer data a at the address p.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
+FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
+{
+    vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
+}
+
+// Store 64-bits of integer data from a into memory using a non-temporal memory
+// hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
+FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
+{
+    vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
+}
+
+// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
+// point elements) from a into memory using a non-temporal memory hint.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
+FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
+{
+#if __has_builtin(__builtin_nontemporal_store)
+    __builtin_nontemporal_store(a, (float32x4_t *) p);
+#else
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+#endif
+}
+
+// Subtract packed single-precision (32-bit) floating-point elements in b from
+// packed single-precision (32-bit) floating-point elements in a, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
+FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_f32(
+        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Subtract the lower single-precision (32-bit) floating-point element in b from
+// the lower single-precision (32-bit) floating-point element in a, store the
+// result in the lower element of dst, and copy the upper 3 packed elements from
+// a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
+FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_sub_ps(a, b));
+}
+
+// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
+// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
+// transposed matrix in these vectors (row0 now contains column 0, etc.).
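+//
+// Illustrative example (not from the upstream sse2neon sources): the macro
+// builds the 4x4 transpose out of vtrnq_f32 2x2 transposes plus
+// vget/vcombine recombination:
+//
+//   __m128 r0 = _mm_setr_ps(1, 2, 3, 4), r1 = _mm_setr_ps(5, 6, 7, 8),
+//          r2 = _mm_setr_ps(9, 10, 11, 12), r3 = _mm_setr_ps(13, 14, 15, 16);
+//   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);  // r0 is now {1, 5, 9, 13}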
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ + float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ + row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ + vget_low_f32(ROW23.val[0])); \ + row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ + vget_low_f32(ROW23.val[1])); \ + row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ + vget_high_f32(ROW23.val[0])); \ + row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ + vget_high_f32(ROW23.val[1])); \ + } while (0) + +// according to the documentation, these intrinsics behave the same as the +// non-'u' versions. We'll just alias them here. +#define _mm_ucomieq_ss _mm_comieq_ss +#define _mm_ucomige_ss _mm_comige_ss +#define _mm_ucomigt_ss _mm_comigt_ss +#define _mm_ucomile_ss _mm_comile_ss +#define _mm_ucomilt_ss _mm_comilt_ss +#define _mm_ucomineq_ss _mm_comineq_ss + +// Return vector of type __m128i with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128 +FORCE_INLINE __m128i _mm_undefined_si128(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128i a; +#if defined(_MSC_VER) + a = _mm_setzero_si128(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Return vector of type __m128 with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps +FORCE_INLINE __m128 _mm_undefined_ps(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128 a; +#if defined(_MSC_VER) + a = _mm_setzero_ps(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the high half a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave single-precision (32-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Compute the bitwise XOR of packed single-precision (32-bit) floating-point +// elements in a and b, and store the results in dst. 
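+//
+// Illustrative example (not from the upstream sse2neon sources): a common use
+// is flipping sign bits without touching the rest of the value:
+//
+//   __m128 v = _mm_setr_ps(1.0f, -2.0f, 3.0f, -4.0f);
+//   __m128 neg = _mm_xor_ps(v, _mm_set1_ps(-0.0f));  // {-1, 2, -3, 4}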
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +/* SSE2 */ + +// Add packed 16-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16 +FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed 32-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32 +FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Add packed 64-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64 +FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Add packed 8-bit integers in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8 +FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Add packed double-precision (64-bit) floating-point elements in a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd +FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[2]; + c[0] = a0 + b0; + c[1] = a1 + b1; + return vld1q_f32((float32_t *) c); +#endif +} + +// Add the lower double-precision (64-bit) floating-point element in a and b, +// store the result in the lower element of dst, and copy the upper element from +// a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd +FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_add_pd(a, b)); +#else + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2]; + c[0] = a0 + b0; + c[1] = a1; + return vld1q_f32((float32_t *) c); +#endif +} + +// Add 64-bit integers a and b, and store the result in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64 +FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// Add packed signed 16-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16 +FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed signed 8-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8 +FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Add packed unsigned 16-bit integers in a and b using saturation, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16 +FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Add packed unsigned 8-bit integers in a and b using saturation, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8 +FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compute the bitwise AND of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd +FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128 +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compute the bitwise NOT of packed double-precision (64-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd +FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) +{ + // *NOTE* argument swap + return vreinterpretq_m128d_s64( + vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); +} + +// Compute the bitwise NOT of 128 bits (representing integer data) in a and then +// AND with b, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128 +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vbicq_s32(vreinterpretq_s32_m128i(b), + vreinterpretq_s32_m128i(a))); // *NOTE* argument swap +} + +// Average packed unsigned 16-bit integers in a and b, and store the results in +// dst. 
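+//
+// Illustrative note (not from the upstream sse2neon sources): vrhaddq_* is
+// the rounding halving add, (a + b + 1) >> 1 computed without intermediate
+// overflow, which matches the x86 pavg semantics exactly:
+//
+//   _mm_avg_epu16(_mm_set1_epi16(1), _mm_set1_epi16(2));  // every lane = 2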
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16 +FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) +{ + return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), + vreinterpretq_u16_m128i(b)); +} + +// Average packed unsigned 8-bit integers in a and b, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8 +FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128 +#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128 +#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm) + +// Cast vector of type __m128d to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps +FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) +{ + return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a)); +} + +// Cast vector of type __m128d to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128 +FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) +{ + return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a)); +} + +// Cast vector of type __m128 to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd +FORCE_INLINE __m128d _mm_castps_pd(__m128 a) +{ + return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); +} + +// Cast vector of type __m128 to type __m128i. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128 +FORCE_INLINE __m128i _mm_castps_si128(__m128 a) +{ + return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); +} + +// Cast vector of type __m128i to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd +FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a)); +#else + return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a)); +#endif +} + +// Cast vector of type __m128i to type __m128. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. 
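+//
+// Illustrative example (not from the upstream sse2neon sources): every
+// _mm_cast* helper is a pure bit reinterpretation (vreinterpretq underneath),
+// so values round-trip exactly:
+//
+//   __m128i i = _mm_set1_epi32(0x3F800000);
+//   float f = _mm_cvtss_f32(_mm_castsi128_ps(i));  // 1.0f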
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+{
+    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+}
+
+// Invalidate and flush the cache line that contains p from all levels of the
+// cache hierarchy.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
+#if defined(__APPLE__)
+#include <libkern/OSCacheControl.h>
+#endif
+FORCE_INLINE void _mm_clflush(void const *p)
+{
+    (void) p;
+
+    /* sys_icache_invalidate is supported since macOS 10.5.
+     * However, it does not work on non-jailbroken iOS devices, although the
+     * compilation is successful.
+     */
+#if defined(__APPLE__)
+    sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
+#elif defined(__GNUC__) || defined(__clang__)
+    uintptr_t ptr = (uintptr_t) p;
+    __builtin___clear_cache((char *) ptr,
+                            (char *) ptr + SSE2NEON_CACHELINE_SIZE);
+#elif (_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H
+    FlushInstructionCache(GetCurrentProcess(), p, SSE2NEON_CACHELINE_SIZE);
+#endif
+}
+
+// Compare packed 16-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
+FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed 32-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed 8-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
+FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for equality, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
+FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for equality, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
+FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for greater-than-or-equal, and store the results in dst.
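+// [Editor's note] Each comparison fills a 64-bit lane with all ones (true) or
+// all zeros (false), which composes with the bitwise ops above into branchless
+// selects (illustrative sketch, not upstream code):
+//     __m128d m = _mm_cmpge_pd(a, b);  // per-lane mask for a >= b
+//     __m128d r = _mm_or_pd(_mm_and_pd(m, x), _mm_andnot_pd(m, y));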
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd +FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64( + vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 >= b1 ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd +FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmpge_pd(a, b)); +#else + // expand "_mm_cmpge_pd()" to reduce unnecessary operations + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + uint64_t d[2]; + d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed signed 16-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16 +FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed signed 32-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32 +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b for greater-than, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8 +FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. 
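+// [Editor's note] Combined with _mm_movemask_pd (defined later in this file),
+// the lane masks collapse into an integer for scalar branching (illustrative):
+//     int bits = _mm_movemask_pd(_mm_cmpgt_pd(a, b)); // bit i set if a[i] > b[i]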
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
+FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    uint64_t d[2];
+    d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 > b1 ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
+FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
+#else
+    // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
+    double a0, b0;
+    a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    uint64_t d[2];
+    d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
+FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_u64(
+        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    uint64_t d[2];
+    d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 <= b1 ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
+FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return _mm_move_sd(a, _mm_cmple_pd(a, b));
+#else
+    // expand "_mm_cmple_pd()" to reduce unnecessary operations
+    double a0, b0;
+    a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    uint64_t d[2];
+    d[0] = a0 <= b0 ?
~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed signed 16-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16 +FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed signed 32-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32 +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compare packed signed 8-bit integers in a and b for less-than, and store the +// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the +// order of the operands switched. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8 +FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd +FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64( + vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1 < b1 ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for less-than, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd +FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmplt_pd(a, b)); +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + uint64_t d[2]; + d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-equal, and store the results in dst. 
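+// [Editor's note] NEQ is the unordered inverse of EQ: a lane holding NaN
+// always compares not-equal, so _mm_cmpneq_pd(x, x) flags exactly the NaN
+// lanes of x (illustrative observation, not upstream documentation).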
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
+FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
+        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
+#else
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-equal, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
+FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than-or-equal, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
+FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    uint64_t d[2];
+    d[0] = !(a0 >= b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 >= b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for not-greater-than-or-equal, store the result in the lower element of
+// dst, and copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
+FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-greater-than, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd
+FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_u64(veorq_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
+        vdupq_n_u64(UINT64_MAX)));
+#else
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    uint64_t d[2];
+    d[0] = !(a0 > b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 > b1) ?
~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd +FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpngt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than-or-equal, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd +FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64(veorq_u64( + vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = !(a0 <= b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 <= b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd +FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnle_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-less-than, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd +FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_u64(veorq_u64( + vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), + vdupq_n_u64(UINT64_MAX))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = !(a0 < b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = !(a1 < b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-less-than, store the result in the lower element of dst, and copy +// the upper element from a to the upper element of dst. 
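+// [Editor's note] The not-* predicates are not mere operand swaps once NaNs
+// appear: !(NaN < x) is true, so _mm_cmpnlt_pd sets a lane that _mm_cmpge_pd
+// would leave clear (illustrative observation).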
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd +FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpnlt_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if neither is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd +FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + // Excluding NaNs, any two floating point numbers can be compared. + uint64x2_t not_nan_a = + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); + uint64x2_t not_nan_b = + vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b)); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (a1 == a1 && b1 == b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if neither is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd +FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmpord_pd(a, b)); +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + uint64_t d[2]; + d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// to see if either is NaN, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd +FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + // Two NaNs are not equal in comparison operation. + uint64x2_t not_nan_a = + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); + uint64x2_t not_nan_b = + vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_s32( + vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b)))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + uint64_t d[2]; + d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0); + d[1] = (a1 == a1 && b1 == b1) ? 
UINT64_C(0) : ~UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b to see if either is NaN, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd +FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_cmpunord_pd(a, b)); +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + uint64_t d[2]; + d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd +FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return a0 >= b0; +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for greater-than, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd +FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1; +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + + return a0 > b0; +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than-or-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd +FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1; +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + + return a0 <= b0; +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for less-than, and return the boolean result (0 or 1). 
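+// [Editor's note] Unlike the _mm_cmp*_sd family, the _mm_comi*_sd functions
+// return a plain int, so they slot directly into scalar control flow
+// (illustrative sketch, not upstream code):
+//     if (_mm_comilt_sd(a, b)) { /* low lane of a is smaller */ }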
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd +FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1; +#else + double a0, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + + return a0 < b0; +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd +FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1; +#else + uint32x4_t a_not_nan = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a)); + uint32x4_t b_not_nan = + vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_eq_b = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan), + vreinterpretq_u64_u32(a_eq_b)); + return vgetq_lane_u64(and_results, 0) & 0x1; +#endif +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for not-equal, and return the boolean result (0 or 1). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd +FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b) +{ + return !_mm_comieq_sd(a, b); +} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd +FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))))); +#else + double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); + double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Convert packed signed 32-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps +FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. 
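+// [Editor's note] Illustrative sketch (not upstream code): under the default
+// round-to-nearest-even mode, both halfway cases below round to 2:
+//     __m128i r = _mm_cvtpd_epi32(_mm_set_pd(2.5, 1.5)); // lanes {2, 2, 0, 0}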
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32 +FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) +{ +// vrnd32xq_f64 not supported on clang +#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__) + float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a)); + int64x2_t integers = vcvtq_s64_f64(rounded); + return vreinterpretq_m128i_s32( + vcombine_s32(vmovn_s64(integers), vdup_n_s32(0))); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); + return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0); +#endif +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32 +FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a) +{ + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double d0, d1; + d0 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + d1 = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1)); + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1}; + return vreinterpret_m64_s32(vld1_s32(data)); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed single-precision (32-bit) floating-point elements, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps +FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); + return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); +#else + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_ps(0, 0, (float) a1, (float) a0); +#endif +} + +// Convert packed signed 32-bit integers in a to packed double-precision +// (64-bit) floating-point elements, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd +FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a)))); +#else + double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0); + double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32 +// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A +// does not support! It is supported on ARMv8-A however. 
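+// [Editor's note] The ARMv7-A fallback below therefore emulates
+// round-half-to-even by hand: it rounds normally, truncates, and selects the
+// even candidate whenever the fractional part is exactly +/-0.5 (illustrative
+// summary of the code that follows).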
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) +{ +#if defined(__ARM_FEATURE_FRINT) + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a))); +#elif (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + switch (_MM_GET_ROUNDING_MODE()) { + case _MM_ROUND_NEAREST: + return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); + case _MM_ROUND_DOWN: + return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a)); + case _MM_ROUND_UP: + return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a)); + default: // _MM_ROUND_TOWARD_ZERO + return vreinterpretq_m128i_s32(vcvtq_s32_f32(a)); + } +#else + float *f = (float *) &a; + switch (_MM_GET_ROUNDING_MODE()) { + case _MM_ROUND_NEAREST: { + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = vcvtq_s32_f32( + vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = + vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128i_s32( + vbslq_s32(is_delta_half, r_even, r_normal)); + } + case _MM_ROUND_DOWN: + return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]), + floorf(f[0])); + case _MM_ROUND_UP: + return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), + ceilf(f[0])); + default: // _MM_ROUND_TOWARD_ZERO + return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1], + (int32_t) f[0]); + } +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed double-precision (64-bit) floating-point elements, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd +FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); +#else + double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + return _mm_set_pd(a1, a0); +#endif +} + +// Copy the lower double-precision (64-bit) floating-point element of a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64 +FORCE_INLINE double _mm_cvtsd_f64(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); +#else + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return _a; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer, and store the result in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32 +FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double ret = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + return (int32_t) ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64 +FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); +#else + __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); + double ret = sse2neon_recast_u64_f64( + vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0)); + return (int64_t) ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x +#define _mm_cvtsd_si64x _mm_cvtsd_si64 + +// Convert the lower double-precision (64-bit) floating-point element in b to a +// single-precision (32-bit) floating-point element, store the result in the +// lower element of dst, and copy the upper 3 packed elements from a to the +// upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss +FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32(vsetq_lane_f32( + vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0), + vreinterpretq_f32_m128(a), 0)); +#else + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return vreinterpretq_m128_f32( + vsetq_lane_f32((float) b0, vreinterpretq_f32_m128(a), 0)); +#endif +} + +// Copy the lower 32-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32 +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) +{ + return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64 +FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) +{ + return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Convert the signed 32-bit integer b to a double-precision (64-bit) +// floating-point element, store the result in the lower element of dst, and +// copy the upper element from a to the upper element of dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
+FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
+#else
+    int64_t _b = sse2neon_recast_f64_s64((double) b);
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Copy 32-bit integer a to the lower elements of dst, and zero the upper
+// elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
+FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
+{
+    return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
+}
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
+FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
+#else
+    int64_t _b = sse2neon_recast_f64_s64((double) b);
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
+FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
+{
+    return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
+}
+
+// Copy 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
+#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
+
+// Convert the signed 64-bit integer b to a double-precision (64-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
+#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
+
+// Convert the lower single-precision (32-bit) floating-point element in b to a
+// double-precision (64-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
+FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
+{
+    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
+#else
+    return vreinterpretq_m128d_s64(vsetq_lane_s64(
+        sse2neon_recast_f64_s64(d), vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
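+// [Editor's note] The cvtt* forms always truncate toward zero regardless of
+// the current rounding mode (illustrative sketch, not upstream code):
+//     _mm_cvttsd_si32(_mm_set_sd(-1.9)); // -1, whereas the rounding
+//                                        // _mm_cvtsd_si32 above yields -2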
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32 +FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) +{ + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0); +} + +// Convert packed double-precision (64-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 +FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) +{ + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1}; + return vreinterpret_m64_s32(vld1_s32(data)); +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 32-bit integers with truncation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32 +FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) +{ + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32 +FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) +{ + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return (int32_t) _a; +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64 +FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); +#else + double _a = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + return (int64_t) _a; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x +#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) + +// Divide packed double-precision (64-bit) floating-point elements in a by +// packed elements in b, and store the results in dst. 
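+// [Editor's note] Division is element-wise (illustrative sketch, not upstream
+// code); note that _mm_set_pd takes the high element first:
+//     __m128d q = _mm_div_pd(_mm_set_pd(9.0, 4.0), _mm_set_pd(3.0, 2.0));
+//     // low lane 4.0 / 2.0 = 2.0, high lane 9.0 / 3.0 = 3.0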
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd +FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[2]; + c[0] = a0 / b0; + c[1] = a1 / b1; + return vld1q_f32((float32_t *) c); +#endif +} + +// Divide the lower double-precision (64-bit) floating-point element in a by the +// lower double-precision (64-bit) floating-point element in b, store the result +// in the lower element of dst, and copy the upper element from a to the upper +// element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd +FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + float64x2_t tmp = + vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)); + return vreinterpretq_m128d_f64( + vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1)); +#else + return _mm_move_sd(a, _mm_div_pd(a, b)); +#endif +} + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16 +// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) +#define _mm_extract_epi16(a, imm) \ + vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16 +// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, +// __constrange(0,8) int imm) +#define _mm_insert_epi16(a, b, imm) \ + vreinterpretq_m128i_s16( \ + vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))) + +// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from memory into dst. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd +FORCE_INLINE __m128d _mm_load_pd(const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vld1q_f64(p)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1 +#define _mm_load_pd1 _mm_load1_pd + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower of dst, and zero the upper element. mem_addr does not need to be +// aligned on any particular boundary. 
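+// [Editor's note] Illustrative sketch (not upstream code):
+//     double x = 3.0;
+//     __m128d v = _mm_load_sd(&x); // v = {3.0, 0.0}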
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd +FORCE_INLINE __m128d _mm_load_sd(const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); +#else + const float *fp = (const float *) p; + float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; + return vreinterpretq_m128d_f32(vld1q_f32(data)); +#endif +} + +// Load 128-bits of integer data from memory into dst. mem_addr must be aligned +// on a 16-byte boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128 +FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) +{ + return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); +} + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd +FORCE_INLINE __m128d _mm_load1_pd(const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p)); +#endif +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// upper element of dst, and copy the lower element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd +FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); +#else + return vreinterpretq_m128d_f32(vcombine_f32( + vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); +#endif +} + +// Load 64-bit integer from memory into the first element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64 +FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) +{ + /* Load the lower 64 bits of the value pointed to by p into the + * lower 64 bits of the result, zeroing the upper 64 bits of the result. + */ + return vreinterpretq_m128i_s32( + vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower element of dst, and copy the upper element from a to dst. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd +FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); +#else + return vreinterpretq_m128d_f32( + vcombine_f32(vld1_f32((const float *) p), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +#endif +} + +// Load 2 double-precision (64-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
+FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    float64x2_t v = vld1q_f64(p);
+    return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
+#else
+    int64x2_t v = vld1q_s64((const int64_t *) p);
+    return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
+#endif
+}
+
+// Load two double-precision (64-bit) floating-point values from unaligned
+// memory into dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
+FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
+{
+    return _mm_load_pd(p);
+}
+
+// Load 128-bits of integer data from memory into dst. mem_addr does not need to
+// be aligned on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const unaligned_int32_t *) p));
+}
+
+// Load unaligned 32-bit integer from memory into the first element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
+FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
+{
+    return vreinterpretq_m128i_s32(
+        vsetq_lane_s32(*(const unaligned_int32_t *) p, vdupq_n_s32(0), 0));
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
+// 32-bit integers, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+{
+    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                              vget_low_s16(vreinterpretq_s16_m128i(b)));
+#if defined(__aarch64__) || defined(_M_ARM64)
+    int32x4_t high =
+        vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
+
+    return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
+#else
+    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                               vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
+    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
+
+    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+#endif
+}
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the corresponding
+// element) and a non-temporal memory hint. mem_addr does not need to be aligned
+// on any particular boundary.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
+FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
+{
+    int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
+    __m128 b = _mm_load_ps((const float *) mem_addr);
+    int8x16_t masked =
+        vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
+                 vreinterpretq_s8_m128(b));
+    vst1q_s8((int8_t *) mem_addr, masked);
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
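+// [Editor's note] Illustrative sketch (not upstream code): paired with
+// _mm_min_epi16 (defined just below), this gives a branchless per-lane clamp
+// to [lo, hi]:
+//     __m128i c = _mm_min_epi16(_mm_max_epi16(v, lo), hi);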
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16 +FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed maximum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8 +FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed maximum values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd +FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_PRECISE_MINMAX + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b)); +#else + return vreinterpretq_m128d_f64( + vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#endif +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + int64_t d[2]; + d[0] = a0 > b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0); + d[1] = a1 > b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1); + + return vreinterpretq_m128d_s64(vld1q_s64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the maximum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd +FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_max_pd(a, b)); +#else + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2] = {a0 > b0 ? a0 : b0, a1}; + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); +#endif +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16 +FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8 +FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed minimum values in dst. 
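+// [Editor's note] On x86, min/max return the second operand when the
+// comparison is unordered, e.g. _mm_min_pd(NaN, x) == x. The
+// SSE2NEON_PRECISE_MINMAX path below reproduces that with a compare-and-
+// select, whereas plain vminq_f64 would propagate the NaN (illustrative
+// observation).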
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd +FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) +#if SSE2NEON_PRECISE_MINMAX + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b)); +#else + return vreinterpretq_m128d_f64( + vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#endif +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + int64_t d[2]; + d[0] = a0 < b0 ? sse2neon_recast_f64_s64(a0) : sse2neon_recast_f64_s64(b0); + d[1] = a1 < b1 ? sse2neon_recast_f64_s64(a1) : sse2neon_recast_f64_s64(b1); + return vreinterpretq_m128d_s64(vld1q_s64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b, store the minimum value in the lower element of dst, and copy the upper +// element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd +FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_min_pd(a, b)); +#else + double a0, a1, b0; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + b0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double c[2] = {a0 < b0 ? a0 : b0, a1}; + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); +#endif +} + +// Copy the lower 64-bit integer in a to the lower element of dst, and zero the +// upper element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64 +FORCE_INLINE __m128i _mm_move_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); +} + +// Move the lower double-precision (64-bit) floating-point element from b to the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd +FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f32( + vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +} + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8 +FORCE_INLINE int _mm_movemask_epi8(__m128i a) +{ + // Use increasingly wide shifts+adds to collect the sign bits + // together. + // Since the widening shifts would be rather confusing to follow in little + // endian, everything will be illustrated in big endian order instead. This + // has a different result - the bits would actually be reversed on a big + // endian machine. + + // Starting input (only half the elements are shown): + // 89 ff 1d c0 00 10 99 33 + uint8x16_t input = vreinterpretq_u8_m128i(a); + + // Shift out everything but the sign bits with an unsigned shift right. 
+    //
+    // Bytes of the vector:
+    // 89 ff 1d c0 00 10 99 33
+    // \  \  \  \  \  \  \  \    high_bits = (uint16x4_t)(input >> 7)
+    //  |  |  |  |  |  |  |  |
+    // 01 01 00 01 00 00 01 00
+    //
+    // Bits of first important lane(s):
+    // 10001001 (89)
+    // \______
+    //        |
+    // 00000001 (01)
+    uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
+
+    // Merge the even lanes together with a 16-bit unsigned shift right + add.
+    // 'xx' represents garbage data which will be ignored in the final result.
+    // In the important bytes, the add functions like a binary OR.
+    //
+    // 01 01 00 01 00 00 01 00
+    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
+    //    \|    \|    \|    \|
+    // xx 03 xx 01 xx 00 xx 02
+    //
+    // 00000001 00000001 (01 01)
+    //        \_______ |
+    //                \|
+    // xxxxxxxx xxxxxx11 (xx 03)
+    uint32x4_t paired16 =
+        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+
+    // Repeat with a wider 32-bit shift + add.
+    // xx 03 xx 01 xx 00 xx 02
+    //     \____ |     \____ |   paired32 = (uint64x1_t)(paired16 + (paired16 >> 14))
+    //          \|          \|
+    // xx xx xx 0d xx xx xx 02
+    //
+    // 00000011 00000001 (03 01)
+    //        \\_____ ||
+    //         '----.\||
+    // xxxxxxxx xxxx1101 (xx 0d)
+    uint64x2_t paired32 =
+        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+
+    // Last, an even wider 64-bit shift + add to get our result in the low 8
+    // bit lanes.
+    // xx xx xx 0d xx xx xx 02
+    //            \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
+    //                      \|
+    // xx xx xx xx xx xx xx d2
+    //
+    // 00001101 00000010 (0d 02)
+    //     \   \___ |  |
+    //      '---.  \|  |
+    // xxxxxxxx 11010010 (xx d2)
+    uint8x16_t paired64 =
+        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+
+    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
+    // xx xx xx xx xx xx xx d2
+    //                      ||  return paired64[0]
+    //                      d2
+    // Note: Little endian would return the correct value 4b (01001011) instead.
+    return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
+}
+
+// Set each bit of mask dst based on the most significant bit of the
+// corresponding packed double-precision (64-bit) floating-point element in a.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
+FORCE_INLINE int _mm_movemask_pd(__m128d a)
+{
+    uint64x2_t input = vreinterpretq_u64_m128d(a);
+    uint64x2_t high_bits = vshrq_n_u64(input, 63);
+    return (int) (vgetq_lane_u64(high_bits, 0) |
+                  (vgetq_lane_u64(high_bits, 1) << 1));
+}
+
+// Copy the lower 64-bit integer in a to dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
+FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
+{
+    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
+}
+
+// Copy the 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
+FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
+}
+
+// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
+// a and b, and store the unsigned 64-bit results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
+FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
+{
+    // vmull_u32 upcasts instead of masking, so we downcast.
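+    // (vmovn_u64 keeps only the low 32 bits of each 64-bit lane, which is
+    // exactly the masking behavior _mm_mul_epu32 requires.)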
+ uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); + uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); + return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); +} + +// Multiply packed double-precision (64-bit) floating-point elements in a and b, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd +FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[2]; + c[0] = a0 * b0; + c[1] = a1 * b1; + return vld1q_f32((float32_t *) c); +#endif +} + +// Multiply the lower double-precision (64-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper element +// from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd +FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_mul_pd(a, b)); +} + +// Multiply the low unsigned 32-bit integers from a and b, and store the +// unsigned 64-bit result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32 +FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) +{ + return vreinterpret_m64_u64(vget_low_u64( + vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); +} + +// Multiply the packed signed 16-bit integers in a and b, producing intermediate +// 32-bit integers, and store the high 16 bits of the intermediate integers in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16 +FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) +{ + /* FIXME: issue with large values because of result saturation */ + // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), + // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return + // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); + int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ + int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. 
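+// Illustrative example:
+//   __m128i r = _mm_mulhi_epu16(_mm_set1_epi16((short) 0xFFFF),
+//                               _mm_set1_epi16((short) 0xFFFF));
+//   // 0xFFFF * 0xFFFF = 0xFFFE0001, so every 16-bit lane of r is 0xFFFE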
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16 +FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) +{ + uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab3210 = vmull_u16(a3210, b3210); +#if defined(__aarch64__) || defined(_M_ARM64) + uint32x4_t ab7654 = + vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); + uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), + vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r); +#else + uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a)); + uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b)); + uint32x4_t ab7654 = vmull_u16(a7654, b7654); + uint16x8x2_t r = + vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); + return vreinterpretq_m128i_u16(r.val[1]); +#endif +} + +// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit +// integers, and store the low 16 bits of the intermediate integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16 +FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compute the bitwise OR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd +FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise OR of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128 +FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Convert packed signed 16-bit integers from a and b to packed 8-bit integers +// using signed saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16 +FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), + vqmovn_s16(vreinterpretq_s16_m128i(b)))); +} + +// Convert packed signed 32-bit integers from a and b to packed 16-bit integers +// using signed saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32 +FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), + vqmovn_s32(vreinterpretq_s32_m128i(b)))); +} + +// Convert packed signed 16-bit integers from a and b to packed 8-bit integers +// using unsigned saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16 +FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) +{ + return vreinterpretq_m128i_u8( + vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), + vqmovun_s16(vreinterpretq_s16_m128i(b)))); +} + +// Pause the processor. 
This is typically used in spin-wait loops and, depending
+// on the x86 processor, typical values are in the 40-100 cycle range. The
+// 'yield' instruction isn't a good fit because it's effectively a nop on most
+// Arm cores. Experience with several databases has shown that an 'isb' is a
+// reasonable approximation.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
+FORCE_INLINE void _mm_pause(void)
+{
+#if defined(_MSC_VER) && !defined(__clang__)
+    __isb(_ARM64_BARRIER_SY);
+#else
+    __asm__ __volatile__("isb\n");
+#endif
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce two
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of 64-bit elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
+FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
+{
+    uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
+    return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
+}
+
+// Set packed 16-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
+FORCE_INLINE __m128i _mm_set_epi16(short i7,
+                                   short i6,
+                                   short i5,
+                                   short i4,
+                                   short i3,
+                                   short i2,
+                                   short i1,
+                                   short i0)
+{
+    int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+    return vreinterpretq_m128i_s16(vld1q_s16(data));
+}
+
+// Set packed 32-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Set packed 64-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
+FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
+{
+    return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0));
+}
+
+// Set packed 64-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
+}
+
+// Set packed 8-bit integers in dst with the supplied values.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
+FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
+                                  signed char b14,
+                                  signed char b13,
+                                  signed char b12,
+                                  signed char b11,
+                                  signed char b10,
+                                  signed char b9,
+                                  signed char b8,
+                                  signed char b7,
+                                  signed char b6,
+                                  signed char b5,
+                                  signed char b4,
+                                  signed char b3,
+                                  signed char b2,
+                                  signed char b1,
+                                  signed char b0)
+{
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0,  (int8_t) b1,  (int8_t) b2,  (int8_t) b3,
+                    (int8_t) b4,  (int8_t) b5,  (int8_t) b6,  (int8_t) b7,
+                    (int8_t) b8,  (int8_t) b9,  (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values.
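+// Illustrative example: _mm_set_pd(1.0, 2.0) places 2.0 in the low element
+// and 1.0 in the high element, i.e. arguments are given high-to-low.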
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd +FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) +{ + double ALIGN_STRUCT(16) data[2] = {e0, e1}; +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); +#else + return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); +#endif +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1 +#define _mm_set_pd1 _mm_set1_pd + +// Copy double-precision (64-bit) floating-point element a to the lower element +// of dst, and zero the upper element. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd +FORCE_INLINE __m128d _mm_set_sd(double a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0)); +#else + return _mm_set_pd(0, a); +#endif +} + +// Broadcast 16-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16 +FORCE_INLINE __m128i _mm_set1_epi16(short w) +{ + return vreinterpretq_m128i_s16(vdupq_n_s16(w)); +} + +// Broadcast 32-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32 +FORCE_INLINE __m128i _mm_set1_epi32(int _i) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); +} + +// Broadcast 64-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64 +FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) +{ + return vreinterpretq_m128i_s64(vdupq_lane_s64(_i, 0)); +} + +// Broadcast 64-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x +FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) +{ + return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); +} + +// Broadcast 8-bit integer a to all elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8 +FORCE_INLINE __m128i _mm_set1_epi8(signed char w) +{ + return vreinterpretq_m128i_s8(vdupq_n_s8(w)); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd +FORCE_INLINE __m128d _mm_set1_pd(double d) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vdupq_n_f64(d)); +#else + int64_t _d = sse2neon_recast_f64_s64(d); + return vreinterpretq_m128d_s64(vdupq_n_s64(_d)); +#endif +} + +// Set packed 16-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16 +FORCE_INLINE __m128i _mm_setr_epi16(short w0, + short w1, + short w2, + short w3, + short w4, + short w5, + short w6, + short w7) +{ + int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; + return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); +} + +// Set packed 32-bit integers in dst with the supplied values in reverse order. 
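+// Illustrative example: _mm_setr_epi32(0, 1, 2, 3) stores the lanes in memory
+// order {0, 1, 2, 3}, the reverse convention of _mm_set_epi32.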
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32 +FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Set packed 64-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64 +FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) +{ + return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); +} + +// Set packed 8-bit integers in dst with the supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8 +FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, + signed char b1, + signed char b2, + signed char b3, + signed char b4, + signed char b5, + signed char b6, + signed char b7, + signed char b8, + signed char b9, + signed char b10, + signed char b11, + signed char b12, + signed char b13, + signed char b14, + signed char b15) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Set packed double-precision (64-bit) floating-point elements in dst with the +// supplied values in reverse order. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd +FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) +{ + return _mm_set_pd(e0, e1); +} + +// Return vector of type __m128d with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd +FORCE_INLINE __m128d _mm_setzero_pd(void) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vdupq_n_f64(0)); +#else + return vreinterpretq_m128d_f32(vdupq_n_f32(0)); +#endif +} + +// Return vector of type __m128i with all elements set to zero. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128 +FORCE_INLINE __m128i _mm_setzero_si128(void) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(0)); +} + +// Shuffle 32-bit integers in a using the control in imm8, and store the results +// in dst. 
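+// Illustrative example:
+//   __m128i v = _mm_setr_epi32(10, 11, 12, 13);
+//   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
+//   // r = {13, 12, 11, 10}: lane i of r is lane ((imm >> 2*i) & 3) of v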
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32 +// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2neon_shuffle) +#define _mm_shuffle_epi32(a, imm) \ + __extension__({ \ + int32x4_t _input = vreinterpretq_s32_m128i(a); \ + int32x4_t _shuf = \ + vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ + vreinterpretq_m128i_s32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_epi32(a, imm) \ + _sse2neon_define1( \ + __m128i, a, __m128i ret; switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_epi_1032(_a); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_epi_2301(_a); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_epi_0321(_a); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_epi_2103(_a); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_shuffle_epi_1010(_a); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_epi_1001(_a); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_epi_0101(_a); \ + break; \ + case _MM_SHUFFLE(2, 2, 1, 1): \ + ret = _mm_shuffle_epi_2211(_a); \ + break; \ + case _MM_SHUFFLE(0, 1, 2, 2): \ + ret = _mm_shuffle_epi_0122(_a); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 2): \ + ret = _mm_shuffle_epi_3332(_a); \ + break; \ + case _MM_SHUFFLE(0, 0, 0, 0): \ + ret = _mm_shuffle_epi32_splat(_a, 0); \ + break; \ + case _MM_SHUFFLE(1, 1, 1, 1): \ + ret = _mm_shuffle_epi32_splat(_a, 1); \ + break; \ + case _MM_SHUFFLE(2, 2, 2, 2): \ + ret = _mm_shuffle_epi32_splat(_a, 2); \ + break; \ + case _MM_SHUFFLE(3, 3, 3, 3): \ + ret = _mm_shuffle_epi32_splat(_a, 3); \ + break; \ + default: \ + ret = _mm_shuffle_epi32_default(_a, (imm)); \ + break; \ + } _sse2neon_return(ret);) +#endif + +// Shuffle double-precision (64-bit) floating-point elements using the control +// in imm8, and store the results in dst. 
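+// Illustrative example: _mm_shuffle_pd(a, b, 1) returns {lo: a[1], hi: b[0]};
+// bit 0 of imm8 selects the low lane from a, bit 1 the high lane from b.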
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd +#ifdef _sse2neon_shuffle +#define _mm_shuffle_pd(a, b, imm8) \ + vreinterpretq_m128d_s64( \ + vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \ + imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2)) +#else +#define _mm_shuffle_pd(a, b, imm8) \ + _mm_castsi128_pd(_mm_set_epi64x( \ + vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ + vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) +#endif + +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2neon_shuffle) +#define _mm_shufflehi_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = \ + vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ + (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ + (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +#endif + +// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, +// __constrange(0,255) int imm) +#if defined(_sse2neon_shuffle) +#define _mm_shufflelo_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = vshuffleq_s16( \ + _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ + (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) +#endif + +// Shift packed 16-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16 +FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16((int16_t) c); + return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); +} + +// Shift packed 32-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32 +FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32((int32_t) c); + return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); +} + +// Shift packed 64-bit integers in a left by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64 +FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64((int64_t) c); + return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); +} + +// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. 
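+// Illustrative example:
+//   __m128i r = _mm_slli_epi16(_mm_set1_epi16(0x0101), 4);
+//   // every 16-bit lane of r is 0x1010; counts outside 0..15 zero the vector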
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16 +FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~15)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s16( + vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm))); +} + +// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32 +FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~31)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s32( + vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); +} + +// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64 +FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) +{ + if (_sse2neon_unlikely(imm & ~63)) + return _mm_setzero_si128(); + return vreinterpretq_m128i_s64( + vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); +} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128 +#define _mm_slli_si128(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int8x16_t ret; \ + if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \ + else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ + else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \ + ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \ + _sse2neon_return(vreinterpretq_m128i_s8(ret));) + +// Compute the square root of packed double-precision (64-bit) floating-point +// elements in a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd +FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a))); +#else + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double _a0 = sqrt(a0); + double _a1 = sqrt(a1); + return _mm_set_pd(_a1, _a0); +#endif +} + +// Compute the square root of the lower double-precision (64-bit) floating-point +// element in b, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd +FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return _mm_move_sd(a, _mm_sqrt_pd(b)); +#else + double _a, _b; + _a = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + _b = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + return _mm_set_pd(_a, sqrt(_b)); +#endif +} + +// Shift packed 16-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. 
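+// Illustrative example: the shift is arithmetic, so the sign is preserved:
+//   __m128i r = _mm_sra_epi16(_mm_set1_epi16(-16), _mm_cvtsi32_si128(2));
+//   // every 16-bit lane of r is -4; counts above 15 replicate the sign bit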
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16 +FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) +{ + int64_t c = vgetq_lane_s64(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_cmplt_epi16(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s16( + vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c))); +} + +// Shift packed 32-bit integers in a right by count while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32 +FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) +{ + int64_t c = vgetq_lane_s64(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_cmplt_epi32(a, _mm_setzero_si128()); + return vreinterpretq_m128i_s32( + vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c))); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in sign +// bits, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16 +FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) +{ + const int count = (imm & ~15) ? 15 : imm; + return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); +} + +// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32 +// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srai_epi32(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) { \ + ret = _a; \ + } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \ + ret = vreinterpretq_m128i_s32( \ + vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \ + } else { \ + ret = vreinterpretq_m128i_s32( \ + vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31)); \ + } _sse2neon_return(ret);) + +// Shift packed 16-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16 +FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16(-(int16_t) c); + return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); +} + +// Shift packed 32-bit integers in a right by count while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32 +FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32(-(int32_t) c); + return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); +} + +// Shift packed 64-bit integers in a right by count while shifting in zeros, and +// store the results in dst. 
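+// Illustrative example: the shift is logical, so zeros are shifted in:
+//   __m128i r = _mm_srl_epi64(_mm_set1_epi64x(-1), _mm_cvtsi32_si128(60));
+//   // both 64-bit lanes of r are 0xF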
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64 +FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (_sse2neon_unlikely(c & ~63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64(-(int64_t) c); + return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); +} + +// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16 +#define _mm_srli_epi16(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u16( \ + vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32 +// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) +#define _mm_srli_epi32(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u32( \ + vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64 +#define _mm_srli_epi64(a, imm) \ + _sse2neon_define0( \ + __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + ret = vreinterpretq_m128i_u64( \ + vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \ + } _sse2neon_return(ret);) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128 +#define _mm_srli_si128(a, imm) \ + _sse2neon_define1( \ + __m128i, a, int8x16_t ret; \ + if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ + else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \ + (imm > 15 ? 0 : imm)); \ + _sse2neon_return(vreinterpretq_m128i_s8(ret));) + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary +// or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd +FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); +#else + vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. 
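+// Illustrative example (note the required 16-byte alignment):
+//   double ALIGN_STRUCT(16) buf[2];
+//   _mm_store_pd1(buf, _mm_set_sd(3.5));  // buf[0] == buf[1] == 3.5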
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1 +FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); + vst1q_f64((float64_t *) mem_addr, + vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); +#else + float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); + vst1q_f32((float32_t *) mem_addr, + vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. mem_addr does not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd +FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a))); +#endif +} + +// Store 128-bits of integer data from a into memory. mem_addr must be aligned +// on a 16-byte boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128 +FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd +#define _mm_store1_pd _mm_store_pd1 + +// Store the upper double-precision (64-bit) floating-point element from a into +// memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd +FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a))); +#endif +} + +// Store 64-bit integer from the first element of a into memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64 +FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) +{ + vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b))); +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd +FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a))); +#endif +} + +// Store 2 double-precision (64-bit) floating-point elements from a into memory +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. 
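+// Illustrative example: after _mm_storer_pd(buf, _mm_set_pd(9.0, 1.0)), buf
+// holds {9.0, 1.0}: the high element is written first, then the low element.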
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd +FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) +{ + float32x4_t f = vreinterpretq_f32_m128d(a); + _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2))); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd +FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) +{ + _mm_store_pd(mem_addr, a); +} + +// Store 128-bits of integer data from a into memory. mem_addr does not need to +// be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128 +FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) +{ + vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); +} + +// Store 32-bit integer from the first element of a into memory. mem_addr does +// not need to be aligned on any particular boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32 +FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a) +{ + vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0); +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory using a non-temporal memory hint. mem_addr must +// be aligned on a 16-byte boundary or a general-protection exception may be +// generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd +FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (__m128d *) p); +#elif defined(__aarch64__) || defined(_M_ARM64) + vst1q_f64(p, vreinterpretq_f64_m128d(a)); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a)); +#endif +} + +// Store 128-bits of integer data from a into memory using a non-temporal memory +// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection +// exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128 +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); +#endif +} + +// Store 32-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32 +FORCE_INLINE void _mm_stream_si32(int *p, int a) +{ + vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0); +} + +// Store 64-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64 +FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a) +{ + vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a)); +} + +// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and +// store the results in dst. 
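+// Illustrative example:
+//   __m128i r = _mm_sub_epi16(_mm_set1_epi16(10), _mm_set1_epi16(3));
+//   // every 16-bit lane of r is 7; overflow wraps around (no saturation)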
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16 +FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32 +FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64 +FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8 +FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract packed double-precision (64-bit) floating-point elements in b from +// packed double-precision (64-bit) floating-point elements in a, and store the +// results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd +FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[2]; + c[0] = a0 - b0; + c[1] = a1 - b1; + return vld1q_f32((float32_t *) c); +#endif +} + +// Subtract the lower double-precision (64-bit) floating-point element in b from +// the lower double-precision (64-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd +FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_sub_pd(a, b)); +} + +// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64 +FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) +{ + return vreinterpret_m64_s64( + vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); +} + +// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a +// using saturation, and store the results in dst. 
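+// Illustrative example: _mm_subs_epi16(_mm_set1_epi16(-32768),
+// _mm_set1_epi16(1)) saturates every lane to -32768 instead of wrapping.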
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16 +FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a +// using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8 +FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit +// integers in a using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16 +FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); +} + +// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit +// integers in a using saturation, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8 +FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +#define _mm_ucomieq_sd _mm_comieq_sd +#define _mm_ucomige_sd _mm_comige_sd +#define _mm_ucomigt_sd _mm_comigt_sd +#define _mm_ucomile_sd _mm_comile_sd +#define _mm_ucomilt_sd _mm_comilt_sd +#define _mm_ucomineq_sd _mm_comineq_sd + +// Return vector of type __m128d with undefined elements. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd +FORCE_INLINE __m128d _mm_undefined_pd(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128d a; +#if defined(_MSC_VER) && !defined(__clang__) + a = _mm_setzero_pd(); +#endif + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +// Unpack and interleave 16-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16 +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16( + vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave 32-bit integers from the high half of a and b, and +// store the results in dst. 
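+// Illustrative example:
+//   __m128i r = _mm_unpackhi_epi32(_mm_setr_epi32(0, 1, 2, 3),
+//                                  _mm_setr_epi32(4, 5, 6, 7));
+//   // r = {2, 6, 3, 7}: the upper halves of a and b, interleaved a-first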
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32 +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s32( + vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave 64-bit integers from the high half of a and b, and +// store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64 +FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s64( + vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); +#endif +} + +// Unpack and interleave 8-bit integers from the high half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8 +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s8( + vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the high half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd +FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)), + vget_high_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Unpack and interleave 16-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16 +FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16( + vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave 32-bit integers from the low half of a and b, and store +// the results in dst. 
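+// Illustrative example: with the same inputs as the _mm_unpackhi_epi32
+// example above, _mm_unpacklo_epi32(a, b) yields {0, 4, 1, 5}.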
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32 +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s32( + vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave 64-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64 +FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s64( + vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); +#endif +} + +// Unpack and interleave 8-bit integers from the low half of a and b, and store +// the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8 +FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s8( + vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd +FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)), + vget_low_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Compute the bitwise XOR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd +FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise XOR of 128 bits (representing integer data) in a and b, +// and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128 +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +/* SSE3 */ + +// Alternatively add and subtract packed double-precision (64-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. 
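+// Illustrative example: _mm_addsub_pd(_mm_set_pd(1.0, 1.0),
+// _mm_set_pd(2.0, 2.0)) yields {lo: 1.0 - 2.0 = -1.0, hi: 1.0 + 2.0 = 3.0}.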
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
+FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
+{
+    _sse2neon_const __m128d mask = _mm_set_pd(1.0, -1.0);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
+                                             vreinterpretq_f64_m128d(b),
+                                             vreinterpretq_f64_m128d(mask)));
+#else
+    return _mm_add_pd(_mm_mul_pd(b, mask), a);
+#endif
+}
+
+// Alternatively add and subtract packed single-precision (32-bit)
+// floating-point elements in a to/from packed elements in b, and store the
+// results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_ps
+FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
+{
+    _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
+#if (defined(__aarch64__) || defined(_M_ARM64)) || \
+    defined(__ARM_FEATURE_FMA) /* VFPv4+ */
+    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
+                                            vreinterpretq_f32_m128(mask),
+                                            vreinterpretq_f32_m128(b)));
+#else
+    return _mm_add_ps(_mm_mul_ps(b, mask), a);
+#endif
+}
+
+// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
+FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_f64(
+        vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    double c[] = {a0 + a1, b0 + b1};
+    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
+// elements in a and b, and pack the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128_f32(
+        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of double-precision (64-bit)
+// floating-point elements in a and b, and pack the results in dst.
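+
+// [Editor's illustration, not part of the upstream sse2neon header] A sketch
+// of the horizontal-add semantics above: adjacent pairs are summed within
+// each operand, with a's sums landing in the low half of dst:
+//
+//   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
+//   __m128 b = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
+//   __m128 h = _mm_hadd_ps(a, b);  // {3.0f, 7.0f, 11.0f, 15.0f}
+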
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd +FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + float64x2_t _a = vreinterpretq_f64_m128d(a); + float64x2_t _b = vreinterpretq_f64_m128d(b); + return vreinterpretq_m128d_f64( + vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b))); +#else + double a0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + double a1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + double b0 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0)); + double b1 = + sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1)); + double c[] = {a0 - a1, b0 - b1}; + return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); +#endif +} + +// Horizontally subtract adjacent pairs of single-precision (32-bit) +// floating-point elements in a and b, and pack the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps +FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) +{ + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b))); +#else + float32x4x2_t c = vuzpq_f32(a, b); + return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1])); +#endif +} + +// Load 128-bits of integer data from unaligned memory into dst. This intrinsic +// may perform better than _mm_loadu_si128 when the data crosses a cache line +// boundary. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128 +#define _mm_lddqu_si128 _mm_loadu_si128 + +// Load a double-precision (64-bit) floating-point element from memory into both +// elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd +#define _mm_loaddup_pd _mm_load1_pd + +// Duplicate the low double-precision (64-bit) floating-point element from a, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd +FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64( + vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_u64( + vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0))); +#endif +} + +// Duplicate odd-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps +FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); +#elif defined(_sse2neon_shuffle) + return vreinterpretq_m128_f32(vshuffleq_s32( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); +#else + float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); + float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +// Duplicate even-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. 
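+
+// [Editor's illustration, not part of the upstream sse2neon header] The
+// duplication intrinsics above copy alternating lanes, a pattern Intel
+// introduced for complex-number arithmetic:
+//
+//   __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
+//   __m128 odd = _mm_movehdup_ps(v);   // {2.0f, 2.0f, 4.0f, 4.0f}
+//   __m128 even = _mm_moveldup_ps(v);  // {1.0f, 1.0f, 3.0f, 3.0f}
+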
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps +FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128_f32( + vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); +#elif defined(_sse2neon_shuffle) + return vreinterpretq_m128_f32(vshuffleq_s32( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); +#else + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); + float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +/* SSSE3 */ + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16 +FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) +{ + return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32 +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8 +FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) +{ + return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16 +FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) +{ + return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32 +FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) +{ + return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8 +FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) +{ + return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); +} + +// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift +// the result right by imm8 bytes, and store the low 16 bytes in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8 +#if defined(__GNUC__) && !defined(__clang__) +#define _mm_alignr_epi8(a, b, imm) \ + __extension__({ \ + uint8x16_t _a = vreinterpretq_u8_m128i(a); \ + uint8x16_t _b = vreinterpretq_u8_m128i(b); \ + __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) \ + ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if (imm >= 16) \ + ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \ + else \ + ret = \ + vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? 
imm : 0)); \ + ret; \ + }) + +#else +#define _mm_alignr_epi8(a, b, imm) \ + _sse2neon_define2( \ + __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \ + uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \ + if (_sse2neon_unlikely((imm) & ~31)) ret = \ + vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + else if (imm >= 16) ret = \ + _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0); \ + else ret = \ + vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0)); \ + _sse2neon_return(ret);) + +#endif + +// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift +// the result right by imm8 bytes, and store the low 8 bytes in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8 +#define _mm_alignr_pi8(a, b, imm) \ + _sse2neon_define2( \ + __m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) { \ + ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ + } else { \ + uint8x8_t tmp_low; \ + uint8x8_t tmp_high; \ + if ((imm) >= 8) { \ + const int idx = (imm) -8; \ + tmp_low = vreinterpret_u8_m64(_a); \ + tmp_high = vdup_n_u8(0); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } else { \ + const int idx = (imm); \ + tmp_low = vreinterpret_u8_m64(_b); \ + tmp_high = vreinterpret_u8_m64(_a); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } \ + } _sse2neon_return(ret);) + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16 +FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); +#else + return vreinterpretq_m128i_s16( + vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)), + vpadd_s16(vget_low_s16(b), vget_high_s16(b)))); +#endif +} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32 +FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s32(vpaddq_s32(a, b)); +#else + return vreinterpretq_m128i_s32( + vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)), + vpadd_s32(vget_low_s32(b), vget_high_s32(b)))); +#endif +} + +// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the +// signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16 +FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the +// signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32 +FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) +{ + return vreinterpret_m64_s32( + vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))); +} + +// Horizontally add adjacent pairs of signed 16-bit integers in a and b using +// saturation, and pack the signed 16-bit results in dst. 
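+
+// [Editor's illustration, not part of the upstream sse2neon header]
+// _mm_alignr_epi8 concatenates a (high) with b (low) and extracts 16 bytes
+// starting at a byte offset:
+//
+//   __m128i a = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23,
+//                             24, 25, 26, 27, 28, 29, 30, 31);
+//   __m128i b = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
+//                             8, 9, 10, 11, 12, 13, 14, 15);
+//   __m128i r = _mm_alignr_epi8(a, b, 4);  // bytes {4, 5, ..., 18, 19}
+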
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16 +FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + return vreinterpretq_s64_s16( + vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); + // Interleave using vshrn/vmovn + // [a0|a2|a4|a6|b0|b2|b4|b6] + // [a1|a3|a5|a7|b1|b3|b5|b7] + int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); + int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); + // Saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357)); +#endif +} + +// Horizontally add adjacent pairs of signed 16-bit integers in a and b using +// saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16 +FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +#else + int16x4x2_t res = vuzp_s16(a, b); + return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack +// the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16 +FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16( + vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int16x8x2_t c = vuzpq_s16(a, b); + return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack +// the signed 32-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32 +FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) +{ + int32x4_t a = vreinterpretq_s32_m128i(_a); + int32x4_t b = vreinterpretq_s32_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s32( + vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b))); +#else + int32x4x2_t c = vuzpq_s32(a, b); + return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack +// the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16 +FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +#else + int16x4x2_t c = vuzp_s16(a, b); + return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack +// the signed 32-bit results in dst. 
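+
+// [Editor's illustration, not part of the upstream sse2neon header] The
+// saturating horizontal adds above clamp instead of wrapping; compare:
+//
+//   __m128i v = _mm_set1_epi16(0x7FFF);
+//   __m128i wrap = _mm_hadd_epi16(v, v);  // 0x7FFF + 0x7FFF wraps to -2
+//   __m128i sat = _mm_hadds_epi16(v, v);  // saturates to 0x7FFF
+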
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32 +FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b) +{ + int32x2_t a = vreinterpret_s32_m64(_a); + int32x2_t b = vreinterpret_s32_m64(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b))); +#else + int32x2x2_t c = vuzp_s32(a, b); + return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16 +FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) +{ + int16x8_t a = vreinterpretq_s16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s16( + vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); +#else + int16x8x2_t c = vuzpq_s16(a, b); + return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1])); +#endif +} + +// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b +// using saturation, and pack the signed 16-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16 +FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b) +{ + int16x4_t a = vreinterpret_s16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); +#else + int16x4x2_t c = vuzp_s16(a, b); + return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1])); +#endif +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. +// Horizontally add adjacent pairs of intermediate signed 16-bit integers, +// and pack the saturated results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16 +FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + uint8x16_t a = vreinterpretq_u8_m128i(_a); + int8x16_t b = vreinterpretq_s8_m128i(_b); + int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vmovl_s8(vget_low_s8(b))); + int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), + vmovl_s8(vget_high_s8(b))); + return vreinterpretq_m128i_s16( + vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); +#else + // This would be much simpler if x86 would choose to zero extend OR sign + // extend, not both. This could probably be optimized better. + uint16x8_t a = vreinterpretq_u16_m128i(_a); + int16x8_t b = vreinterpretq_s16_m128i(_b); + + // Zero extend a + int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8)); + int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00))); + + // Sign extend by shifting left then shifting right. + int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8); + int16x8_t b_odd = vshrq_n_s16(b, 8); + + // multiply + int16x8_t prod1 = vmulq_s16(a_even, b_even); + int16x8_t prod2 = vmulq_s16(a_odd, b_odd); + + // saturated add + return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2)); +#endif +} + +// Vertically multiply each unsigned 8-bit integer from a with the corresponding +// signed 8-bit integer from b, producing intermediate signed 16-bit integers. 
+// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and +// pack the saturated results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16 +FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b) +{ + uint16x4_t a = vreinterpret_u16_m64(_a); + int16x4_t b = vreinterpret_s16_m64(_b); + + // Zero extend a + int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8)); + int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff))); + + // Sign extend by shifting left then shifting right. + int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8); + int16x4_t b_odd = vshr_n_s16(b, 8); + + // multiply + int16x4_t prod1 = vmul_s16(a_even, b_even); + int16x4_t prod2 = vmul_s16(a_odd, b_odd); + + // saturated add + return vreinterpret_m64_s16(vqadd_s16(prod1, prod2)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Shift right by 15 bits while rounding up, and store +// the packed 16-bit integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16 +FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) +{ + // Has issues due to saturation + // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); + + // Multiply + int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), + vget_low_s16(vreinterpretq_s16_m128i(b))); + int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), + vget_high_s16(vreinterpretq_s16_m128i(b))); + + // Rounding narrowing shift right + // narrow = (int16_t)((mul + 16384) >> 15); + int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); + int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); + + // Join together + return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); +} + +// Multiply packed signed 16-bit integers in a and b, producing intermediate +// signed 32-bit integers. Truncate each intermediate integer to the 18 most +// significant bits, round by adding 1, and store bits [16:1] to dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16 +FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b) +{ + int32x4_t mul_extend = + vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b))); + + // Rounding narrowing shift right + return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15)); +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8 +FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) +{ + int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a + uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b + uint8x16_t idx_masked = + vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); +#elif defined(__GNUC__) + int8x16_t ret; + // %e and %f represent the even and odd D registers + // respectively. 
+    __asm__ __volatile__(
+        "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
+        "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
+        : [ret] "=&w"(ret)
+        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
+    return vreinterpretq_m128i_s8(ret);
+#else
+    // Fallback when GNU-style inline assembly is unavailable: split the
+    // table into two 64-bit halves and combine two vtbl2 lookups.
+    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
+                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
+#endif
+}
+
+// Shuffle packed 8-bit integers in a according to shuffle control mask in the
+// corresponding 8-bit element of b, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
+FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
+{
+    const int8x8_t controlMask =
+        vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
+    int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
+    return vreinterpret_m64_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed
+// 16-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
+FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__) || defined(_M_ARM64)
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
+#else
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s16(a) equals negative
+    // 'a') based on ltMask
+    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x8_t res = vbicq_s16(masked, zeroMask);
+    return vreinterpretq_m128i_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed
+// 32-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
+FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__) || defined(_M_ARM64)
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
+#else
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s32(a) equals negative
+    // 'a') based on ltMask
+    int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x4_t res = vbicq_s32(masked, zeroMask);
+    return vreinterpretq_m128i_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed
+// 8-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
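+
+// [Editor's illustration, not part of the upstream sse2neon header]
+// _mm_shuffle_epi8 (implemented above) is a byte-wise table lookup; indices
+// with bit 7 set zero the lane, which the 0x8F masking preserves. Reversing
+// the byte order of a vector v:
+//
+//   const __m128i rev = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
+//                                     7, 6, 5, 4, 3, 2, 1, 0);
+//   __m128i reversed = _mm_shuffle_epi8(v, rev);
+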
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
+FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
+{
+    int8x16_t a = vreinterpretq_s8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__) || defined(_M_ARM64)
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
+#else
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative
+    // 'a') based on ltMask
+    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x16_t res = vbicq_s8(masked, zeroMask);
+
+    return vreinterpretq_m128i_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative, and store the results in dst. Elements in dst are
+// zeroed out when the corresponding element in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
+FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
+
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__) || defined(_M_ARM64)
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
+#else
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s16(a) returns negative
+    // 'a') based on ltMask
+    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x4_t res = vbic_s16(masked, zeroMask);
+
+    return vreinterpret_m64_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed 32-bit
+// integer in b is negative, and store the results in dst. Elements in dst are
+// zeroed out when the corresponding element in b is zero.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
+FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
+{
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__) || defined(_M_ARM64)
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
+#else
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s32(a) returns negative
+    // 'a') based on ltMask
+    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x2_t res = vbic_s32(masked, zeroMask);
+
+    return vreinterpret_m64_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
+// in b is negative, and store the results in dst. Elements in dst are zeroed
+// out when the corresponding element in b is zero.
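+
+// [Editor's illustration, not part of the upstream sse2neon header] The sign
+// family copies, negates, or zeroes each lane of a according to the sign of
+// the matching lane of b:
+//
+//   __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+//   __m128i b = _mm_setr_epi16(-1, 0, 2, -3, 0, 1, -1, 5);
+//   __m128i r = _mm_sign_epi16(a, b);  // {-1, 0, 3, -4, 0, 6, -7, 8}
+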
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
+FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
+{
+    int8x8_t a = vreinterpret_s8_m64(_a);
+    int8x8_t b = vreinterpret_s8_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__) || defined(_M_ARM64)
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
+#else
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s8(a) returns negative
+    // 'a') based on ltMask
+    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x8_t res = vbic_s8(masked, zeroMask);
+
+    return vreinterpret_m64_s8(res);
+}
+
+/* SSE4.1 */
+
+// Blend packed 16-bit integers from a and b using control mask imm8, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
+// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
+//                                      __constrange(0,255) int imm)
+#define _mm_blend_epi16(a, b, imm)                                     \
+    _sse2neon_define2(                                                 \
+        __m128i, a, b,                                                 \
+        const uint16_t _mask[8] =                                      \
+            _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0,   \
+                           ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0,   \
+                           ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0,   \
+                           ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0,   \
+                           ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0,   \
+                           ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0,   \
+                           ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0,   \
+                           ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0);  \
+        uint16x8_t _mask_vec = vld1q_u16(_mask);                       \
+        uint16x8_t __a = vreinterpretq_u16_m128i(_a);                  \
+        uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
+            vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));)
+
+// Blend packed double-precision (64-bit) floating-point elements from a and b
+// using control mask imm8, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
+#define _mm_blend_pd(a, b, imm)                                              \
+    _sse2neon_define2(                                                       \
+        __m128d, a, b,                                                       \
+        const uint64_t _mask[2] =                                            \
+            _sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0),  \
+                           ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \
+        uint64x2_t _mask_vec = vld1q_u64(_mask);                             \
+        uint64x2_t __a = vreinterpretq_u64_m128d(_a);                        \
+        uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return(      \
+            vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));)
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
+FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
+{
+    const uint32_t ALIGN_STRUCT(16)
+        data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 1)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 2)) ? UINT32_MAX : 0,
+                   ((imm8) & (1 << 3)) ? UINT32_MAX : 0};
+    uint32x4_t mask = vld1q_u32(data);
+    float32x4_t a = vreinterpretq_f32_m128(_a);
+    float32x4_t b = vreinterpretq_f32_m128(_b);
+    return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
+}
+
+// Blend packed 8-bit integers from a and b using mask, and store the results in
+// dst.
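+
+// [Editor's illustration, not part of the upstream sse2neon header] In the
+// immediate blends above, bit i of imm8 selects lane i from b when set, and
+// from a when clear:
+//
+//   __m128i a = _mm_set1_epi16(0);
+//   __m128i b = _mm_set1_epi16(-1);
+//   __m128i r = _mm_blend_epi16(a, b, 0x0F);  // low 4 lanes -1, high 4 lanes 0
+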
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8 +FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint8x16_t mask = + vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); + uint8x16_t a = vreinterpretq_u8_m128i(_a); + uint8x16_t b = vreinterpretq_u8_m128i(_b); + return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); +} + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd +FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) +{ + uint64x2_t mask = + vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); +#if defined(__aarch64__) || defined(_M_ARM64) + float64x2_t a = vreinterpretq_f64_m128d(_a); + float64x2_t b = vreinterpretq_f64_m128d(_b); + return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); +#else + uint64x2_t a = vreinterpretq_u64_m128d(_a); + uint64x2_t b = vreinterpretq_u64_m128d(_b); + return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a)); +#endif +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps +FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint32x4_t mask = + vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Round the packed double-precision (64-bit) floating-point elements in a up +// to an integer value, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd +FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); +#else + double a0, a1; + a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)); + a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1)); + return _mm_set_pd(ceil(a1), ceil(a0)); +#endif +} + +// Round the packed single-precision (32-bit) floating-point elements in a up to +// an integer value, and store the results as packed single-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps +FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); +#else + float *f = (float *) &a; + return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0])); +#endif +} + +// Round the lower double-precision (64-bit) floating-point element in b up to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. 
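+
+// [Editor's illustration, not part of the upstream sse2neon header] The
+// variable blends above key off each mask lane's sign bit, so an all-ones
+// comparison result works directly as the selector; e.g. a branch-free
+// per-lane absolute value of x (the _mm_andnot_ps sign-mask idiom is more
+// common, but this shows the selection):
+//
+//   __m128 neg = _mm_cmplt_ps(x, _mm_setzero_ps());  // x < 0 ? ~0 : 0
+//   __m128 ax = _mm_blendv_ps(x, _mm_sub_ps(_mm_setzero_ps(), x), neg);
+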
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
+FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_ceil_pd(b));
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b up to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
+FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_ceil_ps(b));
+}
+
+// Compare packed 64-bit integers in a and b for equality, and store the results
+// in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64
+FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128i_u64(
+        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
+#else
+    // ARMv7 lacks vceqq_u64
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
+FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
+}
+
+// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
+FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
+{
+    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
+FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
+}
+
+// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
+FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_s16(s16x8);
+}
+
+// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
+// the results in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32 +FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_s32(s32x4); +} + +// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit +// integers, and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64 +FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) +{ + int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ + int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ + int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ + int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_s64(s64x2); +} + +// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32 +FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) +{ + return vreinterpretq_m128i_u32( + vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); +} + +// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64 +FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) +{ + uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ + uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ + return vreinterpretq_m128i_u64(u64x2); +} + +// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64 +FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) +{ + return vreinterpretq_m128i_u64( + vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); +} + +// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16 +FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */ + return vreinterpretq_m128i_u16(u16x8); +} + +// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, +// and store the results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32 +FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) +{ + uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ + uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ + uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ + return vreinterpretq_m128i_u32(u32x4); +} + +// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed +// 64-bit integers, and store the results in dst. 
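+
+// [Editor's illustration, not part of the upstream sse2neon header] The
+// cvtep* family widens from the low lanes only; signedness decides how the
+// upper bits are filled:
+//
+//   __m128i v = _mm_set1_epi8(-1);
+//   __m128i s = _mm_cvtepi8_epi32(v);  // sign-extends: four lanes of -1
+//   __m128i u = _mm_cvtepu8_epi32(v);  // zero-extends: four lanes of 255
+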
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
+FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Conditionally multiply the packed double-precision (64-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the two products, and
+// conditionally store the sum in dst using the low 4 bits of imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
+FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
+{
+    // Generate mask value from constant immediate bit value
+    const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
+    const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
+#if !SSE2NEON_PRECISE_DP
+    const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
+    const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
+#endif
+    // Conditional multiplication
+#if !SSE2NEON_PRECISE_DP
+    __m128d mul = _mm_mul_pd(a, b);
+    const __m128d mulMask =
+        _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
+    __m128d tmp = _mm_and_pd(mul, mulMask);
+#else
+#if defined(__aarch64__) || defined(_M_ARM64)
+    double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
+                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
+                             : 0;
+    double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
+                                   vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
+                             : 0;
+#else
+    double a0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 =
+        sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    double d0 = (imm & 0x10) ? a0 * b0 : 0;
+    double d1 = (imm & 0x20) ? a1 * b1 : 0;
+#endif
+    __m128d tmp = _mm_set_pd(d1, d0);
+#endif
+    // Sum the products
+#if defined(__aarch64__) || defined(_M_ARM64)
+    double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
+#else
+    double _tmp0 = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0));
+    double _tmp1 = sse2neon_recast_u64_f64(
+        vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1));
+    double sum = _tmp0 + _tmp1;
+#endif
+    // Conditionally store the sum
+    const __m128d sumMask =
+        _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
+    __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
+    return res;
+}
+
+// Conditionally multiply the packed single-precision (32-bit) floating-point
+// elements in a and b using the high 4 bits in imm8, sum the four products,
+// and conditionally store the sum in dst using the low 4 bits of imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
+FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
+{
+    float32x4_t elementwise_prod = _mm_mul_ps(a, b);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+    /* shortcuts */
+    if (imm == 0xFF) {
+        return _mm_set1_ps(vaddvq_f32(elementwise_prod));
+    }
+
+    if ((imm & 0x0F) == 0x0F) {
+        if (!(imm & (1 << 4)))
+            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0);
+        if (!(imm & (1 << 5)))
+            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1);
+        if (!(imm & (1 << 6)))
+            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2);
+        if (!(imm & (1 << 7)))
+            elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3);
+
+        return _mm_set1_ps(vaddvq_f32(elementwise_prod));
+    }
+#endif
+
+    float s = 0.0f;
+
+    if (imm & (1 << 4))
+        s += vgetq_lane_f32(elementwise_prod, 0);
+    if (imm & (1 << 5))
+        s += vgetq_lane_f32(elementwise_prod, 1);
+    if (imm & (1 << 6))
+        s += vgetq_lane_f32(elementwise_prod, 2);
+    if (imm & (1 << 7))
+        s += vgetq_lane_f32(elementwise_prod, 3);
+
+    const float32_t res[4] = {
+        (imm & 0x1) ? s : 0.0f,
+        (imm & 0x2) ? s : 0.0f,
+        (imm & 0x4) ? s : 0.0f,
+        (imm & 0x8) ? s : 0.0f,
+    };
+    return vreinterpretq_m128_f32(vld1q_f32(res));
+}
+
+// Extract a 32-bit integer from a, selected with imm8, and store the result in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
+// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
+#define _mm_extract_epi32(a, imm) \
+    vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
+
+// Extract a 64-bit integer from a, selected with imm8, and store the result in
+// dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
+// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
+#define _mm_extract_epi64(a, imm) \
+    vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
+
+// Extract an 8-bit integer from a, selected with imm8, and store the result in
+// the lower element of dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
+// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
+#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
+
+// Extract the selected single-precision (32-bit) floating-point element from
+// a, returning its bit pattern as a 32-bit integer.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_ps
+// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
+#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
+
+// Round the packed double-precision (64-bit) floating-point elements in a down
+// to an integer value, and store the results as packed double-precision
+// floating-point elements in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
+FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
+#else
+    double a0, a1;
+    a0 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = sse2neon_recast_u64_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    return _mm_set_pd(floor(a1), floor(a0));
+#endif
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a down
+// to an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
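+
+// [Editor's illustration, not part of the upstream sse2neon header] In
+// _mm_dp_ps (implemented above), the high nibble of imm8 selects which
+// products enter the sum and the low nibble selects which dst lanes
+// receive it:
+//
+//   __m128 d = _mm_dp_ps(a, b, 0xFF);  // 4-lane dot product in every lane
+//   __m128 e = _mm_dp_ps(a, b, 0x71);  // 3-lane dot product in lane 0 only
+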
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps +FORCE_INLINE __m128 _mm_floor_ps(__m128 a) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); +#else + float *f = (float *) &a; + return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0])); +#endif +} + +// Round the lower double-precision (64-bit) floating-point element in b down to +// an integer value, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd +FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_floor_pd(b)); +} + +// Round the lower single-precision (32-bit) floating-point element in b down to +// an integer value, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss +FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_floor_ps(b)); +} + +// Copy a to dst, and insert the 32-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32 +// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, +// __constrange(0,4) int imm) +#define _mm_insert_epi32(a, b, imm) \ + vreinterpretq_m128i_s32( \ + vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))) + +// Copy a to dst, and insert the 64-bit integer i into dst at the location +// specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64 +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, +// __constrange(0,2) int imm) +#define _mm_insert_epi64(a, b, imm) \ + vreinterpretq_m128i_s64( \ + vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))) + +// Copy a to dst, and insert the lower 8-bit integer from i into dst at the +// location specified by imm8. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8 +// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, +// __constrange(0,16) int imm) +#define _mm_insert_epi8(a, b, imm) \ + vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))) + +// Copy a to tmp, then insert a single-precision (32-bit) floating-point +// element from b into tmp using the control in imm8. Store tmp to dst using +// the mask in imm8 (elements are zeroed out when the corresponding bit is set). +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps +#define _mm_insert_ps(a, b, imm8) \ + _sse2neon_define2( \ + __m128, a, b, \ + float32x4_t tmp1 = \ + vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3), \ + vreinterpretq_f32_m128(_a), 0); \ + float32x4_t tmp2 = \ + vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \ + vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \ + const uint32_t data[4] = \ + _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ + ((imm8) & (1 << 3)) ? 
UINT32_MAX : 0);                           \
+        uint32x4_t mask = vld1q_u32(data);                                 \
+        float32x4_t all_zeros = vdupq_n_f32(0);                            \
+                                                                           \
+        _sse2neon_return(vreinterpretq_m128_f32(                           \
+            vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));)
+
+// Compare packed signed 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
+FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
+FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Compare packed signed 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
+FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
+FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32
+FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
+// in a, store the minimum and index in dst, and zero the remaining bits in dst.
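+
+// [Editor's illustration, not part of the upstream sse2neon header]
+// _mm_minpos_epu16 returns the minimum in lane 0 and its (lowest) index in
+// lane 1, with the remaining lanes zeroed:
+//
+//   __m128i v = _mm_setr_epi16(9, 4, 7, 4, 20, 11, 13, 8);
+//   __m128i r = _mm_minpos_epu16(v);  // lane 0 = 4, lane 1 = 1, rest zero
+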
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
+FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
+{
+    __m128i dst;
+    uint16_t min, idx = 0;
+#if defined(__aarch64__) || defined(_M_ARM64)
+    // Find the minimum value
+    min = vminvq_u16(vreinterpretq_u16_m128i(a));
+
+    // Get the index of the minimum value
+    static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint16x8_t minv = vdupq_n_u16(min);
+    uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
+    idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
+#else
+    // Find the minimum value
+    __m64 tmp;
+    tmp = vreinterpret_m64_u16(
+        vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
+                 vget_high_u16(vreinterpretq_u16_m128i(a))));
+    tmp = vreinterpret_m64_u16(
+        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+    tmp = vreinterpret_m64_u16(
+        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
+    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
+    // Get the index of the minimum value
+    int i;
+    for (i = 0; i < 8; i++) {
+        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
+            idx = (uint16_t) i;
+            break;
+        }
+        a = _mm_srli_si128(a, 2);
+    }
+#endif
+    // Generate result
+    dst = _mm_setzero_si128();
+    dst = vreinterpretq_m128i_u16(
+        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
+    dst = vreinterpretq_m128i_u16(
+        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
+    return dst;
+}
+
+// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
+// 8-bit integers in a compared to those in b, and store the 16-bit results in
+// dst. Eight SADs are performed using one quadruplet from b and eight
+// quadruplets from a. One quadruplet is selected from b starting at the offset
+// specified in imm8. Eight quadruplets are formed from sequential 8-bit
+// integers selected from a starting at the offset specified in imm8.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8 +FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) +{ + uint8x16_t _a, _b; + + switch (imm & 0x4) { + case 0: + // do nothing + _a = vreinterpretq_u8_m128i(a); + break; + case 4: + _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a), + vreinterpretq_u32_m128i(a), 1)); + break; + default: +#if defined(__GNUC__) || defined(__clang__) + __builtin_unreachable(); +#elif defined(_MSC_VER) + __assume(0); +#endif + break; + } + + switch (imm & 0x3) { + case 0: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0))); + break; + case 1: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1))); + break; + case 2: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2))); + break; + case 3: + _b = vreinterpretq_u8_u32( + vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3))); + break; + default: +#if defined(__GNUC__) || defined(__clang__) + __builtin_unreachable(); +#elif defined(_MSC_VER) + __assume(0); +#endif + break; + } + + int16x8_t c04, c15, c26, c37; + uint8x8_t low_b = vget_low_u8(_b); + c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b)); + uint8x16_t _a_1 = vextq_u8(_a, _a, 1); + c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b)); + uint8x16_t _a_2 = vextq_u8(_a, _a, 2); + c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b)); + uint8x16_t _a_3 = vextq_u8(_a, _a, 3); + c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b)); +#if defined(__aarch64__) || defined(_M_ARM64) + // |0|4|2|6| + c04 = vpaddq_s16(c04, c26); + // |1|5|3|7| + c15 = vpaddq_s16(c15, c37); + + int32x4_t trn1_c = + vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); + int32x4_t trn2_c = + vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); + return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c), + vreinterpretq_s16_s32(trn2_c))); +#else + int16x4_t c01, c23, c45, c67; + c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15)); + c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37)); + c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15)); + c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37)); + + return vreinterpretq_m128i_s16( + vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67))); +#endif +} + +// Multiply the low signed 32-bit integers from each packed 64-bit element in +// a and b, and store the signed 64-bit results in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32 +FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) +{ + // vmull_s32 upcasts instead of masking, so we downcast. + int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); + int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); +} + +// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit +// integers, and store the low 32 bits of the intermediate integers in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32 +FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Convert packed signed 32-bit integers from a and b to packed 16-bit integers +// using unsigned saturation, and store the results in dst. 
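+// Illustrative example (not part of the original header):
+//   _mm_packus_epi32(_mm_setr_epi32(-5, 0, 70000, 65535),
+//                    _mm_setr_epi32(1, 2, 3, 4))
+// saturates each signed 32-bit value into [0, 65535], yielding the u16
+// elements {0, 0, 65535, 65535, 1, 2, 3, 4}.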
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32 +FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), + vqmovun_s32(vreinterpretq_s32_m128i(b)))); +} + +// Round the packed double-precision (64-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed double-precision +// floating-point elements in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd +FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return _mm_floor_pd(a); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return _mm_ceil_pd(a); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a))); + } +#else + double *v_double = (double *) &a; + + if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { + double res[2], tmp; + for (int i = 0; i < 2; i++) { + tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i]; + double roundDown = floor(tmp); // Round down value + double roundUp = ceil(tmp); // Round up value + double diffDown = tmp - roundDown; + double diffUp = roundUp - tmp; + if (diffDown < diffUp) { + /* If it's closer to the round down value, then use it */ + res[i] = roundDown; + } else if (diffDown > diffUp) { + /* If it's closer to the round up value, then use it */ + res[i] = roundUp; + } else { + /* If it's equidistant between round up and round down value, + * pick the one which is an even number */ + double half = roundDown / 2; + if (half != floor(half)) { + /* If the round down value is odd, return the round up value + */ + res[i] = roundUp; + } else { + /* If the round up value is odd, return the round down value + */ + res[i] = roundDown; + } + } + res[i] = (v_double[i] < 0) ? -res[i] : res[i]; + } + return _mm_set_pd(res[1], res[0]); + } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { + return _mm_floor_pd(a); + } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { + return _mm_ceil_pd(a); + } + return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]), + v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0])); +#endif +} + +// Round the packed single-precision (32-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed single-precision +// floating-point elements in dst. 
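+// Illustrative example (not part of the original header): with
+// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, ties round to even, so
+//   _mm_round_ps(_mm_setr_ps(2.5f, -3.5f, 1.3f, -1.3f), ...)
+// yields {2.0f, -4.0f, 1.0f, -1.0f}; _mm_round_pd above implements the same
+// convention for doubles.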
+// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps +FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) +{ +#if (defined(__aarch64__) || defined(_M_ARM64)) || \ + defined(__ARM_FEATURE_DIRECTED_ROUNDING) + switch (rounding) { + case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); + case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): + return _mm_floor_ps(a); + case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): + return _mm_ceil_ps(a); + case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): + return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); + default: //_MM_FROUND_CUR_DIRECTION + return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); + } +#else + float *v_float = (float *) &a; + + if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = vcvtq_s32_f32( + vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = + vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128_f32( + vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal))); + } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { + return _mm_floor_ps(a); + } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || + (rounding == _MM_FROUND_CUR_DIRECTION && + _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { + return _mm_ceil_ps(a); + } + return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]), + v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]), + v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]), + v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0])); +#endif +} + +// Round the lower double-precision (64-bit) floating-point element in b using +// the rounding parameter, store the result as a double-precision floating-point +// element in the lower element of dst, and copy the upper element from a to the +// upper element of dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd +FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding) +{ + return _mm_move_sd(a, _mm_round_pd(b, rounding)); +} + +// Round the lower single-precision (32-bit) floating-point element in b using +// the rounding parameter, store the result as a single-precision floating-point +// element in the lower element of dst, and copy the upper 3 packed elements +// from a to the upper elements of dst. 
Rounding is done according to the +// rounding[3:0] parameter, which can be one of: +// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and +// suppress exceptions +// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and +// suppress exceptions +// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress +// exceptions +// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress +// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see +// _MM_SET_ROUNDING_MODE +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss +FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding) +{ + return _mm_move_ss(a, _mm_round_ps(b, rounding)); +} + +// Load 128-bits of integer data from memory into dst using a non-temporal +// memory hint. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128 +FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) +{ +#if __has_builtin(__builtin_nontemporal_store) + return __builtin_nontemporal_load(p); +#else + return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); +#endif +} + +// Compute the bitwise NOT of a and then AND with a 128-bit vector containing +// all 1's, and return 1 if the result is zero, otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones +FORCE_INLINE int _mm_test_all_ones(__m128i a) +{ + return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == + ~(uint64_t) 0; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and return 1 if the result is zero, otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros +FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) +{ + int64x2_t a_and_mask = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); + return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute +// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is +// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero +// Note: Argument names may be wrong in the Intel intrinsics guide. +FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) +{ + uint64x2_t v = vreinterpretq_u64_m128i(a); + uint64x2_t m = vreinterpretq_u64_m128i(mask); + + // find ones (set-bits) and zeros (clear-bits) under clip mask + uint64x2_t ones = vandq_u64(m, v); + uint64x2_t zeros = vbicq_u64(m, v); + + // If both 128-bit variables are populated (non-zero) then return 1. + // For comparison purposes, first compact each var down to 32-bits. + uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros)); + + // if folding minimum is non-zero then both vars must be non-zero + return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. 
Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the CF value. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128 +FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, +// otherwise return 0. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128 +#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b) + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the ZF value. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128 +FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +/* SSE4.2 */ + +static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, +}; +static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, +}; + +/* specify the source data format */ +#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */ +#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */ +#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */ +#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */ + +/* specify the comparison operation */ +#define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */ +#define _SIDD_CMP_RANGES 0x04 /* compare ranges */ +#define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */ +#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */ + +/* specify the polarity */ +#define _SIDD_POSITIVE_POLARITY 0x00 +#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 +#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */ +#define _SIDD_MASKED_NEGATIVE_POLARITY \ + 0x30 /* negate results only before end of string */ + +/* specify the output selection in _mm_cmpXstri */ +#define _SIDD_LEAST_SIGNIFICANT 0x00 +#define _SIDD_MOST_SIGNIFICANT 0x40 + +/* specify the output selection in _mm_cmpXstrm */ +#define _SIDD_BIT_MASK 0x00 +#define _SIDD_UNIT_MASK 0x40 + +/* Pattern Matching for C macros. + * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms + */ + +/* catenate */ +#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__ +#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b) + +#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c) +/* run the 2nd parameter */ +#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__ +/* run the 1st parameter */ +#define SSE2NEON_IIF_1(t, ...) 
t + +#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b) +#define SSE2NEON_COMPL_0 1 +#define SSE2NEON_COMPL_1 0 + +#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x) +#define SSE2NEON_DEC_1 0 +#define SSE2NEON_DEC_2 1 +#define SSE2NEON_DEC_3 2 +#define SSE2NEON_DEC_4 3 +#define SSE2NEON_DEC_5 4 +#define SSE2NEON_DEC_6 5 +#define SSE2NEON_DEC_7 6 +#define SSE2NEON_DEC_8 7 +#define SSE2NEON_DEC_9 8 +#define SSE2NEON_DEC_10 9 +#define SSE2NEON_DEC_11 10 +#define SSE2NEON_DEC_12 11 +#define SSE2NEON_DEC_13 12 +#define SSE2NEON_DEC_14 13 +#define SSE2NEON_DEC_15 14 +#define SSE2NEON_DEC_16 15 + +/* detection */ +#define SSE2NEON_CHECK_N(x, n, ...) n +#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, ) +#define SSE2NEON_PROBE(x) x, 1, + +#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x)) +#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~) + +#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x)) +#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c)) + +#define SSE2NEON_EAT(...) +#define SSE2NEON_EXPAND(...) __VA_ARGS__ +#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT) + +/* recursion */ +/* deferred expression */ +#define SSE2NEON_EMPTY() +#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY() +#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)() +#define SSE2NEON_EXPAND(...) __VA_ARGS__ + +#define SSE2NEON_EVAL(...) \ + SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__))) +#define SSE2NEON_EVAL1(...) \ + SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__))) +#define SSE2NEON_EVAL2(...) \ + SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__))) +#define SSE2NEON_EVAL3(...) __VA_ARGS__ + +#define SSE2NEON_REPEAT(count, macro, ...) \ + SSE2NEON_WHEN(count) \ + (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \ + SSE2NEON_DEC(count), macro, \ + __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \ + __VA_ARGS__)) +#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT + +#define SSE2NEON_SIZE_OF_byte 8 +#define SSE2NEON_NUMBER_OF_LANES_byte 16 +#define SSE2NEON_SIZE_OF_word 16 +#define SSE2NEON_NUMBER_OF_LANES_word 8 + +#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \ + mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \ + vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \ + vreinterpretq_##type##_m128i(a))); + +#define SSE2NEON_FILL_LANE(i, type) \ + vec_b[i] = \ + vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)); + +#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \ + number_of_lanes, byte_or_word) \ + do { \ + SSE2NEON_CAT( \ + data_type_prefix, \ + SSE2NEON_CAT(size, \ + SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \ + vec_b[number_of_lanes]; \ + __m128i mask = SSE2NEON_IIF(byte_or_word)( \ + vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \ + vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \ + SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \ + SSE2NEON_CAT(type_prefix, size))) \ + for (int i = 0; i < number_of_lanes; i++) { \ + mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \ + size)(SSE2NEON_CAT(vbslq_u, size)( \ + SSE2NEON_CAT(vreinterpretq_u, \ + SSE2NEON_CAT(size, _m128i))(mask), \ + SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \ + vec_b[i], \ + SSE2NEON_CAT( \ + vreinterpretq_, \ + SSE2NEON_CAT(type_prefix, \ + SSE2NEON_CAT(size, _m128i(a))))), \ + SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \ + vec_b[i], \ + SSE2NEON_CAT( \ + vreinterpretq_, \ + 
SSE2NEON_CAT(type_prefix, \ + SSE2NEON_CAT(size, _m128i(a))))))); \ + } \ + } while (0) + +#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \ + do { \ + SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \ + SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \ + SSE2NEON_CAT(u, size))) \ + } while (0) + +#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \ + static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \ + int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_equal_any_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ + } + +#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \ + static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \ + int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_RANGES( \ + a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_ranges_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ + type))))(la, lb, mtx); \ + } + +#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \ + static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \ + __m128i b, int lb) \ + { \ + __m128i mtx[16]; \ + PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ + return SSE2NEON_CAT( \ + _sse2neon_aggregate_equal_ordered_, \ + SSE2NEON_CAT( \ + SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ + SSE2NEON_CAT(x, \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \ + SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \ + } + +static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); + uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); + uint8x16_t vec = vcombine_u8(t_lo, t_hi); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u8( + vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u8( + vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); + int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0; + res |= (tmp << j); + } + return res; +} + +static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int m = (1 << la) - 1; + uint16x8_t vec = + vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); + for (int j = 0; j < lb; j++) { + mtx[j] = vreinterpretq_m128i_u16( + vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); + mtx[j] = vreinterpretq_m128i_u16( + vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); + int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 
1 : 0;
+        res |= (tmp << j);
+    }
+    return res;
+}
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
+    prefix##IMPL(byte) \
+    prefix##IMPL(word)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
+
+static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint16x8_t vec =
+        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u16(
+            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u16(
+            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
+        __m128i tmp = vreinterpretq_m128i_u32(
+            vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
+        uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
+                                       vreinterpretq_u32_m128i(tmp));
+#if defined(__aarch64__) || defined(_M_ARM64)
+        int t = vaddvq_u32(vec_res) ? 1 : 0;
+#else
+        uint64x2_t sumh = vpaddlq_u32(vec_res);
+        // Normalize to 0/1 like the aarch64 path above: more than one lane of
+        // vec_res may be set, and a raw lane sum would shift into the wrong
+        // bit of res.
+        int t = (vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1)) ? 1 : 0;
+#endif
+        res |= (t << j);
+    }
+    return res;
+}
+
+static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
+{
+    int res = 0;
+    int m = (1 << la) - 1;
+    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
+    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
+    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
+    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
+    for (int j = 0; j < lb; j++) {
+        mtx[j] = vreinterpretq_m128i_u8(
+            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
+        mtx[j] = vreinterpretq_m128i_u8(
+            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
+        __m128i tmp = vreinterpretq_m128i_u16(
+            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
+        uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
+                                       vreinterpretq_u16_m128i(tmp));
+        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
+        res |= (t << j);
+    }
+    return res;
+}
+
+#define SSE2NEON_CMP_RANGES_IS_BYTE 1
+#define SSE2NEON_CMP_RANGES_IS_WORD 0
+
+/* clang-format off */
+#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \
+    prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \
+    prefix##IMPL(byte, int, s, prefix##IS_BYTE) \
+    prefix##IMPL(word, uint, u, prefix##IS_WORD) \
+    prefix##IMPL(word, int, s, prefix##IS_WORD)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)
+
+#undef SSE2NEON_CMP_RANGES_IS_BYTE
+#undef SSE2NEON_CMP_RANGES_IS_WORD
+
+static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
+{
+    uint8x16_t mtx =
+        vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
+    int m0 = (la < lb) ?
0 : ((1 << la) - (1 << lb)); + int m1 = 0x10000 - (1 << la); + int tb = 0x10000 - (1 << lb); + uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi; + uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi; + vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); + vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask); + vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask); + vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask); + vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask); + tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask); + tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask); + + res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx)); + res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx)); + res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo); + res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi); + res_lo = vand_u8(res_lo, vec_mask); + res_hi = vand_u8(res_hi, vec_mask); + + int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8); + return res; +} + +static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb) +{ + uint16x8_t mtx = + vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); + int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb)); + int m1 = 0x100 - (1 << la); + int tb = 0x100 - (1 << lb); + uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b); + uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask); + uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask); + uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask); + mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx); + mtx = vbslq_u16(vec1, tmp, mtx); + mtx = vandq_u16(mtx, vec_mask); + return _sse2neon_vaddvq_u16(mtx); +} + +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1 +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0 + +#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \ + static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \ + int bound, int la, int lb, __m128i mtx[16]) \ + { \ + int res = 0; \ + int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \ + uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \ + vld1_u##size(_sse2neon_cmpestr_mask##size##b), \ + vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \ + uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \ + vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \ + vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \ + vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \ + uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \ + uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \ + for (int j = 0; j < lb; j++) { \ + mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \ + vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \ + } \ + for (int j = lb; j < bound; j++) { \ + mtx[j] = vreinterpretq_m128i_u##size( \ + vbslq_u##size(vec1, vec_minusone, vec_zero)); \ + } \ + unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \ + (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \ + for (int i = 0; i < bound; i++) { \ + int val = 1; \ + for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \ + val &= ptr[k * bound + j]; \ + res += val << i; \ + } \ + return res; \ + } + +/* clang-format off */ +#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \ + prefix##IMPL(8, 16, prefix##IS_UBYTE) \ + prefix##IMPL(16, 8, prefix##IS_UWORD) +/* clang-format on */ + +SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_) + +#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE +#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD + +/* clang-format off */ +#define 
SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
+    prefix##IMPL(byte) \
+    prefix##IMPL(word)
+/* clang-format on */
+
+SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
+
+#define SSE2NEON_CMPESTR_LIST \
+    _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \
+    _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \
+    _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \
+    _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \
+    _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \
+    _(CMP_UWORD_RANGES, cmp_uword_ranges) \
+    _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \
+    _(CMP_SWORD_RANGES, cmp_sword_ranges) \
+    _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \
+    _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \
+    _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \
+    _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \
+    _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
+    _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
+    _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
+    _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)
+
+enum {
+#define _(name, func_suffix) name,
+    SSE2NEON_CMPESTR_LIST
+#undef _
+};
+typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
+static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
+#define _(name, func_suffix) _sse2neon_##func_suffix,
+    SSE2NEON_CMPESTR_LIST
+#undef _
+};
+
+FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
+{
+    switch (imm8 & 0x30) {
+    case _SIDD_NEGATIVE_POLARITY:
+        res ^= 0xffffffff;
+        break;
+    case _SIDD_MASKED_NEGATIVE_POLARITY:
+        res ^= (1 << lb) - 1;
+        break;
+    default:
+        break;
+    }
+
+    return res & ((bound == 8) ? 0xFF : 0xFFFF);
+}
+
+FORCE_INLINE int _sse2neon_clz(unsigned int x)
+{
+#if defined(_MSC_VER) && !defined(__clang__)
+    unsigned long cnt = 0;
+    if (_BitScanReverse(&cnt, x))
+        return 31 - cnt;
+    return 32;
+#else
+    return x != 0 ? __builtin_clz(x) : 32;
+#endif
+}
+
+FORCE_INLINE int _sse2neon_ctz(unsigned int x)
+{
+#if defined(_MSC_VER) && !defined(__clang__)
+    unsigned long cnt = 0;
+    if (_BitScanForward(&cnt, x))
+        return cnt;
+    return 32;
+#else
+    return x != 0 ? __builtin_ctz(x) : 32;
+#endif
+}
+
+FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
+{
+#ifdef _MSC_VER
+    unsigned long cnt;
+#if defined(SSE2NEON_HAS_BITSCAN64)
+    if (_BitScanForward64(&cnt, x))
+        return (int) (cnt);
+#else
+    if (_BitScanForward(&cnt, (unsigned long) (x)))
+        return (int) cnt;
+    if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
+        return (int) (cnt + 32);
+#endif /* SSE2NEON_HAS_BITSCAN64 */
+    return 64;
+#else /* assume GNU compatible compilers */
+    return x != 0 ? __builtin_ctzll(x) : 64;
+#endif
+}
+
+#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)
+
+#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
+    const int var = (imm & 0x01) ? 8 : 16
+
+#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
+    int tmp1 = la ^ (la >> 31); \
+    la = tmp1 - (la >> 31); \
+    int tmp2 = lb ^ (lb >> 31); \
+    lb = tmp2 - (lb >> 31); \
+    la = SSE2NEON_MIN(la, bound); \
+    lb = SSE2NEON_MIN(lb, bound)
+
+// Compare all pairs of characters in strings a and b,
+// then aggregate the result.
+// As the only difference between PCMPESTR* and PCMPISTR* is the way the
+// string lengths are calculated, we use SSE2NEON_CMP{E,I}STRX_LEN_PAIR to get
+// the lengths of strings a and b.
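+// For example (illustrative): imm8 = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH
+// has low nibble 0x08, which indexes CMP_UBYTE_EQUAL_EACH in
+// _sse2neon_cmpfunc_table and so dispatches to _sse2neon_cmp_byte_equal_each
+// (a strcmp-style comparison).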
+#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \ + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \ + SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \ + int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \ + r2 = _sse2neon_sido_negative(r2, lb, imm8, bound) + +#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \ + return (r2 == 0) ? bound \ + : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \ + : _sse2neon_ctz(r2)) + +#define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \ + __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ + if (imm8 & 0x40) { \ + if (bound == 8) { \ + uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \ + vld1q_u16(_sse2neon_cmpestr_mask16b)); \ + dst = vreinterpretq_m128i_u16(vbslq_u16( \ + tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \ + } else { \ + uint8x16_t vec_r2 = \ + vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \ + uint8x16_t tmp = \ + vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \ + dst = vreinterpretq_m128i_u8( \ + vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \ + } \ + } else { \ + if (bound == 16) { \ + dst = vreinterpretq_m128i_u16( \ + vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \ + } else { \ + dst = vreinterpretq_m128i_u8( \ + vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \ + } \ + } \ + return dst + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and returns 1 if b did not contain a null character and the +// resulting mask was zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra +FORCE_INLINE int _mm_cmpestra(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + int lb_cpy = lb; + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return !r2 & (lb_cpy > bound); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc +FORCE_INLINE int _mm_cmpestrc(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return r2 != 0; +} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and store the generated index in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri +FORCE_INLINE int _mm_cmpestri(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8); +} + +// Compare packed strings in a and b with lengths la and lb using the control +// in imm8, and store the generated mask in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm +FORCE_INLINE __m128i +_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + SSE2NEON_CMPSTR_GENERATE_MASK(dst); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns bit 0 of the resulting bit mask. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro +FORCE_INLINE int _mm_cmpestro(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); + return r2 & 1; +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if any character in a was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs +FORCE_INLINE int _mm_cmpestrs(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + (void) a; + (void) b; + (void) lb; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + return la <= (bound - 1); +} + +// Compare packed strings in a and b with lengths la and lb using the control in +// imm8, and returns 1 if any character in b was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz +FORCE_INLINE int _mm_cmpestrz(__m128i a, + int la, + __m128i b, + int lb, + const int imm8) +{ + (void) a; + (void) b; + (void) la; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + return lb <= (bound - 1); +} + +#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \ + do { \ + if (imm8 & 0x01) { \ + uint16x8_t equal_mask_##str = \ + vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \ + uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ + uint64_t matches_##str = \ + vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ + len = _sse2neon_ctzll(matches_##str) >> 3; \ + } else { \ + uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \ + vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \ + uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ + uint64_t matches_##str = \ + vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ + len = _sse2neon_ctzll(matches_##str) >> 2; \ + } \ + } while (0) + +#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \ + int la, lb; \ + do { \ + SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \ + SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \ + } while (0) + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if b did not contain a null character and the resulting +// mask was zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra +FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return !r2 & (lb >= bound); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc +FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return r2 != 0; +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and store the generated index in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri +FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and store the generated mask in dst. 
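+// Illustrative example (not part of the original header): a byte mask of
+// delimiter positions within a 16-byte chunk of text can be built with
+//   __m128i delims = _mm_setr_epi8(' ', ',', ';', 0, 0, 0, 0, 0,
+//                                  0, 0, 0, 0, 0, 0, 0, 0);
+//   __m128i m = _mm_cmpistrm(delims, chunk,
+//                            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
+//                                _SIDD_UNIT_MASK);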
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm +FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + SSE2NEON_CMPSTR_GENERATE_MASK(dst); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns bit 0 of the resulting bit mask. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro +FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8) +{ + SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); + return r2 & 1; +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if any character in a was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs +FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8) +{ + (void) b; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + int la; + SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); + return la <= (bound - 1); +} + +// Compare packed strings with implicit lengths in a and b using the control in +// imm8, and returns 1 if any character in b was null, and 0 otherwise. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz +FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8) +{ + (void) a; + SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); + int lb; + SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); + return lb <= (bound - 1); +} + +// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers +// in b for greater than. +FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) || defined(_M_ARM64) + return vreinterpretq_m128i_u64( + vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + return vreinterpretq_m128i_s64(vshrq_n_s64( + vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)), + 63)); +#endif +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 16-bit integer v, and stores the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16 +FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ + (defined(_M_ARM64) && !defined(__clang__)) + crc = __crc32ch(crc, v); +#else + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 32-bit integer v, and stores the result in dst. 
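+// Note (illustrative, not part of the original header): these intrinsics
+// implement CRC-32C (Castagnoli polynomial 0x1EDC6F41), not the zlib CRC-32.
+// A typical checksum loop is
+//   uint32_t crc = 0xFFFFFFFF;
+//   for (size_t i = 0; i < len; i++)
+//       crc = _mm_crc32_u8(crc, buf[i]);
+//   crc = ~crc;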
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
+FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
+    (defined(_M_ARM64) && !defined(__clang__))
+    crc = __crc32cw(crc, v);
+#else
+    crc = _mm_crc32_u16(crc, v & 0xffff);
+    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 64-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
+FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif (defined(_M_ARM64) && !defined(__clang__))
+    crc = __crc32cd((uint32_t) crc, v);
+#else
+    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
+    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
+#endif
+    return crc;
+}
+
+// Starting with the initial value in crc, accumulates a CRC32 value for
+// unsigned 8-bit integer v, and stores the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
+FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
+{
+#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
+    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
+                         : [c] "+r"(crc)
+                         : [v] "r"(v));
+#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
+    (defined(_M_ARM64) && !defined(__clang__))
+    crc = __crc32cb(crc, v);
+#else
+    crc ^= v;
+#if defined(__ARM_FEATURE_CRYPTO)
+    // Adapted from: https://mary.rs/lab/crc32/
+    // Barrett reduction
+    uint64x2_t orig =
+        vcombine_u64(vcreate_u64((uint64_t) (crc) << 24), vcreate_u64(0x0));
+    uint64x2_t tmp = orig;
+
+    // Polynomial P(x) of CRC32C
+    uint64_t p = 0x105EC76F1;
+    // Barrett Reduction (in bit-reflected form) constant mu_{64} = \lfloor
+    // 2^{64} / P(x) \rfloor = 0x11f91caf6
+    uint64_t mu = 0x1dea713f1;
+
+    // Multiply by mu_{64}
+    tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(mu));
+    // Divide by 2^{64} (mask away the unnecessary bits)
+    tmp =
+        vandq_u64(tmp, vcombine_u64(vcreate_u64(0xFFFFFFFF), vcreate_u64(0x0)));
+    // Multiply by P(x) (shifted left by 1 for alignment reasons)
+    tmp = _sse2neon_vmull_p64(vget_low_u64(tmp), vcreate_u64(p));
+    // Subtract original from result
+    tmp = veorq_u64(tmp, orig);
+
+    // Extract the 'lower' (in bit-reflected sense) 32 bits
+    crc = vgetq_lane_u32(vreinterpretq_u32_u64(tmp), 1);
+#else // Fall back to the generic table lookup approach
+    // Adapted from: https://create.stephan-brumme.com/crc32/
+    // Apply the half-byte algorithm for the best ratio between
+    // performance and lookup-table size.
+
+    // The lookup table just needs to store every 16th entry
+    // of the standard look-up table.
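+    // In other words, each byte is consumed in two 4-bit steps of the form
+    //   crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F];
+    // trading the usual 1 KiB byte-wise table for a 64-byte one at the cost
+    // of one extra lookup per byte.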
+ static const uint32_t crc32_half_byte_tbl[] = { + 0x00000000, 0x105ec76f, 0x20bd8ede, 0x30e349b1, 0x417b1dbc, 0x5125dad3, + 0x61c69362, 0x7198540d, 0x82f63b78, 0x92a8fc17, 0xa24bb5a6, 0xb21572c9, + 0xc38d26c4, 0xd3d3e1ab, 0xe330a81a, 0xf36e6f75, + }; + + crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F]; + crc = (crc >> 4) ^ crc32_half_byte_tbl[crc & 0x0F]; +#endif +#endif + return crc; +} + +/* AES */ + +#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__)) +/* clang-format off */ +#define SSE2NEON_AES_SBOX(w) \ + { \ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ + w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ + w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ + w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ + w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ + w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ + w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ + w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ + w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ + w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ + w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ + w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ + w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ + w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ + w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ + w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ + w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ + w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ + w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ + w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ + w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ + w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ + w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ + w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ + w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ + w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ + w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ + w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ + w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ + w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ + w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ + w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ + w(0xb0), w(0x54), w(0xbb), w(0x16) \ + } +#define SSE2NEON_AES_RSBOX(w) \ + { \ + w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \ + w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \ + w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \ + w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \ + w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \ + w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \ + w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \ + w(0x2e), w(0xa1), w(0x66), 
w(0x28), w(0xd9), w(0x24), w(0xb2), \
+            w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
+            w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
+            w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
+            w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
+            w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
+            w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
+            w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
+            w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
+            w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
+            w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
+            w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
+            w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
+            w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
+            w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
+            w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
+            w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
+            w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
+            w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
+            w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
+            w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
+            w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
+            w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
+            w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
+            w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
+            w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
+            w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
+            w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
+            w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
+            w(0x55), w(0x21), w(0x0c), w(0x7d) \
+    }
+/* clang-format on */
+
+/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
+#define SSE2NEON_AES_H0(x) (x)
+static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
+static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
+#undef SSE2NEON_AES_H0
+
+/* x_time function and matrix multiply function */
+#if !defined(__aarch64__) && !defined(_M_ARM64)
+#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
+#define SSE2NEON_MULTIPLY(x, y) \
+    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \
+     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \
+     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
+     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
+#endif
+
+// In the absence of crypto extensions, implement aesenc using regular NEON
+// intrinsics instead. See:
+// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
+// https://www.workofard.com/2017/07/ghash-for-low-end-cores/
+// for more information.
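+//
+// Illustrative use (not part of the original header): one full AES-128 block
+// encryption with pre-expanded round keys rk[0..10] looks like
+//   __m128i s = _mm_xor_si128(block, rk[0]);
+//   for (int r = 1; r < 10; r++)
+//       s = _mm_aesenc_si128(s, rk[r]);
+//   s = _mm_aesenclast_si128(s, rk[10]);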
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    static const uint8_t shift_rows[] = {
+        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
+        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
+    };
+    static const uint8_t ror32by8[] = {
+        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
+        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
+    };
+
+    uint8x16_t v;
+    uint8x16_t w = vreinterpretq_u8_m128i(a);
+
+    /* shift rows */
+    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
+
+    /* sub bytes */
+    // Here, we split the whole 256-byte table into four 64-byte tables and
+    // look each of them up in turn. Indices that fall into a later table are
+    // larger than that table's base offset, so the index passed to
+    // `vqtbx4q_u8()` has the table's offset (0x40, 0x80, 0xc0) subtracted
+    // before the lookup.
+    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
+    // 'w-0x40' is equivalent to 'vsubq_u8(w, vdupq_n_u8(0x40))'
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
+    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);
+
+    /* mix columns */
+    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
+    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
+    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
+
+    /* add round key */
+    return vreinterpretq_m128i_u8(w) ^ RoundKey;
+
+#else /* ARMv7-A implementation for a table-based AES */
+#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
+    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
+     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
+// multiplying 'x' by 2 in GF(2^8)
+#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
+// multiplying 'x' by 3 in GF(2^8)
+#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
+#define SSE2NEON_AES_U0(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
+#define SSE2NEON_AES_U1(p) \
+    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
+#define SSE2NEON_AES_U2(p) \
+    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
+#define SSE2NEON_AES_U3(p) \
+    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
+
+    // This generates a table containing every possible permutation of
+    // shift_rows() and sub_bytes() with mix_columns().
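+    // (A classic "T-table" AES: aes_table[k][x] is MixColumns applied to
+    // sub_bytes(x) placed in byte position k, so each output word below
+    // reduces to four table lookups XORed together.)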
+ static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = { + SSE2NEON_AES_SBOX(SSE2NEON_AES_U0), + SSE2NEON_AES_SBOX(SSE2NEON_AES_U1), + SSE2NEON_AES_SBOX(SSE2NEON_AES_U2), + SSE2NEON_AES_SBOX(SSE2NEON_AES_U3), + }; +#undef SSE2NEON_AES_B2W +#undef SSE2NEON_AES_F2 +#undef SSE2NEON_AES_F3 +#undef SSE2NEON_AES_U0 +#undef SSE2NEON_AES_U1 +#undef SSE2NEON_AES_U2 +#undef SSE2NEON_AES_U3 + + uint32_t x0 = _mm_cvtsi128_si32(a); // get a[31:0] + uint32_t x1 = + _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); // get a[63:32] + uint32_t x2 = + _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA)); // get a[95:64] + uint32_t x3 = + _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); // get a[127:96] + + // finish the modulo addition step in mix_columns() + __m128i out = _mm_set_epi32( + (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^ + aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]), + (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^ + aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]), + (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^ + aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]), + (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^ + aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24])); + + return _mm_xor_si128(out, RoundKey); +#endif +} + +// Perform one round of an AES decryption flow on data (state) in a using the +// round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 +FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t inv_shift_rows[] = { + 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb, + 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3, + }; + static const uint8_t ror32by8[] = { + 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // inverse shift rows + w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); + + // inverse sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); + + // inverse mix columns + // multiplying 'v' by 4 in GF(2^8) + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); + w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b); + v ^= w; + v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w); + + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & + 0x1b); // multiplying 'v' by 2 in GF(2^8) + w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + + // add round key + return vreinterpretq_m128i_u8(w) ^ RoundKey; + +#else /* ARMv7-A NEON implementation */ + /* FIXME: optimized for NEON */ + uint8_t i, e, f, g, h, v[4][4]; + uint8_t *_a = (uint8_t *) &a; + for (i = 0; i < 16; ++i) { + v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]]; + } + + // inverse mix columns + for (i = 0; i < 4; ++i) { + e = v[i][0]; + f = v[i][1]; + g = v[i][2]; + h = v[i][3]; + + v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^ + SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09); + v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^ + SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d); + v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 
0x09) ^ + SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b); + v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^ + SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e); + } + + return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey; +#endif +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t shift_rows[] = { + 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, + 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // shift rows + w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + + // sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0); + + // add round key + return vreinterpretq_m128i_u8(v) ^ RoundKey; + +#else /* ARMv7-A implementation */ + uint8_t v[16] = { + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)], + _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)], + }; + + return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey; +#endif +} + +// Perform the last round of an AES decryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 +FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t inv_shift_rows[] = { + 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb, + 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3, + }; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(a); + + // inverse shift rows + w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); + + // inverse sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); + + // add round key + return vreinterpretq_m128i_u8(v) ^ RoundKey; + +#else /* ARMv7-A NEON implementation */ + /* FIXME: optimized for NEON */ + uint8_t v[4][4]; + uint8_t *_a = (uint8_t *) &a; + for (int i = 0; i < 16; ++i) { + v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]]; + } + + return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey; +#endif +} + +// Perform the InvMixColumns transformation on a and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 +FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) +{ +#if defined(__aarch64__) + static const uint8_t ror32by8[] = { + 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, + }; + uint8x16_t v = vreinterpretq_u8_m128i(a); + uint8x16_t w; + + // multiplying 'v' by 4 in GF(2^8) + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); + w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b); + v ^= w; + v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w); + + // multiplying 'v' by 2 in GF(2^8) + w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); + w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + return vreinterpretq_m128i_u8(w); + +#else /* ARMv7-A NEON implementation */ + uint8_t i, e, f, g, h, v[4][4]; + vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a)); + for (i = 0; i < 4; ++i) { + e = v[i][0]; + f = v[i][1]; + g = v[i][2]; + h = v[i][3]; + + v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^ + SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09); + v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^ + SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d); + v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^ + SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b); + v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^ + SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e); + } + + return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)); +#endif +} + +// Assist in expanding the AES cipher key by computing steps towards generating +// a round key for encryption cipher using data from a and an 8-bit round +// constant specified in imm8, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 +// +// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. +// This instruction generates a round key for AES encryption. See +// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ +// for details. 
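+//
+// A minimal usage sketch (illustrative only, not part of this header): one
+// step of AES-128 key expansion is typically written as
+//   __m128i assist = _mm_aeskeygenassist_si128(prev_round_key, rcon);
+//   assist = _mm_shuffle_epi32(assist, 0xFF); // broadcast rot(sub(X3))^rcon
+// after which 'assist' is XOR-folded with shifted copies of prev_round_key
+// to form the next round key.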
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ +#if defined(__aarch64__) + uint8x16_t _a = vreinterpretq_u8_m128i(a); + uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0); + + uint32x4_t v_u32 = vreinterpretq_u32_u8(v); + uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24)); + uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon)); + + return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v)); + +#else /* ARMv7-A NEON implementation */ + uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); + uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); + for (int i = 0; i < 4; ++i) { + ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]]; + ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]]; + } + return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, + ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); +#endif +} +#undef SSE2NEON_AES_SBOX +#undef SSE2NEON_AES_RSBOX + +#if defined(__aarch64__) +#undef SSE2NEON_XT +#undef SSE2NEON_MULTIPLY +#endif + +#else /* __ARM_FEATURE_CRYPTO */ +// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and +// AESMC and then manually applying the real key as an xor operation. This +// unfortunately means an additional xor op; the compiler should be able to +// optimize this away for repeated calls however. See +// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a +// for more details. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8(veorq_u8( + vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + vreinterpretq_u8_m128i(b))); +} + +// Perform one round of an AES decryption flow on data (state) in a using the +// round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 +FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) +{ + return vreinterpretq_m128i_u8(veorq_u8( + vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + vreinterpretq_u8_m128i(RoundKey))); +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( + vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + RoundKey); +} + +// Perform the last round of an AES decryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 +FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) +{ + return vreinterpretq_m128i_u8( + veorq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)), + vreinterpretq_u8_m128i(RoundKey))); +} + +// Perform the InvMixColumns transformation on a and store the result in dst. 
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
+FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
+{
+    return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
+}
+
+// Assist in expanding the AES cipher key by computing steps towards generating
+// a round key for encryption cipher using data from a and an 8-bit round
+// constant specified in imm8, and store the result in dst.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
+FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
+{
+    // AESE does ShiftRows and SubBytes on A
+    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
+
+#if !defined(_MSC_VER) || defined(__clang__)
+    uint8x16_t dest = {
+        // Undo ShiftRows step from AESE and extract X1 and X3
+        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
+        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
+        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
+        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
+    };
+    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
+    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
+#else
+    // We have to do this hack because MSVC is strictly adhering to the CPP
+    // standard, in particular C++03 8.5.1 sub-section 15, which states that
+    // unions must be initialized by their first member type.
+
+    // As per the Windows ARM64 ABI, it is always little endian, so this works
+    __n128 dest{
+        ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) |
+            ((uint64_t) u8.n128_u8[0xE] << 16) |
+            ((uint64_t) u8.n128_u8[0xB] << 24) |
+            ((uint64_t) u8.n128_u8[0x1] << 32) |
+            ((uint64_t) u8.n128_u8[0xE] << 40) |
+            ((uint64_t) u8.n128_u8[0xB] << 48) |
+            ((uint64_t) u8.n128_u8[0x4] << 56),
+        ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) |
+            ((uint64_t) u8.n128_u8[0x6] << 16) |
+            ((uint64_t) u8.n128_u8[0x3] << 24) |
+            ((uint64_t) u8.n128_u8[0x9] << 32) |
+            ((uint64_t) u8.n128_u8[0x6] << 40) |
+            ((uint64_t) u8.n128_u8[0x3] << 48) |
+            ((uint64_t) u8.n128_u8[0xC] << 56)};
+
+    dest.n128_u32[1] = dest.n128_u32[1] ^ rcon;
+    dest.n128_u32[3] = dest.n128_u32[3] ^ rcon;
+
+    return dest;
+#endif
+}
+#endif
+
+/* Others */
+
+// Perform a carry-less multiplication of two 64-bit integers, selected from a
+// and b according to imm8, and store the results in dst.
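+// (Bit 0 of imm8 selects the low or high 64 bits of a; bit 4 selects the
+// low or high 64 bits of b, matching the 'imm & 0x11' switch below.)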
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128 +FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) +{ + uint64x2_t a = vreinterpretq_u64_m128i(_a); + uint64x2_t b = vreinterpretq_u64_m128i(_b); + switch (imm & 0x11) { + case 0x00: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); + case 0x01: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); + case 0x10: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); + case 0x11: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); + default: + abort(); + } +} + +FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void) +{ + union { + fpcr_bitfield field; +#if defined(__aarch64__) || defined(_M_ARM64) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) || defined(_M_ARM64) + r.value = _sse2neon_get_fpcr(); +#else + __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF; +} + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) +#if __has_builtin(__builtin_popcount) + return __builtin_popcount(a); +#elif defined(_MSC_VER) + return _CountOneBits(a); +#else + return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); +#endif +#else + uint32_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + + vst1_u32(&count, count32x2_val); + return count; +#endif +} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64 +FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) +{ +#if defined(__aarch64__) || defined(_M_ARM64) +#if __has_builtin(__builtin_popcountll) + return __builtin_popcountll(a); +#elif defined(_MSC_VER) + return _CountOneBits64(a); +#else + return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); +#endif +#else + uint64_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + uint64x1_t count64x1_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + count64x1_val = vpaddl_u32(count32x2_val); + vst1_u64(&count, count64x1_val); + return count; +#endif +} + +FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) +{ + // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, + // regardless of the value of the FZ bit. 
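+    // On AArch64 this maps to FPCR.FZ (bit 24): e.g. calling
+    // _sse2neon_mm_set_denormals_zero_mode(_MM_DENORMALS_ZERO_ON) makes
+    // SIMD arithmetic treat subnormal operands and results as zero.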
+    union {
+        fpcr_bitfield field;
+#if defined(__aarch64__) || defined(_M_ARM64)
+        uint64_t value;
+#else
+        uint32_t value;
+#endif
+    } r;
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+    r.value = _sse2neon_get_fpcr();
+#else
+    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
+#endif
+
+    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+    _sse2neon_set_fpcr(r.value);
+#else
+    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
+#endif
+}
+
+// Return the current 64-bit value of the processor's time-stamp counter.
+// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
+FORCE_INLINE uint64_t _rdtsc(void)
+{
+#if defined(__aarch64__) || defined(_M_ARM64)
+    uint64_t val;
+
+    /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
+     * system counter is at least 56 bits wide; from Armv8.6, the counter
+     * must be 64 bits wide. So the system counter may be narrower than 64
+     * bits, in which case the kernel reports it by setting the
+     * 'cap_user_time_short' flag.
+     */
+#if defined(_MSC_VER) && !defined(__clang__)
+    val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2));
+#else
+    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
+#endif
+
+    return val;
+#else
+    uint32_t pmccntr, pmuseren, pmcntenset;
+    // Read the user mode Performance Monitoring Unit (PMU)
+    // User Enable Register (PMUSERENR) access permissions.
+    __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
+    if (pmuseren & 1) {  // Allows reading PMUSERENR for user mode code.
+        __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
+        if (pmcntenset & 0x80000000UL) {  // Is it counting?
+            __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
+            // The counter is set up to count every 64th cycle
+            return (uint64_t) (pmccntr) << 6;
+        }
+    }
+
+    // Fall back to gettimeofday() as we can't enable PMUSERENR in user mode.
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
+#endif
+}
+
+#if defined(__GNUC__) || defined(__clang__)
+#pragma pop_macro("ALIGN_STRUCT")
+#pragma pop_macro("FORCE_INLINE")
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC pop_options
+#endif
+
+#endif
diff --git a/gdal.cmake b/gdal.cmake
index 6af60831fb01..1799ea709019 100644
--- a/gdal.cmake
+++ b/gdal.cmake
@@ -40,6 +40,13 @@ option(GDAL_OBJECT_LIBRARIES_POSITION_INDEPENDENT_CODE "Set ON to produce -fPIC
 # Option to set preferred C# compiler
 option(CSHARP_MONO "Whether to force the C# compiler to be Mono" OFF)
 
+if (SSE2NEON_COMPILES)
+  option(GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS "Set ON to use ARM Neon FPU optimizations" ON)
+  if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS)
+    message(STATUS "Using ARM Neon optimizations")
+  endif()
+endif()
+
 # This line must be kept early in the CMake instructions. At time of writing,
 # this file is populated only by scripts/install_bash_completions.cmake.in
 install(CODE "file(REMOVE \"${PROJECT_BINARY_DIR}/install_manifest_extra.txt\")")
diff --git a/ogr/ogr_geometry.h b/ogr/ogr_geometry.h
index 7589a90864d4..2e07be78d05e 100644
--- a/ogr/ogr_geometry.h
+++ b/ogr/ogr_geometry.h
@@ -408,9 +408,11 @@ class CPL_DLL OGRGeometry
 
     OGRGeometry();
     OGRGeometry(const OGRGeometry &other);
+    OGRGeometry(OGRGeometry &&other);
     virtual ~OGRGeometry();
 
     OGRGeometry &operator=(const OGRGeometry &other);
+    OGRGeometry &operator=(OGRGeometry &&other);
 
     /** Returns if two geometries are equal.
*/ bool operator==(const OGRGeometry &other) const @@ -1145,9 +1147,13 @@ class CPL_DLL OGRPoint : public OGRGeometry OGRPoint(double x, double y, double z); OGRPoint(double x, double y, double z, double m); OGRPoint(const OGRPoint &other); + /** Move constructor */ + OGRPoint(OGRPoint &&other) = default; static OGRPoint *createXYM(double x, double y, double m); OGRPoint &operator=(const OGRPoint &other); + /** Move assignment operator */ + OGRPoint &operator=(OGRPoint &&other) = default; // IWks Interface size_t WkbSize() const override; @@ -1318,6 +1324,7 @@ class CPL_DLL OGRCurve : public OGRGeometry //! @cond Doxygen_Suppress OGRCurve() = default; OGRCurve(const OGRCurve &other) = default; + OGRCurve(OGRCurve &&other) = default; virtual OGRCurveCasterToLineString GetCasterToLineString() const = 0; virtual OGRCurveCasterToLinearRing GetCasterToLinearRing() const = 0; @@ -1351,6 +1358,7 @@ class CPL_DLL OGRCurve : public OGRGeometry public: //! @cond Doxygen_Suppress OGRCurve &operator=(const OGRCurve &other); + OGRCurve &operator=(OGRCurve &&other) = default; //! @endcond /** Type of child elements. */ @@ -1533,6 +1541,8 @@ class CPL_DLL OGRSimpleCurve : public OGRCurve OGRSimpleCurve(const OGRSimpleCurve &other); + OGRSimpleCurve(OGRSimpleCurve &&other); + private: class CPL_DLL Iterator { @@ -1577,6 +1587,8 @@ class CPL_DLL OGRSimpleCurve : public OGRCurve OGRSimpleCurve &operator=(const OGRSimpleCurve &other); + OGRSimpleCurve &operator=(OGRSimpleCurve &&other); + /** Type of child elements. */ typedef OGRPoint ChildType; @@ -1777,8 +1789,10 @@ class CPL_DLL OGRLineString : public OGRSimpleCurve /** Create an empty line string. */ OGRLineString() = default; OGRLineString(const OGRLineString &other); + OGRLineString(OGRLineString &&other); OGRLineString &operator=(const OGRLineString &other); + OGRLineString &operator=(OGRLineString &&other); virtual OGRLineString *clone() const override; virtual OGRLineString * @@ -1883,9 +1897,13 @@ class CPL_DLL OGRLinearRing : public OGRLineString /** Constructor */ OGRLinearRing() = default; OGRLinearRing(const OGRLinearRing &other); + /** Move constructor*/ + OGRLinearRing(OGRLinearRing &&other) = default; explicit OGRLinearRing(const OGRLinearRing *); OGRLinearRing &operator=(const OGRLinearRing &other); + /** Move assignment operator */ + OGRLinearRing &operator=(OGRLinearRing &&other) = default; // Non standard. virtual const char *getGeometryName() const override; @@ -1966,8 +1984,12 @@ class CPL_DLL OGRCircularString : public OGRSimpleCurve OGRCircularString() = default; OGRCircularString(const OGRCircularString &other); + /** Move constructor */ + OGRCircularString(OGRCircularString &&other) = default; OGRCircularString &operator=(const OGRCircularString &other); + /** Move assignment operator */ + OGRCircularString &operator=(OGRCircularString &&other) = default; // IWks Interface. virtual OGRErr importFromWkb(const unsigned char *, size_t, OGRwkbVariant, @@ -2075,9 +2097,11 @@ class CPL_DLL OGRCurveCollection public: OGRCurveCollection() = default; OGRCurveCollection(const OGRCurveCollection &other); + OGRCurveCollection(OGRCurveCollection &&other); ~OGRCurveCollection(); OGRCurveCollection &operator=(const OGRCurveCollection &other); + OGRCurveCollection &operator=(OGRCurveCollection &&other); /** Type of child elements. 
*/ typedef OGRCurve ChildType; @@ -2208,8 +2232,12 @@ class CPL_DLL OGRCompoundCurve : public OGRCurve OGRCompoundCurve() = default; OGRCompoundCurve(const OGRCompoundCurve &other); + /** Move constructor */ + OGRCompoundCurve(OGRCompoundCurve &&other) = default; OGRCompoundCurve &operator=(const OGRCompoundCurve &other); + /** Move assignment operator */ + OGRCompoundCurve &operator=(OGRCompoundCurve &&other) = default; /** Type of child elements. */ typedef OGRCurve ChildType; @@ -2477,8 +2505,12 @@ class CPL_DLL OGRCurvePolygon : public OGRSurface OGRCurvePolygon() = default; OGRCurvePolygon(const OGRCurvePolygon &); + /** Move constructor */ + OGRCurvePolygon(OGRCurvePolygon &&) = default; OGRCurvePolygon &operator=(const OGRCurvePolygon &other); + /** Move assignment operator */ + OGRCurvePolygon &operator=(OGRCurvePolygon &&other) = default; /** Type of child elements. */ typedef OGRCurve ChildType; @@ -2686,8 +2718,12 @@ class CPL_DLL OGRPolygon : public OGRCurvePolygon OGRPolygon() = default; OGRPolygon(const OGRPolygon &other); + /** Move constructor */ + OGRPolygon(OGRPolygon &&other) = default; OGRPolygon &operator=(const OGRPolygon &other); + /** Move assignment operator */ + OGRPolygon &operator=(OGRPolygon &&other) = default; /** Type of child elements. */ typedef OGRLinearRing ChildType; @@ -2857,8 +2893,12 @@ class CPL_DLL OGRTriangle : public OGRPolygon OGRTriangle() = default; OGRTriangle(const OGRPoint &p, const OGRPoint &q, const OGRPoint &r); OGRTriangle(const OGRTriangle &other); + /** Move constructor */ + OGRTriangle(OGRTriangle &&other) = default; OGRTriangle(const OGRPolygon &other, OGRErr &eErr); OGRTriangle &operator=(const OGRTriangle &other); + /** Move assignment operator */ + OGRTriangle &operator=(OGRTriangle &&other) = default; virtual const char *getGeometryName() const override; virtual OGRwkbGeometryType getGeometryType() const override; @@ -2939,9 +2979,11 @@ class CPL_DLL OGRGeometryCollection : public OGRGeometry OGRGeometryCollection() = default; OGRGeometryCollection(const OGRGeometryCollection &other); + OGRGeometryCollection(OGRGeometryCollection &&other); ~OGRGeometryCollection() override; OGRGeometryCollection &operator=(const OGRGeometryCollection &other); + OGRGeometryCollection &operator=(OGRGeometryCollection &&other); /** Type of child elements. */ typedef OGRGeometry ChildType; @@ -3123,8 +3165,12 @@ class CPL_DLL OGRMultiSurface : public OGRGeometryCollection OGRMultiSurface() = default; OGRMultiSurface(const OGRMultiSurface &other); + /** Move constructor */ + OGRMultiSurface(OGRMultiSurface &&other) = default; OGRMultiSurface &operator=(const OGRMultiSurface &other); + /** Move assignment operator */ + OGRMultiSurface &operator=(OGRMultiSurface &&other) = default; /** Type of child elements. */ typedef OGRSurface ChildType; @@ -3291,8 +3337,12 @@ class CPL_DLL OGRMultiPolygon : public OGRMultiSurface OGRMultiPolygon() = default; OGRMultiPolygon(const OGRMultiPolygon &other); + /** Move constructor */ + OGRMultiPolygon(OGRMultiPolygon &&other) = default; OGRMultiPolygon &operator=(const OGRMultiPolygon &other); + /** Move assignment operator */ + OGRMultiPolygon &operator=(OGRMultiPolygon &&other) = default; /** Type of child elements. 
*/ typedef OGRPolygon ChildType; @@ -3453,9 +3503,13 @@ class CPL_DLL OGRPolyhedralSurface : public OGRSurface /** Create an empty PolyhedralSurface */ OGRPolyhedralSurface() = default; - OGRPolyhedralSurface(const OGRPolyhedralSurface &poGeom); + OGRPolyhedralSurface(const OGRPolyhedralSurface &other); + /** Move constructor */ + OGRPolyhedralSurface(OGRPolyhedralSurface &&other) = default; OGRPolyhedralSurface &operator=(const OGRPolyhedralSurface &other); + /** Move assignment operator */ + OGRPolyhedralSurface &operator=(OGRPolyhedralSurface &&other) = default; /** Type of child elements. */ typedef OGRPolygon ChildType; @@ -3630,6 +3684,12 @@ class CPL_DLL OGRTriangulatedSurface : public OGRPolyhedralSurface OGRTriangulatedSurface() = default; OGRTriangulatedSurface(const OGRTriangulatedSurface &other); + /** Move constructor */ + OGRTriangulatedSurface(OGRTriangulatedSurface &&other) = default; + + OGRTriangulatedSurface &operator=(const OGRTriangulatedSurface &other); + /** Move assignment operator */ + OGRTriangulatedSurface &operator=(OGRTriangulatedSurface &&other) = default; /** Type of child elements. */ typedef OGRTriangle ChildType; @@ -3662,7 +3722,6 @@ class CPL_DLL OGRTriangulatedSurface : public OGRPolyhedralSurface return reinterpret_cast(oMP.end()); } - OGRTriangulatedSurface &operator=(const OGRTriangulatedSurface &other); virtual const char *getGeometryName() const override; virtual OGRwkbGeometryType getGeometryType() const override; virtual OGRTriangulatedSurface *clone() const override; @@ -3765,8 +3824,12 @@ class CPL_DLL OGRMultiPoint : public OGRGeometryCollection OGRMultiPoint() = default; OGRMultiPoint(const OGRMultiPoint &other); + /** Move constructor */ + OGRMultiPoint(OGRMultiPoint &&other) = default; OGRMultiPoint &operator=(const OGRMultiPoint &other); + /** Move assignment operator */ + OGRMultiPoint &operator=(OGRMultiPoint &&other) = default; /** Type of child elements. */ typedef OGRPoint ChildType; @@ -3923,8 +3986,12 @@ class CPL_DLL OGRMultiCurve : public OGRGeometryCollection OGRMultiCurve() = default; OGRMultiCurve(const OGRMultiCurve &other); + /** Move constructor */ + OGRMultiCurve(OGRMultiCurve &&other) = default; OGRMultiCurve &operator=(const OGRMultiCurve &other); + /** Move assignment operator */ + OGRMultiCurve &operator=(OGRMultiCurve &&other) = default; /** Type of child elements. */ typedef OGRCurve ChildType; @@ -4076,8 +4143,12 @@ class CPL_DLL OGRMultiLineString : public OGRMultiCurve OGRMultiLineString() = default; OGRMultiLineString(const OGRMultiLineString &other); + /** Move constructor */ + OGRMultiLineString(OGRMultiLineString &&other) = default; OGRMultiLineString &operator=(const OGRMultiLineString &other); + /** Move assignment operator */ + OGRMultiLineString &operator=(OGRMultiLineString &&other) = default; /** Type of child elements. */ typedef OGRLineString ChildType; diff --git a/ogr/ogr_recordbatch.h b/ogr/ogr_recordbatch.h index 6fcde1405b65..48d1a8d7c874 100644 --- a/ogr/ogr_recordbatch.h +++ b/ogr/ogr_recordbatch.h @@ -19,7 +19,8 @@ // https://github.com/apache/arrow/blob/main/cpp/src/arrow/c/abi.h WARNING: DO // NOT MODIFY the content as it would break interoperability ! -#pragma once +#ifndef OGR_RECORDBATCH_H_INCLUDED +#define OGR_RECORDBATCH_H_INCLUDED /*! @cond Doxygen_Suppress */ @@ -123,3 +124,5 @@ extern "C" #endif /*! 
@endcond */
+
+#endif  // OGR_RECORDBATCH_H_INCLUDED
diff --git a/ogr/ogrcurve.cpp b/ogr/ogrcurve.cpp
index 4e8b7d77f255..260e875571d1 100644
--- a/ogr/ogrcurve.cpp
+++ b/ogr/ogrcurve.cpp
@@ -756,7 +756,7 @@ int OGRCurve::isClockwise() const
     for (int i = 1; i < nPointCount - 1; i++)
     {
         ++oIter;
-        OGRPoint oPointCur = *oIter;
+        const OGRPoint oPointCur = *oIter;
         if (bNextPointIsNextSel)
         {
             oPointNextSel = oPointCur;
diff --git a/ogr/ogrcurvecollection.cpp b/ogr/ogrcurvecollection.cpp
index 83f3ae2e063a..38132dca41e7 100644
--- a/ogr/ogrcurvecollection.cpp
+++ b/ogr/ogrcurvecollection.cpp
@@ -59,6 +59,23 @@ OGRCurveCollection::OGRCurveCollection(const OGRCurveCollection &other)
     }
 }
 
+/************************************************************************/
+/*             OGRCurveCollection( OGRCurveCollection&& )               */
+/************************************************************************/
+
+/**
+ * \brief Move constructor.
+ *
+ * @since GDAL 3.11
+ */
+
+OGRCurveCollection::OGRCurveCollection(OGRCurveCollection &&other)
+    : nCurveCount(other.nCurveCount), papoCurves(other.papoCurves)
+{
+    other.nCurveCount = 0;
+    other.papoCurves = nullptr;
+}
+
 /************************************************************************/
 /*                       ~OGRCurveCollection()                          */
 /************************************************************************/
@@ -107,6 +124,27 @@ OGRCurveCollection::operator=(const OGRCurveCollection &other)
     return *this;
 }
 
+/************************************************************************/
+/*                 operator=( OGRCurveCollection&& )                    */
+/************************************************************************/
+
+/**
+ * \brief Move assignment operator.
+ *
+ * @since GDAL 3.11
+ */
+
+OGRCurveCollection &OGRCurveCollection::operator=(OGRCurveCollection &&other)
+{
+    if (this != &other)
+    {
+        empty(nullptr);
+        std::swap(nCurveCount, other.nCurveCount);
+        std::swap(papoCurves, other.papoCurves);
+    }
+    return *this;
+}
+
 /************************************************************************/
 /*                              WkbSize()                               */
 /************************************************************************/
diff --git a/ogr/ogrgeometry.cpp b/ogr/ogrgeometry.cpp
index e18e1b467526..e7a1cb4c5025 100644
--- a/ogr/ogrgeometry.cpp
+++ b/ogr/ogrgeometry.cpp
@@ -109,6 +109,22 @@ OGRGeometry::OGRGeometry(const OGRGeometry &other)
         const_cast<OGRSpatialReference *>(poSRS)->Reference();
 }
 
+/************************************************************************/
+/*                    OGRGeometry( OGRGeometry&& )                      */
+/************************************************************************/
+
+/**
+ * \brief Move constructor.
+ *
+ * @since GDAL 3.11
+ */
+
+OGRGeometry::OGRGeometry(OGRGeometry &&other)
+    : poSRS(other.poSRS), flags(other.flags)
+{
+    other.poSRS = nullptr;
+}
+
 /************************************************************************/
 /*                           ~OGRGeometry()                             */
 /************************************************************************/
@@ -144,6 +160,27 @@ OGRGeometry &OGRGeometry::operator=(const OGRGeometry &other)
     return *this;
 }
 
+/************************************************************************/
+/*                     operator=( OGRGeometry&& )                       */
+/************************************************************************/
+
+/**
+ * \brief Move assignment operator.
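+ *
+ * Ownership of the spatial reference of the moved-from geometry is
+ * transferred; any spatial reference previously held by this geometry is
+ * released.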
+ *
+ * @since GDAL 3.11
+ */
+
+OGRGeometry &OGRGeometry::operator=(OGRGeometry &&other)
+{
+    if (this != &other)
+    {
+        if (poSRS)
+            const_cast<OGRSpatialReference *>(poSRS)->Release();
+        poSRS = other.poSRS;
+        other.poSRS = nullptr;
+        flags = other.flags;
+    }
+    return *this;
+}
+
 /************************************************************************/
 /*                           dumpReadable()                             */
 /************************************************************************/
diff --git a/ogr/ogrgeometrycollection.cpp b/ogr/ogrgeometrycollection.cpp
index 915dc7f0ca86..f1beb953ad96 100644
--- a/ogr/ogrgeometrycollection.cpp
+++ b/ogr/ogrgeometrycollection.cpp
@@ -57,6 +57,27 @@ OGRGeometryCollection::OGRGeometryCollection(const OGRGeometryCollection &other)
     }
 }
 
+/************************************************************************/
+/*          OGRGeometryCollection( OGRGeometryCollection&& )            */
+/************************************************************************/
+
+/**
+ * \brief Move constructor.
+ *
+ * @since GDAL 3.11
+ */
+
+// cppcheck-suppress-begin accessMoved
+OGRGeometryCollection::OGRGeometryCollection(OGRGeometryCollection &&other)
+    : OGRGeometry(std::move(other)), nGeomCount(other.nGeomCount),
+      papoGeoms(other.papoGeoms)
+{
+    other.nGeomCount = 0;
+    other.papoGeoms = nullptr;
+}
+
+// cppcheck-suppress-end accessMoved
+
 /************************************************************************/
 /*                    ~OGRGeometryCollection()                          */
 /************************************************************************/
@@ -112,6 +133,30 @@ OGRGeometryCollection::operator=(const OGRGeometryCollection &other)
     return *this;
 }
 
+/************************************************************************/
+/*               operator=( OGRGeometryCollection&& )                   */
+/************************************************************************/
+
+/**
+ * \brief Move assignment operator.
+ *
+ * @since GDAL 3.11
+ */
+
+OGRGeometryCollection &
+OGRGeometryCollection::operator=(OGRGeometryCollection &&other)
+{
+    if (this != &other)
+    {
+        empty();
+
+        OGRGeometry::operator=(std::move(other));
+        std::swap(nGeomCount, other.nGeomCount);
+        std::swap(papoGeoms, other.papoGeoms);
+    }
+    return *this;
+}
+
 /************************************************************************/
 /*                              empty()                                 */
 /************************************************************************/
diff --git a/ogr/ogrgeometryfactory.cpp b/ogr/ogrgeometryfactory.cpp
index e68303667da9..dd4d5833d05d 100644
--- a/ogr/ogrgeometryfactory.cpp
+++ b/ogr/ogrgeometryfactory.cpp
@@ -2010,7 +2010,7 @@ OGRGeometry *OGRGeometryFactory::organizePolygons(OGRGeometry **papoPolygons,
                 // If it is outside, then i cannot be inside j.
                 break;
             }
-            previousPoint = point;
+            previousPoint = std::move(point);
         }
         if (!b_i_inside_j && k == nPoints && nPoints > 2)
         {
diff --git a/ogr/ogrlinestring.cpp b/ogr/ogrlinestring.cpp
index 80445fea3b6d..929b570901e0 100644
--- a/ogr/ogrlinestring.cpp
+++ b/ogr/ogrlinestring.cpp
@@ -60,6 +60,31 @@ OGRSimpleCurve::OGRSimpleCurve(const OGRSimpleCurve &other)
     setPoints(other.nPointCount, other.paoPoints, other.padfZ, other.padfM);
 }
 
+/************************************************************************/
+/*                OGRSimpleCurve( OGRSimpleCurve&& )                    */
+/************************************************************************/
+
+/**
+ * \brief Move constructor.
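+ *
+ * The point, Z and M arrays are taken over from \a other, which is left as
+ * an empty curve.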
+ * + * @since GDAL 3.11 + */ + +// cppcheck-suppress-begin accessMoved +OGRSimpleCurve::OGRSimpleCurve(OGRSimpleCurve &&other) + : OGRCurve(std::move(other)), nPointCount(other.nPointCount), + m_nPointCapacity(other.m_nPointCapacity), paoPoints(other.paoPoints), + padfZ(other.padfZ), padfM(other.padfM) +{ + other.nPointCount = 0; + other.m_nPointCapacity = 0; + other.paoPoints = nullptr; + other.padfZ = nullptr; + other.padfM = nullptr; +} + +// cppcheck-suppress-end accessMoved + /************************************************************************/ /* ~OGRSimpleCurve() */ /************************************************************************/ @@ -73,7 +98,7 @@ OGRSimpleCurve::~OGRSimpleCurve() } /************************************************************************/ -/* operator=( const OGRPoint& ) */ +/* operator=(const OGRSimpleCurve &other) */ /************************************************************************/ /** @@ -98,6 +123,43 @@ OGRSimpleCurve &OGRSimpleCurve::operator=(const OGRSimpleCurve &other) return *this; } +/************************************************************************/ +/* operator=(OGRSimpleCurve &&other) */ +/************************************************************************/ + +/** + * \brief Move assignment operator. + * + * @since GDAL 3.11 + */ + +OGRSimpleCurve &OGRSimpleCurve::operator=(OGRSimpleCurve &&other) +{ + if (this != &other) + { + // cppcheck-suppress-begin accessMoved + OGRCurve::operator=(std::move(other)); + + nPointCount = other.nPointCount; + m_nPointCapacity = other.m_nPointCapacity; + CPLFree(paoPoints); + paoPoints = other.paoPoints; + CPLFree(padfZ); + padfZ = other.padfZ; + CPLFree(padfM); + padfM = other.padfM; + flags = other.flags; + other.nPointCount = 0; + other.m_nPointCapacity = 0; + other.paoPoints = nullptr; + other.padfZ = nullptr; + other.padfM = nullptr; + // cppcheck-suppress-end accessMoved + } + + return *this; +} + /************************************************************************/ /* flattenTo2D() */ /************************************************************************/ @@ -2806,6 +2868,18 @@ OGRPointIterator *OGRSimpleCurve::getPointIterator() const OGRLineString::OGRLineString(const OGRLineString &) = default; +/************************************************************************/ +/* OGRLineString( OGRLineString&& ) */ +/************************************************************************/ + +/** + * \brief Move constructor. + * + * @since GDAL 3.11 + */ + +OGRLineString::OGRLineString(OGRLineString &&) = default; + /************************************************************************/ /* operator=( const OGRLineString& ) */ /************************************************************************/ @@ -2828,6 +2902,25 @@ OGRLineString &OGRLineString::operator=(const OGRLineString &other) return *this; } +/************************************************************************/ +/* operator=( OGRLineString&& ) */ +/************************************************************************/ + +/** + * \brief Move assignment operator. 
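+ *
+ * Delegates to OGRSimpleCurve's move assignment; \a other is left as an
+ * empty line string.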
+ *
+ * @since GDAL 3.11
+ */
+
+OGRLineString &OGRLineString::operator=(OGRLineString &&other)
+{
+    if (this != &other)
+    {
+        OGRSimpleCurve::operator=(std::move(other));
+    }
+    return *this;
+}
+
 /************************************************************************/
 /*                          getGeometryType()                           */
 /************************************************************************/
diff --git a/ogr/ogrsf_frmts/CMakeLists.txt b/ogr/ogrsf_frmts/CMakeLists.txt
index d9337012cae0..15fc491e7665 100644
--- a/ogr/ogrsf_frmts/CMakeLists.txt
+++ b/ogr/ogrsf_frmts/CMakeLists.txt
@@ -90,6 +90,8 @@ if( NOT WORDS_BIGENDIAN )
   ogr_optional_driver(miramon "MiraMonVector")
 endif()
 
+ogr_optional_driver(aivector AIVector)
+
 # ######################################################################################################################
 #
 if (NOT OGR_ENABLE_DRIVER_GEOJSON_PLUGIN)
diff --git a/ogr/ogrsf_frmts/aivector/CMakeLists.txt b/ogr/ogrsf_frmts/aivector/CMakeLists.txt
new file mode 100644
index 000000000000..8b0b2ce48dac
--- /dev/null
+++ b/ogr/ogrsf_frmts/aivector/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_gdal_driver(
+  TARGET ogr_AIVector
+  SOURCES
+  ograivectordriver.cpp
+  PLUGIN_CAPABLE
+  NO_DEPS
+  STRONG_CXX_WFLAGS)
+
+gdal_standard_includes(ogr_AIVector)
diff --git a/ogr/ogrsf_frmts/aivector/ograivectordriver.cpp b/ogr/ogrsf_frmts/aivector/ograivectordriver.cpp
new file mode 100644
index 000000000000..6606311fffbc
--- /dev/null
+++ b/ogr/ogrsf_frmts/aivector/ograivectordriver.cpp
@@ -0,0 +1,146 @@
+/******************************************************************************
+ *
+ * Project:  GDAL
+ * Purpose:  Artificial Intelligence powered driver
+ * Author:   Even Rouault,
+ *
+ ******************************************************************************
+ * Copyright (c) 2024, Even Rouault
+ *
+ * SPDX-License-Identifier: MIT
+ ****************************************************************************/
+
+#include "ogrsf_frmts.h"
+
+/************************************************************************/
+/*                        OGRAIVectorIdentify()                         */
+/************************************************************************/
+
+static int OGRAIVectorIdentify(GDALOpenInfo *poOpenInfo)
+{
+    return STARTS_WITH_CI(poOpenInfo->pszFilename, "AIVector:") ||
+           poOpenInfo->IsSingleAllowedDriver("AIVector");
+}
+
+/************************************************************************/
+/*                          OGRAIVectorOpen()                           */
+/************************************************************************/
+
+static GDALDataset *OGRAIVectorOpen(GDALOpenInfo *poOpenInfo)
+{
+    if (!OGRAIVectorIdentify(poOpenInfo))
+        return nullptr;
+
+    class MyLayer final : public OGRLayer,
+                          public OGRGetNextFeatureThroughRaw<MyLayer>
+    {
+        OGRFeatureDefn *m_poLayerDefn = nullptr;
+        bool m_bReturnedFeature = false;
+
+        CPL_DISALLOW_COPY_ASSIGN(MyLayer)
+
+      public:
+        MyLayer()
+        {
+            m_poLayerDefn = new OGRFeatureDefn("result");
+            SetDescription(m_poLayerDefn->GetName());
+            m_poLayerDefn->Reference();
+            OGRFieldDefn oFieldDefn("name", OFTString);
+            m_poLayerDefn->AddFieldDefn(&oFieldDefn);
+            OGRSpatialReference *poSRS = new OGRSpatialReference(
+                "GEOGCS[\"I don't know\",\n"
+                "    DATUM[\"I don't care\",\n"
+                "        SPHEROID[\"GRS 1980\",6378137,298.257222101,\n"
+                "            AUTHORITY[\"EPSG\",\"7019\"]]],\n"
+                "    PRIMEM[\"Greenwich\",0,\n"
+                "        AUTHORITY[\"EPSG\",\"8901\"]],\n"
+                "    UNIT[\"degree\",0.0174532925199433,\n"
+                "        AUTHORITY[\"EPSG\",\"9122\"]],\n"
+                "    AUTHORITY[\"AI\",\"TOTALLY_MADE_UP\"]]");
+            m_poLayerDefn->GetGeomFieldDefn(0)->SetSpatialRef(poSRS);
+            poSRS->Release();
+        }
+
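+        // Note: datasets are opened through any connection string starting
+        // with "AIVector:" (or by explicitly restricting the allowed
+        // drivers to AIVector); the layer then exposes a single hard-coded
+        // feature at Null Island, as returned by GetNextRawFeature() below.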
+        ~MyLayer() override
+        {
+            m_poLayerDefn->Release();
+        }
+
+        void ResetReading() override
+        {
+            m_bReturnedFeature = false;
+        }
+
+        OGRFeatureDefn *GetLayerDefn() override
+        {
+            return m_poLayerDefn;
+        }
+
+        DEFINE_GET_NEXT_FEATURE_THROUGH_RAW(MyLayer)
+
+        OGRFeature *GetNextRawFeature()
+        {
+            if (m_bReturnedFeature)
+                return nullptr;
+            m_bReturnedFeature = true;
+            OGRFeature *poFeature = new OGRFeature(m_poLayerDefn);
+            poFeature->SetFID(0);
+            poFeature->SetField(0, "Null Island: the place to be");
+            OGRPoint *poPoint = new OGRPoint(0, 0);
+            poPoint->assignSpatialReference(GetSpatialRef());
+            poFeature->SetGeometryDirectly(poPoint);
+            return poFeature;
+        }
+
+        int TestCapability(const char *) override
+        {
+            return false;
+        }
+    };
+
+    class MyDataset final : public GDALDataset
+    {
+        MyLayer m_oLayer{};
+
+      public:
+        MyDataset() = default;
+
+        int GetLayerCount() override
+        {
+            return 1;
+        }
+
+        OGRLayer *GetLayer(int idx) override
+        {
+            return idx == 0 ? &m_oLayer : nullptr;
+        }
+    };
+
+    return new MyDataset();
+}
+
+/************************************************************************/
+/*                        RegisterOGRAIVector()                         */
+/************************************************************************/
+
+void RegisterOGRAIVector()
+{
+    if (!GDAL_CHECK_VERSION("AIVector"))
+        return;
+
+    if (GDALGetDriverByName("AIVector") != nullptr)
+        return;
+
+    GDALDriver *poDriver = new GDALDriver();
+    poDriver->SetDescription("AIVector");
+    poDriver->SetMetadataItem(GDAL_DCAP_VECTOR, "YES");
+    poDriver->SetMetadataItem(GDAL_DMD_LONGNAME,
+                              "Artificial Intelligence powered vector driver");
+    poDriver->SetMetadataItem(GDAL_DMD_HELPTOPIC,
+                              "drivers/vector/aivector.html");
+
+    poDriver->SetMetadataItem(GDAL_DMD_CONNECTION_PREFIX, "AIVector:");
+
+    poDriver->pfnOpen = OGRAIVectorOpen;
+    poDriver->pfnIdentify = OGRAIVectorIdentify;
+    GetGDALDriverManager()->RegisterDriver(poDriver);
+}
diff --git a/ogr/ogrsf_frmts/dxf/ogrdxf_leader.cpp b/ogr/ogrsf_frmts/dxf/ogrdxf_leader.cpp
index 4f5ae6ba6f4d..24ca0f768cac 100644
--- a/ogr/ogrsf_frmts/dxf/ogrdxf_leader.cpp
+++ b/ogr/ogrsf_frmts/dxf/ogrdxf_leader.cpp
@@ -1369,7 +1369,7 @@ static void InterpolateSpline(OGRLineString *const poLine,
         aoDataPoints.push_back(
             DXFTriple(oPoint.getX(), oPoint.getY(), oPoint.getZ()));
-        oPrevPoint = oPoint;
+        oPrevPoint = std::move(oPoint);
     }
     nDataPoints = static_cast<int>(aoDataPoints.size());
     if (nDataPoints < 2)
diff --git a/ogr/ogrsf_frmts/generic/ogrregisterall.cpp b/ogr/ogrsf_frmts/generic/ogrregisterall.cpp
index b7d6744daea6..843197c6dac5 100644
--- a/ogr/ogrsf_frmts/generic/ogrregisterall.cpp
+++ b/ogr/ogrsf_frmts/generic/ogrregisterall.cpp
@@ -273,4 +273,9 @@ void OGRRegisterAllInternal()
     RegisterOGRAVCE00();
 #endif
 
+    // Last but not least
+#ifdef AIVECTOR_ENABLED
+    RegisterOGRAIVector();
+#endif
+
 } /* OGRRegisterAll */
diff --git a/ogr/ogrsf_frmts/lvbag/ogrlvbaglayer.cpp b/ogr/ogrsf_frmts/lvbag/ogrlvbaglayer.cpp
index 796174cce9d1..b5550ee3f3f9 100644
--- a/ogr/ogrsf_frmts/lvbag/ogrlvbaglayer.cpp
+++ b/ogr/ogrsf_frmts/lvbag/ogrlvbaglayer.cpp
@@ -638,7 +638,7 @@ void OGRLVBAGLayer::EndElementCbk(const char *pszName)
             poGeom->flattenTo2D();
 #ifdef HAVE_GEOS
-            if (!poGeom->IsValid() && bFixInvalidData)
+            if (bFixInvalidData && !poGeom->IsValid())
             {
                 std::unique_ptr<OGRGeometry> poSubGeom =
                     std::unique_ptr<OGRGeometry>{poGeom->MakeValid()};
diff --git a/ogr/ogrsf_frmts/ogrsf_frmts.h b/ogr/ogrsf_frmts/ogrsf_frmts.h
index 994db366a89c..e7f28b1fb155 100644
--- a/ogr/ogrsf_frmts/ogrsf_frmts.h
+++ b/ogr/ogrsf_frmts/ogrsf_frmts.h
@@ -741,6 +741,7 @@ void CPL_DLL
 RegisterOGRXODR();
 void DeclareDeferredOGRXODRPlugin();
 void CPL_DLL RegisterOGRADBC();
 void DeclareDeferredOGRADBCPlugin();
+void CPL_DLL RegisterOGRAIVector();
 // @endcond
 
 CPL_C_END
diff --git a/ogr/ogrsf_frmts/openfilegdb/ogr_openfilegdb.h b/ogr/ogrsf_frmts/openfilegdb/ogr_openfilegdb.h
index ae2a5f3e3d0f..36c29b23f3a9 100644
--- a/ogr/ogrsf_frmts/openfilegdb/ogr_openfilegdb.h
+++ b/ogr/ogrsf_frmts/openfilegdb/ogr_openfilegdb.h
@@ -28,7 +28,7 @@
 
 using namespace OpenFileGDB;
 
-std::string OFGDBGenerateUUID();
+std::string OFGDBGenerateUUID(bool bInit = false);
 
 int OGROpenFileGDBIsComparisonOp(int op);
diff --git a/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdb_generate_uuid.cpp b/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdb_generate_uuid.cpp
index 4198263416ac..a8aa87b1341e 100644
--- a/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdb_generate_uuid.cpp
+++ b/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdb_generate_uuid.cpp
@@ -54,7 +54,7 @@ static int CPLGettimeofday(struct CPLTimeVal *tp, void * /* timezonep*/)
 // Probably not the best UUID generator ever. One issue is that mt19937
 // uses only a 32-bit seed.
 CPL_NOSANITIZE_UNSIGNED_INT_OVERFLOW
-std::string OFGDBGenerateUUID()
+std::string OFGDBGenerateUUID(bool bInit)
 {
     struct CPLTimeVal tv;
     memset(&tv, 0, sizeof(tv));
@@ -62,57 +62,82 @@ std::string OFGDBGenerateUUID()
     const bool bReproducibleUUID =
         CPLTestBool(CPLGetConfigOption("OPENFILEGDB_REPRODUCIBLE_UUID", "NO"));
 
+    if (bInit)
+    {
+        if (bReproducibleUUID)
+            nCounter = 0;
+        return std::string();
+    }
+
+    uint32_t nCounterLocal = nCounter;
+    // Constants from the example rand() implementation of POSIX.1-2001,
+    // used here for reproducible output.
+    // We have to use that rather than relying on std::mt19937 +
+    // std::uniform_int_distribution since they don't give the same output
+    // from the same seed on all platforms.
+    const auto reproducibleRand = [&nCounterLocal]()
+    {
+        nCounterLocal = nCounterLocal * 1103515245U + 12345U;
+        return (nCounterLocal / 65536U) % 32768U;
+    };
+
     std::stringstream ss;
     {
         if (!bReproducibleUUID)
+        {
             CPLGettimeofday(&tv, nullptr);
-        std::mt19937 gen(++nCounter +
-                         (bReproducibleUUID
-                              ? 0
-                              : static_cast<unsigned>(tv.tv_sec ^ tv.tv_usec)));
+            ++nCounter;
+        }
+        std::mt19937 gen(nCounter +
+                         static_cast<unsigned>(tv.tv_sec ^ tv.tv_usec));
         std::uniform_int_distribution<> dis(0, 15);
 
         ss << "{";
         ss << std::hex;
         for (int i = 0; i < 8; i++)
         {
-            ss << dis(gen);
+            ss << (bReproducibleUUID ? (reproducibleRand() % 16) : dis(gen));
        }
         ss << "-";
         for (int i = 0; i < 4; i++)
         {
-            ss << dis(gen);
+            ss << (bReproducibleUUID ? (reproducibleRand() % 16) : dis(gen));
         }
         ss << "-4";
         for (int i = 0; i < 3; i++)
         {
-            ss << dis(gen);
+            ss << (bReproducibleUUID ? (reproducibleRand() % 16) : dis(gen));
         }
     }
     {
         if (!bReproducibleUUID)
+        {
             CPLGettimeofday(&tv, nullptr);
-        std::mt19937 gen(++nCounter +
-                         (bReproducibleUUID
-                              ? 0
-                              : static_cast<unsigned>(tv.tv_sec ^ tv.tv_usec)));
+            ++nCounter;
+        }
+        std::mt19937 gen(nCounter +
+                         static_cast<unsigned>(tv.tv_sec ^ tv.tv_usec));
         std::uniform_int_distribution<> dis(0, 15);
         std::uniform_int_distribution<> dis2(8, 11);
 
         ss << "-";
-        ss << dis2(gen);
+        ss << (bReproducibleUUID ? 8 : dis2(gen));
         for (int i = 0; i < 3; i++)
         {
-            ss << dis(gen);
+            ss << (bReproducibleUUID ? (reproducibleRand() % 16) : dis(gen));
         }
         ss << "-";
         for (int i = 0; i < 12; i++)
        {
-            ss << dis(gen);
+            ss << (bReproducibleUUID ? (reproducibleRand() % 16) : dis(gen));
         };
         ss << "}";
-        return ss.str();
     }
+
+    if (bReproducibleUUID)
+        nCounter = nCounterLocal;
+
+    return ss.str();
 }
diff --git a/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdbdatasource_write.cpp b/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdbdatasource_write.cpp
index f6e54d500998..27ba3a2dd9f4 100644
--- a/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdbdatasource_write.cpp
+++ b/ogr/ogrsf_frmts/openfilegdb/ogropenfilegdbdatasource_write.cpp
@@ -1299,6 +1299,8 @@ bool OGROpenFileGDBDataSource::Create(const char *pszName)
         return false;
     }
 
+    CPL_IGNORE_RET_VAL(OFGDBGenerateUUID(/* bInit = */ true));
+
     m_osDirName = pszName;
     eAccess = GA_Update;
diff --git a/perftests/CMakeLists.txt b/perftests/CMakeLists.txt
index 4f365f34a326..a103013d3c9b 100644
--- a/perftests/CMakeLists.txt
+++ b/perftests/CMakeLists.txt
@@ -1,5 +1,18 @@
 include(GdalTestTarget)
 
+include(GdalSetRuntimeEnv)
+gdal_set_runtime_env(TEST_ENV)
+
+if (MINGW)
+  list(APPEND TEST_ENV SKIP_MEM_INTENSIVE_TEST=YES)
+endif ()
+
+if (WIN32)
+  # When running GDAL as a CustomBuild command of MSBuild, "ERROR bla:" is
+  # considered as failing the job. This is rarely the intended behavior.
+  list(APPEND TEST_ENV "CPL_ERROR_SEPARATOR=\\;")
+endif ()
+
 gdal_test_target(testperfcopywords testperfcopywords.cpp)
 
 gdal_test_target(testperfdeinterleave testperfdeinterleave.cpp)
@@ -10,3 +23,10 @@ target_link_libraries(bench_ogr_batch PRIVATE $)
+
+gdal_test_target(testperf_gdal_minmax_element testperf_gdal_minmax_element.cpp)
+if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS)
+  target_compile_definitions(testperf_gdal_minmax_element PRIVATE -DUSE_NEON_OPTIMIZATIONS)
+endif()
+add_test(NAME testperf_gdal_minmax_element COMMAND testperf_gdal_minmax_element)
+set_property(TEST testperf_gdal_minmax_element PROPERTY ENVIRONMENT "${TEST_ENV}")
diff --git a/perftests/testperf_gdal_minmax_element.cpp b/perftests/testperf_gdal_minmax_element.cpp
new file mode 100644
index 000000000000..2e2a0d2f408f
--- /dev/null
+++ b/perftests/testperf_gdal_minmax_element.cpp
@@ -0,0 +1,332 @@
+/******************************************************************************
+ * Project:  GDAL Core
+ * Purpose:  Test performance of gdal_minmax_element.hpp
+ * Author:   Even Rouault,
+ *
+ ******************************************************************************
+ * Copyright (c) 2024, Even Rouault
+ *
+ * SPDX-License-Identifier: MIT
+ ****************************************************************************/
+
+#include "gdal_minmax_element.hpp"
+
+#include <chrono>
+#include <random>
+
+template <class T> void randomFill(T *v, size_t size, bool withNaN = true)
+{
+    std::random_device rd;
+    std::mt19937 gen{rd()};
+    std::normal_distribution<> dist{127, 30};
+    for (size_t i = 0; i < size; i++)
+    {
+        v[i] = static_cast<T>(dist(gen));
+        if constexpr (std::is_same<T, float>::value ||
+                      std::is_same<T, double>::value)
+        {
+            if (withNaN && (i == 0 || (i > 10 && ((i + 1) % 1024) <= 4)))
+                v[i] = std::numeric_limits<T>::quiet_NaN();
+        }
+    }
+}
+
+constexpr size_t SIZE = 10 * 1000 * 1000 + 1;
+constexpr int N_ITERS = 1;
+
+template <class T>
+#if defined(__GNUC__)
+__attribute__((noinline))
+#endif
+static void
+benchIntegers(GDALDataType eDT, T noData)
+{
+    std::vector<T> x;
+    x.resize(SIZE);
+    randomFill(x.data(), x.size());
+    {
+        auto start = std::chrono::steady_clock::now();
+        int idx = 0;
+        for (int i = 0; i < N_ITERS; ++i)
+        {
+            idx += static_cast<int>(
+                gdal::min_element(x.data(), x.size(), eDT, false, 0));
+        }
+        idx /= N_ITERS;
+        printf("min at idx %d (optimized)\n", idx);
+        auto end = std::chrono::steady_clock::now();
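+        // Note: (end - start).count() below is expressed in raw
+        // steady_clock ticks (typically nanoseconds with libstdc++/libc++).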
printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + std::distance(x.begin(), std::min_element(x.begin(), x.end()))); + } + idx /= N_ITERS; + printf("min at idx %d (using std::min_element)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, true, noData)); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [noData](T a, T b) { + return b == noData ? true + : a == noData ? false + : a < b; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, using std::min_element with " + "nodata aware comparison)\n", + idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } +} + +template +#if defined(__GNUC__) +__attribute__((noinline)) +#endif +static void +benchFloatingPointsWithNaN(GDALDataType eDT, T noData) +{ + std::vector x; + x.resize(SIZE); + randomFill(x.data(), x.size()); + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, false, 0)); + } + idx /= N_ITERS; + printf("min at idx %d (optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [](T a, T b) { + return std::isnan(b) ? true + : std::isnan(a) ? false + : a < b; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (using std::min_element with NaN aware " + "comparison)\n", + idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast( + gdal::min_element(x.data(), x.size(), eDT, true, noData)); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, optimized)\n", idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } + { + auto start = std::chrono::steady_clock::now(); + int idx = 0; + for (int i = 0; i < N_ITERS; ++i) + { + idx += static_cast(std::distance( + x.begin(), std::min_element(x.begin(), x.end(), + [noData](T a, T b) + { + return std::isnan(b) ? true + : std::isnan(a) ? false + : b == noData ? true + : a == noData ? 
+                                                       : a < b;
+                                            })));
+        }
+        idx /= N_ITERS;
+        printf("min at idx %d (nodata case, using std::min_element with "
+               "nodata aware and NaN aware comparison)\n",
+               idx);
+        auto end = std::chrono::steady_clock::now();
+        printf("-> elapsed=%d\n", static_cast<int>((end - start).count()));
+    }
+}
+
+template <class T>
+#if defined(__GNUC__)
+__attribute__((noinline))
+#endif
+static void
+benchFloatingPointsWithoutNaN(GDALDataType eDT, T noData)
+{
+    std::vector<T> x;
+    x.resize(SIZE);
+    randomFill(x.data(), x.size(), false);
+    {
+        auto start = std::chrono::steady_clock::now();
+        int idx = 0;
+        for (int i = 0; i < N_ITERS; ++i)
+        {
+            idx += static_cast<int>(
+                gdal::min_element(x.data(), x.size(), eDT, false, 0));
+        }
+        idx /= N_ITERS;
+        printf("min at idx %d (optimized)\n", idx);
+        auto end = std::chrono::steady_clock::now();
+        printf("-> elapsed=%d\n", static_cast<int>((end - start).count()));
+    }
+    {
+        auto start = std::chrono::steady_clock::now();
+        int idx = 0;
+        for (int i = 0; i < N_ITERS; ++i)
+        {
+            idx += static_cast<int>(
+                std::distance(x.begin(), std::min_element(x.begin(), x.end())));
+        }
+        idx /= N_ITERS;
+        printf("min at idx %d (using std::min_element)\n", idx);
+        auto end = std::chrono::steady_clock::now();
+        printf("-> elapsed=%d\n", static_cast<int>((end - start).count()));
+    }
+    {
+        auto start = std::chrono::steady_clock::now();
+        int idx = 0;
+        for (int i = 0; i < N_ITERS; ++i)
+        {
+            idx += static_cast<int>(
+                gdal::min_element(x.data(), x.size(), eDT, true, noData));
+        }
+        idx /= N_ITERS;
+        printf("min at idx %d (nodata case, optimized)\n", idx);
+        auto end = std::chrono::steady_clock::now();
+        printf("-> elapsed=%d\n", static_cast<int>((end - start).count()));
+    }
+    {
+        auto start = std::chrono::steady_clock::now();
+        int idx = 0;
+        for (int i = 0; i < N_ITERS; ++i)
+        {
+            idx += static_cast<int>(std::distance(
+                x.begin(), std::min_element(x.begin(), x.end(),
+                                            [noData](T a, T b) {
+                                                return b == noData ? true
+                                                       : a == noData ? false
false + : a < b; + }))); + } + idx /= N_ITERS; + printf("min at idx %d (nodata case, using std::min_element with " + "nodata aware comparison)\n", + idx); + auto end = std::chrono::steady_clock::now(); + printf("-> elapsed=%d\n", static_cast((end - start).count())); + } +} + +int main(int /* argc */, char * /* argv */[]) +{ + { + using T = uint8_t; + constexpr GDALDataType eDT = GDT_Byte; + printf("uint8:\n"); + benchIntegers(eDT, 0); + } + printf("--------------------\n"); + { + using T = int8_t; + constexpr GDALDataType eDT = GDT_Int8; + printf("int8:\n"); + benchIntegers(eDT, 0); + } + printf("--------------------\n"); + { + using T = uint16_t; + constexpr GDALDataType eDT = GDT_UInt16; + printf("uint16:\n"); + benchIntegers(eDT, 0); + } + printf("--------------------\n"); + { + using T = int16_t; + constexpr GDALDataType eDT = GDT_Int16; + printf("int16:\n"); + benchIntegers(eDT, 0); + } + printf("--------------------\n"); + { + using T = uint32_t; + constexpr GDALDataType eDT = GDT_UInt32; + printf("uint32:\n"); + benchIntegers(eDT, 0); + } + printf("--------------------\n"); + { + using T = int32_t; + constexpr GDALDataType eDT = GDT_Int32; + printf("int32:\n"); + benchIntegers(eDT, 0); + } + printf("--------------------\n"); + { + using T = float; + constexpr GDALDataType eDT = GDT_Float32; + printf("float (*with* NaN):\n"); + benchFloatingPointsWithNaN(eDT, 0); + } + printf("--------------------\n"); + { + using T = float; + constexpr GDALDataType eDT = GDT_Float32; + printf("float (without NaN):\n"); + benchFloatingPointsWithoutNaN(eDT, 0); + } + printf("--------------------\n"); + { + using T = double; + constexpr GDALDataType eDT = GDT_Float64; + printf("double (*with* NaN):\n"); + benchFloatingPointsWithNaN(eDT, 0); + } + printf("--------------------\n"); + { + using T = double; + constexpr GDALDataType eDT = GDT_Float64; + printf("double (without NaN):\n"); + benchFloatingPointsWithoutNaN(eDT, 0); + } + return 0; +} diff --git a/port/cpl_cpu_features.h b/port/cpl_cpu_features.h index 9106ed5c39e5..10d3daaf9f1b 100644 --- a/port/cpl_cpu_features.h +++ b/port/cpl_cpu_features.h @@ -31,7 +31,12 @@ bool CPLHaveRuntimeSSE(); #endif #endif -#ifdef HAVE_SSSE3_AT_COMPILE_TIME +#ifdef USE_NEON_OPTIMIZATIONS +static bool inline CPLHaveRuntimeSSSE3() +{ + return true; +} +#elif defined(HAVE_SSSE3_AT_COMPILE_TIME) #if __SSSE3__ #define HAVE_INLINE_SSSE3 diff --git a/port/cpl_error.cpp b/port/cpl_error.cpp index f751837377b0..b25c6511d4aa 100644 --- a/port/cpl_error.cpp +++ b/port/cpl_error.cpp @@ -590,11 +590,18 @@ static void CPLvDebug(const char *pszCategory, /* -------------------------------------------------------------------- */ /* Does this message pass our current criteria? 
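For reference, the call pattern exercised by the benchmark above looks like this when used directly. This is a minimal sketch: the header name gdal_minmax_element.hpp and the size_t return type are assumptions inferred from the calls in the benchmark, not confirmed by this patch.

    // Sketch of using the optimized helper benchmarked above (assumed to be
    // declared in gcore/gdal_minmax_element.hpp, returning the index of the
    // smallest element).
    #include "gdal.h"
    #include "gdal_minmax_element.hpp"

    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<float> v{3.5f, -1.25f, 7.0f, 2.0f};
        // Last two arguments: has-nodata flag and the nodata value to ignore.
        size_t idx = gdal::min_element(v.data(), v.size(), GDT_Float32,
                                       /* bHasNoData = */ false,
                                       /* dfNoData = */ 0);
        printf("min %f at index %u\n", double(v[idx]), unsigned(idx));
        return 0;
    }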
diff --git a/port/cpl_cpu_features.h b/port/cpl_cpu_features.h
index 9106ed5c39e5..10d3daaf9f1b 100644
--- a/port/cpl_cpu_features.h
+++ b/port/cpl_cpu_features.h
@@ -31,7 +31,12 @@ bool CPLHaveRuntimeSSE();
 #endif
 #endif

-#ifdef HAVE_SSSE3_AT_COMPILE_TIME
+#ifdef USE_NEON_OPTIMIZATIONS
+static bool inline CPLHaveRuntimeSSSE3()
+{
+    return true;
+}
+#elif defined(HAVE_SSSE3_AT_COMPILE_TIME)

 #if __SSSE3__
 #define HAVE_INLINE_SSSE3
diff --git a/port/cpl_error.cpp b/port/cpl_error.cpp
index f751837377b0..b25c6511d4aa 100644
--- a/port/cpl_error.cpp
+++ b/port/cpl_error.cpp
@@ -590,11 +590,18 @@ static void CPLvDebug(const char *pszCategory,
     /* -------------------------------------------------------------------- */
     /*      Does this message pass our current criteria?                     */
     /* -------------------------------------------------------------------- */
-    if (pszDebug == nullptr)
+    if (pszDebug == nullptr || EQUAL(pszDebug, "NO") ||
+        EQUAL(pszDebug, "OFF") || EQUAL(pszDebug, "FALSE") ||
+        EQUAL(pszDebug, "0"))
+    {
         return;
+    }

-    if (!EQUAL(pszDebug, "ON") && !EQUAL(pszDebug, ""))
+    if (!EQUAL(pszDebug, "ON") && !EQUAL(pszDebug, "YES") &&
+        !EQUAL(pszDebug, "TRUE") && !EQUAL(pszDebug, "1") &&
+        !EQUAL(pszDebug, ""))
     {
+        // check if value of CPL_DEBUG contains the category
         const size_t nLen = strlen(pszCategory);

         size_t i = 0;
@@ -623,7 +630,7 @@ static void CPLvDebug(const char *pszCategory,
     pszMessage[0] = '\0';
 #ifdef TIMESTAMP_DEBUG
-    if (CPLGetConfigOption("CPL_TIMESTAMP", nullptr) != nullptr)
+    if (CPLTestBool(CPLGetConfigOption("CPL_TIMESTAMP", "NO")))
     {
         static struct CPLTimeVal tvStart;
         static const auto unused = CPLGettimeofday(&tvStart, nullptr);
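The CPLvDebug() change above makes CPL_DEBUG accept the usual boolean spellings in addition to the historical "ON"/empty-string values; any other string is still interpreted as a list of categories to match. A small sketch of the caller-visible behaviour, using only public CPL functions:

    #include "cpl_conv.h"
    #include "cpl_error.h"

    void demo()
    {
        CPLSetConfigOption("CPL_DEBUG", "YES");  // previously only "ON" or ""
        CPLDebug("GDAL", "emitted: all categories pass");

        CPLSetConfigOption("CPL_DEBUG", "OFF");  // previously any non-"ON"
                                                 // value was treated as a
                                                 // category list
        CPLDebug("GDAL", "suppressed");

        CPLSetConfigOption("CPL_DEBUG", "HTTP"); // category filtering intact
        CPLDebug("HTTP", "emitted: category matches");
    }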
diff --git a/port/cpl_worker_thread_pool.cpp b/port/cpl_worker_thread_pool.cpp
index c2134f0b7a30..387ad7bbba43 100644
--- a/port/cpl_worker_thread_pool.cpp
+++ b/port/cpl_worker_thread_pool.cpp
@@ -581,14 +581,14 @@ bool CPLJobQueue::SubmitJob(std::function<void()> task)
         m_nPendingJobs++;
     }

-    // cppcheck-suppress knownConditionTrueFalse
     // coverity[uninit_member,copy_constructor_call]
-    return m_poPool->SubmitJob(
-        [this, task]
-        {
-            task();
-            DeclareJobFinished();
-        });
+    const auto lambda = [this, task]
+    {
+        task();
+        DeclareJobFinished();
+    };
+    // cppcheck-suppress knownConditionTrueFalse
+    return m_poPool->SubmitJob(lambda);
 }

 /************************************************************************/
diff --git a/scripts/cppcheck.sh b/scripts/cppcheck.sh
index ef7eb728a714..f3b9932c2077 100755
--- a/scripts/cppcheck.sh
+++ b/scripts/cppcheck.sh
@@ -146,10 +146,6 @@ ret_code=0
 grep -v "unmatchedSuppression" ${LOG_FILE} | grep -v -e " yacc.c" -e PublicDecompWT -e "kdu_cache_wrapper.h" > ${LOG_FILE}.tmp
 mv ${LOG_FILE}.tmp ${LOG_FILE}

-# I don't want to care about SDE
-grep -v -e "frmts/sde" -e "ogr/ogrsf_frmts/sde" ${LOG_FILE} > ${LOG_FILE}.tmp
-mv ${LOG_FILE}.tmp ${LOG_FILE}
-
 # I don't want to care about flatbuffers
 grep -v -e "ogr/ogrsf_frmts/flatgeobuf/flatbuffers" ${LOG_FILE} > ${LOG_FILE}.tmp
 mv ${LOG_FILE}.tmp ${LOG_FILE}
@@ -184,6 +180,12 @@ mv ${LOG_FILE}.tmp ${LOG_FILE}
 grep -v -e "The comparison '0 <= yystate' is always true" ${LOG_FILE} > ${LOG_FILE}.tmp
 mv ${LOG_FILE}.tmp ${LOG_FILE}

+# False positives with cppcheck of ubuntu 20.04
+grep -v -e "ogrlinestring.cpp:.*warning,accessMoved" ${LOG_FILE} > ${LOG_FILE}.tmp
+mv ${LOG_FILE}.tmp ${LOG_FILE}
+grep -v -e "ogrgeometrycollection.cpp:.*warning,accessMoved" ${LOG_FILE} > ${LOG_FILE}.tmp
+mv ${LOG_FILE}.tmp ${LOG_FILE}
+
 if grep "null pointer" ${LOG_FILE} ; then
     echo "Null pointer check failed"
     ret_code=1
diff --git a/scripts/fix_typos.sh b/scripts/fix_typos.sh
index f218033606eb..564212398542 100755
--- a/scripts/fix_typos.sh
+++ b/scripts/fix_typos.sh
@@ -77,6 +77,7 @@ EXCLUDED_FILES="$EXCLUDED_FILES,./cmake/modules/CMakeCheckCompilerFlagCommonPatterns.cmake"
 EXCLUDED_FILES="$EXCLUDED_FILES,./cmake/modules/Copyright.txt"
 EXCLUDED_FILES="$EXCLUDED_FILES,*/sqlite_rtree_bulk_load/*"
 EXCLUDED_FILES="$EXCLUDED_FILES,ogr_adbc_internal.h"
+EXCLUDED_FILES="$EXCLUDED_FILES,sse2neon.h"
 EXCLUDED_FILES="$EXCLUDED_FILES,*/spelling_wordlist.txt"
 AUTHORIZED_LIST="poSession,FIDN,TRAFIC,HTINK,repID,oCurr,INTREST,oPosition"
 AUTHORIZED_LIST="$AUTHORIZED_LIST,CPL_SUPRESS_CPLUSPLUS,SRP_NAM,ADRG_NAM,'SRP_NAM,AuxilaryTarget"
diff --git a/swig/include/Band.i b/swig/include/Band.i
index ea4fd47f6e22..6c45f714e078 100644
--- a/swig/include/Band.i
+++ b/swig/include/Band.i
@@ -669,6 +669,23 @@ CPLErr AdviseRead(  int xoff, int yoff, int xsize, int ysize,
 %clear (CPLErr);
 #endif

+%apply (double *OUTPUT){double *pdfMin, double *pdfMax};
+%apply (int *OUTPUT){int *pnMinX, int *pnMinY};
+%apply (int *OUTPUT){int *pnMaxX, int *pnMaxY};
+#if !defined(SWIGPYTHON)
+%apply (IF_ERROR_RETURN_NONE) { (CPLErr) };
+#endif
+  CPLErr ComputeMinMaxLocation( double *pdfMin, double *pdfMax,
+                                int *pnMinX, int *pnMinY,
+                                int *pnMaxX, int *pnMaxY ) {
+      return GDALComputeRasterMinMaxLocation( self, pdfMin, pdfMax,
+                                              pnMinX, pnMinY,
+                                              pnMaxX, pnMaxY );
+  }
+#if !defined(SWIGPYTHON)
+%clear (CPLErr);
+#endif
+
 %newobject AsMDArray;
 GDALMDArrayHS *AsMDArray()
 {
diff --git a/swig/include/python/gdal_python.i b/swig/include/python/gdal_python.i
index 67610e5cfc4e..b9f48c0fa3b0 100644
--- a/swig/include/python/gdal_python.i
+++ b/swig/include/python/gdal_python.i
@@ -4997,6 +4997,50 @@ def InterpolateAtPoint(self, *args, **kwargs):
     return ret[1]
 %}

+%feature("shadow") ComputeMinMaxLocation %{
+def ComputeMinMaxLocation(self, *args, **kwargs):
+    """Compute the min/max values for a band, and their location.
+
+    Pixels whose value matches the nodata value or are masked by the mask
+    band are ignored.
+
+    If the minimum or maximum value is hit in several locations, it is not
+    specified which one will be returned.
+
+    This is a mapping of :cpp:func:`GDALRasterBand::ComputeRasterMinMaxLocation`.
+
+    Parameters
+    ----------
+    None
+
+    Returns
+    -------
+    a named tuple (min, max, minX, minY, maxX, maxY) or ``None``
+    in case of error or no valid pixel.
+    """
+
+    ret = $action(self, *args, **kwargs)
+    if ret[0] != CE_None:
+        return None
+
+    import collections
+    tuple = collections.namedtuple('ComputeMinMaxLocationResult',
+                                   ['min',
+                                    'max',
+                                    'minX',
+                                    'minY',
+                                    'maxX',
+                                    'maxY',
+                                   ])
+    tuple.min = ret[1]
+    tuple.max = ret[2]
+    tuple.minX = ret[3]
+    tuple.minY = ret[4]
+    tuple.maxX = ret[5]
+    tuple.maxY = ret[6]
+    return tuple
+%}

 %pythoncode %{

 # VSIFile: Copyright (c) 2024, Dan Baston
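Both wrappers above funnel into the C entry point GDALComputeRasterMinMaxLocation() that the Band.i fragment calls. A minimal C++ sketch of that call; the signature is inferred from the wrapper above, and "in.tif" is a placeholder filename:

    #include "gdal.h"
    #include <cstdio>

    int main()
    {
        GDALAllRegister();
        GDALDatasetH hDS = GDALOpen("in.tif", GA_ReadOnly);
        if (!hDS)
            return 1;
        GDALRasterBandH hBand = GDALGetRasterBand(hDS, 1);
        double dfMin = 0, dfMax = 0;
        int nMinX = 0, nMinY = 0, nMaxX = 0, nMaxY = 0;
        if (GDALComputeRasterMinMaxLocation(hBand, &dfMin, &dfMax,
                                            &nMinX, &nMinY,
                                            &nMaxX, &nMaxY) == CE_None)
        {
            printf("min=%g at (%d,%d), max=%g at (%d,%d)\n",
                   dfMin, nMinX, nMinY, dfMax, nMaxX, nMaxY);
        }
        GDALClose(hDS);
        return 0;
    }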
diff --git a/swig/python/gdal-utils/osgeo_utils/samples/gdal_minmax_location.py b/swig/python/gdal-utils/osgeo_utils/samples/gdal_minmax_location.py
new file mode 100644
index 000000000000..018ec00f0d29
--- /dev/null
+++ b/swig/python/gdal-utils/osgeo_utils/samples/gdal_minmax_location.py
@@ -0,0 +1,128 @@
+# !/usr/bin/env python3
+###############################################################################
+# Project:  GDAL utils
+# Purpose:  Get min/max location
+# Author:   Even Rouault
+#
+###############################################################################
+# Copyright (c) 2024, Even Rouault
+#
+# SPDX-License-Identifier: MIT
+###############################################################################
+
+import sys
+import textwrap
+from typing import Optional
+
+from osgeo import gdal, osr
+from osgeo_utils.auxiliary.gdal_argparse import GDALArgumentParser, GDALScript
+from osgeo_utils.auxiliary.util import PathOrDS, open_ds
+
+
+def gdalminmaxlocation_util(
+    filename_or_ds: PathOrDS,
+    band_num: int,
+    open_options: Optional[dict] = None,
+    **kwargs,
+):
+    ds = open_ds(filename_or_ds, open_options=open_options)
+    band = ds.GetRasterBand(band_num)
+    ret = band.ComputeMinMaxLocation()
+    if ret is None:
+        print("No valid pixels")
+        return 1
+    gt = ds.GetGeoTransform(can_return_null=True)
+    if gt:
+        srs = ds.GetSpatialRef()
+        if srs:
+            wgs84 = osr.SpatialReference()
+            wgs84.SetFromUserInput("WGS84")
+            wgs84.SetAxisMappingStrategy(osr.OAMS_TRADITIONAL_GIS_ORDER)
+            ct = osr.CreateCoordinateTransformation(srs, wgs84)
+            georefX, georefY = gdal.ApplyGeoTransform(
+                gt, ret.minX + 0.5, ret.minY + 0.5
+            )
+            long, lat, _ = ct.TransformPoint(georefX, georefY)
+            print(
+                f"Minimum={ret.min} at (col,line)=({ret.minX},{ret.minY}), (X,Y)_georef=({georefX},{georefY}), (long,lat)_WGS84=({long:.7f},{lat:.7f})"
+            )
+            georefX, georefY = gdal.ApplyGeoTransform(
+                gt, ret.maxX + 0.5, ret.maxY + 0.5
+            )
+            long, lat, _ = ct.TransformPoint(georefX, georefY)
+            print(
+                f"Maximum={ret.max} at (col,line)=({ret.maxX},{ret.maxY}), (X,Y)_georef=({georefX},{georefY}), (long,lat)_WGS84=({long:.7f},{lat:.7f})"
+            )
+        else:
+            georefX, georefY = gdal.ApplyGeoTransform(
+                gt, ret.minX + 0.5, ret.minY + 0.5
+            )
+            print(
+                f"Minimum={ret.min} at (col,line)=({ret.minX},{ret.minY}), (X,Y)_georef=({georefX},{georefY})"
+            )
+            georefX, georefY = gdal.ApplyGeoTransform(
+                gt, ret.maxX + 0.5, ret.maxY + 0.5
+            )
+            print(
+                f"Maximum={ret.max} at (col,line)=({ret.maxX},{ret.maxY}), (X,Y)_georef=({georefX},{georefY})"
+            )
+    else:
+        print(f"Minimum={ret.min} at (col,line)=({ret.minX},{ret.minY})")
+        print(f"Maximum={ret.max} at (col,line)=({ret.maxX},{ret.maxY})")
+
+    return 0
+
+
+class GDALMinMaxLocation(GDALScript):
+    def __init__(self):
+        super().__init__()
+        self.title = "Raster min/max location query tool"
+        self.description = textwrap.dedent(
+            """\
+            The gdal_minmax_location utility returns the location where min/max values of a raster are hit."""
+        )
+        self.interactive_mode = None
+
+    def get_parser(self, argv) -> GDALArgumentParser:
+        parser = self.parser
+
+        parser.add_argument(
+            "-b",
+            dest="band_num",
+            metavar="band",
+            type=int,
+            default=1,
+            help="Selects a band to query (default: first one).",
+        )
+
+        parser.add_argument(
+            "-oo",
+            dest="open_options",
+            metavar="NAME=VALUE",
+            help="Dataset open option (format specific).",
+            nargs="+",
+        )
+
+        parser.add_argument(
+            "filename_or_ds",
+            metavar="filename",
+            type=str,
+            help="The source GDAL raster datasource name.",
+        )
+
+        return parser
+
+    def augment_kwargs(self, kwargs) -> dict:
+        return kwargs
+
+    def doit(self, **kwargs):
+        return gdalminmaxlocation_util(**kwargs)
+
+
+def main(argv=sys.argv):
+    gdal.UseExceptions()
+    return GDALMinMaxLocation().main(argv)
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
diff --git a/third_party/fast_float/PROVENANCE.TXT b/third_party/fast_float/PROVENANCE.TXT
index 0ea9d9ccd1d2..5a310b33ce6f 100644
--- a/third_party/fast_float/PROVENANCE.TXT
+++ b/third_party/fast_float/PROVENANCE.TXT
@@ -1,4 +1,4 @@
 https://github.com/fastfloat/fast_float

-Retrieved at commit https://github.com/fastfloat/fast_float/commit/a5ea2059295260922aa300d676a43a76b5e19a35
+Retrieved at commit https://github.com/fastfloat/fast_float/commit/9058831e6884e95358bcad29139a8b9d6cf0b534

 Using the MIT license choice.
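The remaining hunks update the vendored fast_float headers to the commit recorded above. For orientation, the library's public entry point is fast_float::from_chars(); the internal changes below (error codes, parse_int_string(), reformatting) keep that interface. A minimal usage sketch; the include path is an assumption based on where GDAL vendors the headers:

    #include "third_party/fast_float/fast_float.h"

    #include <cstdio>
    #include <system_error>

    int main()
    {
        const char buf[] = "3.1415 tail";
        double d = 0;
        auto res = fast_float::from_chars(buf, buf + 6, d);
        if (res.ec == std::errc())  // success
            printf("value=%f rest='%s'\n", d, res.ptr);
        return 0;
    }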
diff --git a/third_party/fast_float/ascii_number.h b/third_party/fast_float/ascii_number.h
index d18e3d5360af..c027435e2a01 100644
--- a/third_party/fast_float/ascii_number.h
+++ b/third_party/fast_float/ascii_number.h
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include
 #include

 #include "float_common.h"
@@ -19,8 +20,7 @@

 namespace fast_float {

-template <typename UC>
-fastfloat_really_inline constexpr bool has_simd_opt() {
+template <typename UC> fastfloat_really_inline constexpr bool has_simd_opt() {
 #ifdef FASTFLOAT_HAS_SIMD
   return std::is_same<UC, char16_t>::value;
 #else
@@ -36,24 +36,20 @@ fastfloat_really_inline constexpr bool is_integer(UC c) noexcept {
 }

 fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) {
-  return (val & 0xFF00000000000000) >> 56
-    | (val & 0x00FF000000000000) >> 40
-    | (val & 0x0000FF0000000000) >> 24
-    | (val & 0x000000FF00000000) >> 8
-    | (val & 0x00000000FF000000) << 8
-    | (val & 0x0000000000FF0000) << 24
-    | (val & 0x000000000000FF00) << 40
-    | (val & 0x00000000000000FF) << 56;
+  return (val & 0xFF00000000000000) >> 56 | (val & 0x00FF000000000000) >> 40 |
+         (val & 0x0000FF0000000000) >> 24 | (val & 0x000000FF00000000) >> 8 |
+         (val & 0x00000000FF000000) << 8 | (val & 0x0000000000FF0000) << 24 |
+         (val & 0x000000000000FF00) << 40 | (val & 0x00000000000000FF) << 56;
 }

 // Read 8 UC into a u64. Truncates UC if not char.
 template <typename UC>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint64_t read8_to_u64(const UC *chars) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+read8_to_u64(const UC *chars) {
   if (cpp20_and_in_constexpr() || !std::is_same<UC, char>::value) {
     uint64_t val = 0;
-    for(int i = 0; i < 8; ++i) {
-      val |= uint64_t(uint8_t(*chars)) << (i*8);
+    for (int i = 0; i < 8; ++i) {
+      val |= uint64_t(uint8_t(*chars)) << (i * 8);
       ++chars;
     }
     return val;
@@ -69,44 +65,41 @@ uint64_t read8_to_u64(const UC *chars) {

 #ifdef FASTFLOAT_SSE2

-fastfloat_really_inline
-uint64_t simd_read8_to_u64(const __m128i data) {
-FASTFLOAT_SIMD_DISABLE_WARNINGS
+fastfloat_really_inline uint64_t simd_read8_to_u64(const __m128i data) {
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
   const __m128i packed = _mm_packus_epi16(data, data);
 #ifdef FASTFLOAT_64BIT
   return uint64_t(_mm_cvtsi128_si64(packed));
 #else
   uint64_t value;
   // Visual Studio + older versions of GCC don't support _mm_storeu_si64
-  _mm_storel_epi64(reinterpret_cast<__m128i*>(&value), packed);
+  _mm_storel_epi64(reinterpret_cast<__m128i *>(&value), packed);
   return value;
 #endif
-FASTFLOAT_SIMD_RESTORE_WARNINGS
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
 }

-fastfloat_really_inline
-uint64_t simd_read8_to_u64(const char16_t* chars) {
-FASTFLOAT_SIMD_DISABLE_WARNINGS
-  return simd_read8_to_u64(_mm_loadu_si128(reinterpret_cast<const __m128i*>(chars)));
-FASTFLOAT_SIMD_RESTORE_WARNINGS
+fastfloat_really_inline uint64_t simd_read8_to_u64(const char16_t *chars) {
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  return simd_read8_to_u64(
+      _mm_loadu_si128(reinterpret_cast<const __m128i *>(chars)));
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
 }

 #elif defined(FASTFLOAT_NEON)
-
-fastfloat_really_inline
-uint64_t simd_read8_to_u64(const uint16x8_t data) {
-FASTFLOAT_SIMD_DISABLE_WARNINGS
+fastfloat_really_inline uint64_t simd_read8_to_u64(const uint16x8_t data) {
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
   uint8x8_t utf8_packed = vmovn_u16(data);
   return vget_lane_u64(vreinterpret_u64_u8(utf8_packed), 0);
-FASTFLOAT_SIMD_RESTORE_WARNINGS
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
 }

-fastfloat_really_inline
-uint64_t simd_read8_to_u64(const char16_t* chars) {
-FASTFLOAT_SIMD_DISABLE_WARNINGS
-  return simd_read8_to_u64(vld1q_u16(reinterpret_cast<const uint16_t*>(chars)));
-FASTFLOAT_SIMD_RESTORE_WARNINGS
+fastfloat_really_inline uint64_t simd_read8_to_u64(const char16_t *chars) {
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  return simd_read8_to_u64(
+      vld1q_u16(reinterpret_cast<const uint16_t *>(chars)));
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
 }

 #endif // FASTFLOAT_SSE2
@@ -115,34 +108,16 @@ FASTFLOAT_SIMD_RESTORE_WARNINGS
 #if defined(_MSC_VER) && _MSC_VER <= 1900
 template <typename UC>
 #else
-template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>())>
+template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>()) = 0>
 #endif
 // dummy for compile
-uint64_t simd_read8_to_u64(UC const*) {
+uint64_t simd_read8_to_u64(UC const *) {
   return 0;
 }
-
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-void write_u64(uint8_t *chars, uint64_t val) {
-  if (cpp20_and_in_constexpr()) {
-    for(int i = 0; i < 8; ++i) {
-      *chars = uint8_t(val);
-      val >>= 8;
-      ++chars;
-    }
-    return;
-  }
-#if FASTFLOAT_IS_BIG_ENDIAN == 1
-  // Need to read as-if the number was in little-endian order.
-  val = byteswap(val);
-#endif
-  ::memcpy(chars, &val, sizeof(uint64_t));
-}
-
 // credit @aqrit
-fastfloat_really_inline FASTFLOAT_CONSTEXPR14
-uint32_t parse_eight_digits_unrolled(uint64_t val) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint32_t
+parse_eight_digits_unrolled(uint64_t val) {
   const uint64_t mask = 0x000000FF000000FF;
   const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32)
   const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32)
@@ -152,38 +127,38 @@ uint32_t parse_eight_digits_unrolled(uint64_t val) {
   return uint32_t(val);
 }
-
 // Call this if chars are definitely 8 digits.
 template <typename UC>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint32_t parse_eight_digits_unrolled(UC const * chars) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint32_t
+parse_eight_digits_unrolled(UC const *chars) noexcept {
   if (cpp20_and_in_constexpr() || !has_simd_opt<UC>()) {
     return parse_eight_digits_unrolled(read8_to_u64(chars)); // truncation okay
   }
   return parse_eight_digits_unrolled(simd_read8_to_u64(chars));
 }
-
 // credit @aqrit
-fastfloat_really_inline constexpr bool is_made_of_eight_digits_fast(uint64_t val) noexcept {
+fastfloat_really_inline constexpr bool
+is_made_of_eight_digits_fast(uint64_t val) noexcept {
   return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) &
-     0x8080808080808080));
+            0x8080808080808080));
 }
-
 #ifdef FASTFLOAT_HAS_SIMD

 // Call this if chars might not be 8 digits.
-// Using this style (instead of is_made_of_eight_digits_fast() then parse_eight_digits_unrolled())
-// ensures we don't load SIMD registers twice.
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-bool simd_parse_if_eight_digits_unrolled(const char16_t* chars, uint64_t& i) noexcept {
+// Using this style (instead of is_made_of_eight_digits_fast() then
+// parse_eight_digits_unrolled()) ensures we don't load SIMD registers twice.
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool
+simd_parse_if_eight_digits_unrolled(const char16_t *chars,
+                                    uint64_t &i) noexcept {
   if (cpp20_and_in_constexpr()) {
     return false;
-    }
+  }

 #ifdef FASTFLOAT_SSE2
-FASTFLOAT_SIMD_DISABLE_WARNINGS
-  const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(chars));
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  const __m128i data =
+      _mm_loadu_si128(reinterpret_cast<const __m128i *>(chars));

   // (x - '0') <= 9
   // http://0x80.pl/articles/simd-parsing-int-sequences.html
@@ -193,13 +168,13 @@ FASTFLOAT_SIMD_DISABLE_WARNINGS
   if (_mm_movemask_epi8(t1) == 0) {
     i = i * 100000000 + parse_eight_digits_unrolled(simd_read8_to_u64(data));
     return true;
-  }
-  else return false;
-FASTFLOAT_SIMD_RESTORE_WARNINGS
+  } else
+    return false;
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
 #elif defined(FASTFLOAT_NEON)
-FASTFLOAT_SIMD_DISABLE_WARNINGS
-  const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t*>(chars));
-
+  FASTFLOAT_SIMD_DISABLE_WARNINGS
+  const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(chars));
+
   // (x - '0') <= 9
   // http://0x80.pl/articles/simd-parsing-int-sequences.html
   const uint16x8_t t0 = vsubq_u16(data, vmovq_n_u16('0'));
@@ -208,11 +183,12 @@ FASTFLOAT_SIMD_DISABLE_WARNINGS
   if (vminvq_u16(mask) == 0xFFFF) {
     i = i * 100000000 + parse_eight_digits_unrolled(simd_read8_to_u64(data));
     return true;
-  }
-  else return false;
-FASTFLOAT_SIMD_RESTORE_WARNINGS
+  } else
+    return false;
+  FASTFLOAT_SIMD_RESTORE_WARNINGS
 #else
-  (void)chars; (void)i;
+  (void)chars;
+  (void)i;
   return false;
 #endif // FASTFLOAT_SSE2
 }
@@ -223,55 +199,90 @@ FASTFLOAT_SIMD_RESTORE_WARNINGS
 #if defined(_MSC_VER) && _MSC_VER <= 1900
 template <typename UC>
 #else
-template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>())>
+template <typename UC, FASTFLOAT_ENABLE_IF(!has_simd_opt<UC>()) = 0>
 #endif
 // dummy for compile
-bool simd_parse_if_eight_digits_unrolled(UC const*, uint64_t&) {
+bool simd_parse_if_eight_digits_unrolled(UC const *, uint64_t &) {
   return 0;
 }
-
-template <typename UC, FASTFLOAT_ENABLE_IF(!std::is_same<UC, char>::value)>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-void loop_parse_if_eight_digits(const UC*& p, const UC* const pend, uint64_t& i) {
+template <typename UC, FASTFLOAT_ENABLE_IF(!std::is_same<UC, char>::value) = 0>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void
+loop_parse_if_eight_digits(const UC *&p, const UC *const pend, uint64_t &i) {
   if (!has_simd_opt<UC>()) {
     return;
   }
-  while ((std::distance(p, pend) >= 8) && simd_parse_if_eight_digits_unrolled(p, i)) { // in rare cases, this will overflow, but that's ok
+  while ((std::distance(p, pend) >= 8) &&
+         simd_parse_if_eight_digits_unrolled(
+             p, i)) { // in rare cases, this will overflow, but that's ok
     p += 8;
   }
 }

-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-void loop_parse_if_eight_digits(const char*& p, const char* const pend, uint64_t& i) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void
+loop_parse_if_eight_digits(const char *&p, const char *const pend,
+                           uint64_t &i) {
   // optimizes better than parse_if_eight_digits_unrolled() for UC = char.
-  while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(read8_to_u64(p))) {
-    i = i * 100000000 + parse_eight_digits_unrolled(read8_to_u64(p)); // in rare cases, this will overflow, but that's ok
+  while ((std::distance(p, pend) >= 8) &&
+         is_made_of_eight_digits_fast(read8_to_u64(p))) {
+    i = i * 100000000 +
+        parse_eight_digits_unrolled(read8_to_u64(
+            p)); // in rare cases, this will overflow, but that's ok
    p += 8;
   }
 }

-template <typename UC>
-struct parsed_number_string_t {
+enum class parse_error {
+  no_error,
+  // [JSON-only] The minus sign must be followed by an integer.
+  missing_integer_after_sign,
+  // A sign must be followed by an integer or dot.
+  missing_integer_or_dot_after_sign,
+  // [JSON-only] The integer part must not have leading zeros.
+  leading_zeros_in_integer_part,
+  // [JSON-only] The integer part must have at least one digit.
+  no_digits_in_integer_part,
+  // [JSON-only] If there is a decimal point, there must be digits in the
+  // fractional part.
+  no_digits_in_fractional_part,
+  // The mantissa must have at least one digit.
+  no_digits_in_mantissa,
+  // Scientific notation requires an exponential part.
+  missing_exponential_part,
+};
+
+template <typename UC> struct parsed_number_string_t {
   int64_t exponent{0};
   uint64_t mantissa{0};
-  UC const * lastmatch{nullptr};
+  UC const *lastmatch{nullptr};
   bool negative{false};
   bool valid{false};
   bool too_many_digits{false};
   // contains the range of the significant digits
   span<const UC> integer{};  // non-nullable
   span<const UC> fraction{}; // nullable
+  parse_error error{parse_error::no_error};
 };

 using byte_span = span<const char>;
 using parsed_number_string = parsed_number_string_t<char>;

+template <typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t<UC>
+report_parse_error(UC const *p, parse_error error) {
+  parsed_number_string_t<UC> answer;
+  answer.valid = false;
+  answer.lastmatch = p;
+  answer.error = error;
+  return answer;
+}
+
 // Assuming that you use no more than 19 digits, this will
 // parse an ASCII string.
 template <typename UC>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-parsed_number_string_t<UC> parse_number_string(UC const *p, UC const * pend, parse_options_t<UC> options) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t<UC>
+parse_number_string(UC const *p, UC const *pend,
+                    parse_options_t<UC> options) noexcept {
   chars_format const fmt = options.format;
   UC const decimal_point = options.decimal_point;
@@ -286,19 +297,24 @@
 #endif
     ++p;
     if (p == pend) {
-      return answer;
+      return report_parse_error(
+          p, parse_error::missing_integer_or_dot_after_sign);
     }
     if (fmt & FASTFLOAT_JSONFMT) {
       if (!is_integer(*p)) { // a sign must be followed by an integer
-        return answer;
-      }
+        return report_parse_error(p,
+                                  parse_error::missing_integer_after_sign);
+      }
     } else {
-      if (!is_integer(*p) && (*p != decimal_point)) { // a sign must be followed by an integer or the dot
-        return answer;
+      if (!is_integer(*p) &&
+          (*p !=
+           decimal_point)) { // a sign must be followed by an integer or the dot
+        return report_parse_error(
+            p, parse_error::missing_integer_or_dot_after_sign);
       }
     }
   }
-  UC const * const start_digits = p;
+  UC const *const start_digits = p;

   uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad)
@@ -306,16 +322,21 @@
     // a multiplication by 10 is cheaper than an arbitrary integer
     // multiplication
     i = 10 * i +
-        uint64_t(*p - UC('0')); // might overflow, we will handle the overflow later
+        uint64_t(*p -
+                 UC('0')); // might overflow, we will handle the overflow later
     ++p;
   }
-  UC const * const end_of_integer_part = p;
+  UC const *const end_of_integer_part = p;
   int64_t digit_count = int64_t(end_of_integer_part - start_digits);
   answer.integer = span<const UC>(start_digits, size_t(digit_count));
   if (fmt & FASTFLOAT_JSONFMT) {
     // at least 1 digit in integer part, without leading zeros
-    if (digit_count == 0 || (start_digits[0] == UC('0') && digit_count > 1)) {
-      return answer;
+    if (digit_count == 0) {
+      return report_parse_error(p, parse_error::no_digits_in_integer_part);
+    }
+    if ((start_digits[0] == UC('0') && digit_count > 1)) {
+      return report_parse_error(start_digits,
+                                parse_error::leading_zeros_in_integer_part);
     }
   }
@@ -323,7 +344,7 @@
   const bool has_decimal_point = (p != pend) && (*p == decimal_point);
   if (has_decimal_point) {
     ++p;
-    UC const * before = p;
+    UC const *before = p;
     // can occur at most twice without overflowing, but let it occur more, since
     // for integers with many digits, digit parsing is the primary bottleneck.
     loop_parse_if_eight_digits(p, pend, i);
@@ -340,35 +361,39 @@
   if (fmt & FASTFLOAT_JSONFMT) {
     // at least 1 digit in fractional part
     if (has_decimal_point && exponent == 0) {
-      return answer;
+      return report_parse_error(p,
+                                parse_error::no_digits_in_fractional_part);
     }
-  }
-  else if (digit_count == 0) { // we must have encountered at least one integer!
-    return answer;
+  } else if (digit_count ==
+             0) { // we must have encountered at least one integer!
+    return report_parse_error(p, parse_error::no_digits_in_mantissa);
   }
-  int64_t exp_number = 0;            // explicit exponential part
-  if ( ((fmt & chars_format::scientific) &&
-        (p != pend) &&
-        ((UC('e') == *p) || (UC('E') == *p)))
-       ||
-       ((fmt & FASTFLOAT_FORTRANFMT) &&
-        (p != pend) &&
-        ((UC('+') == *p) || (UC('-') == *p) || (UC('d') == *p) || (UC('D') == *p)))) {
-    UC const * location_of_e = p;
-    if ((UC('e') == *p) || (UC('E') == *p) || (UC('d') == *p) || (UC('D') == *p)) {
+  int64_t exp_number = 0; // explicit exponential part
+  if (((fmt & chars_format::scientific) && (p != pend) &&
+       ((UC('e') == *p) || (UC('E') == *p))) ||
+      ((fmt & FASTFLOAT_FORTRANFMT) && (p != pend) &&
+       ((UC('+') == *p) || (UC('-') == *p) || (UC('d') == *p) ||
+        (UC('D') == *p)))) {
+    UC const *location_of_e = p;
+    if ((UC('e') == *p) || (UC('E') == *p) || (UC('d') == *p) ||
+        (UC('D') == *p)) {
       ++p;
     }
     bool neg_exp = false;
     if ((p != pend) && (UC('-') == *p)) {
       neg_exp = true;
       ++p;
-    } else if ((p != pend) && (UC('+') == *p)) { // '+' on exponent is allowed by C++17 20.19.3.(7.1)
+    } else if ((p != pend) &&
+               (UC('+') ==
+                *p)) { // '+' on exponent is allowed by C++17 20.19.3.(7.1)
       ++p;
     }
     if ((p == pend) || !is_integer(*p)) {
-      if(!(fmt & chars_format::fixed)) {
-        // We are in error.
-        return answer;
+      if (!(fmt & chars_format::fixed)) {
+        // The exponential part is invalid for scientific notation, so it must
+        // be a trailing token for fixed notation. However, fixed notation is
+        // disabled, so report a scientific notation error.
+        return report_parse_error(p, parse_error::missing_exponential_part);
       }
       // Otherwise, we will be ignoring the 'e'.
       p = location_of_e;
@@ -380,12 +405,16 @@
       }
       ++p;
     }
-    if(neg_exp) { exp_number = - exp_number; }
+    if (neg_exp) {
+      exp_number = -exp_number;
+    }
     exponent += exp_number;
   }
 } else {
   // If it scientific and not fixed, we have to bail out.
-  if((fmt & chars_format::scientific) && !(fmt & chars_format::fixed)) { return answer; }
+  if ((fmt & chars_format::scientific) && !(fmt & chars_format::fixed)) {
+    return report_parse_error(p, parse_error::missing_exponential_part);
+  }
 }
 answer.lastmatch = p;
 answer.valid = true;
@@ -400,9 +429,11 @@
     // We have to handle the case where we have 0.0000somenumber.
     // We need to be mindful of the case where we only have zeroes...
     // E.g., 0.000000000...000.
-    UC const * start = start_digits;
+    UC const *start = start_digits;
     while ((start != pend) && (*start == UC('0') || *start == decimal_point)) {
-      if(*start == UC('0')) { digit_count --; }
+      if (*start == UC('0')) {
+        digit_count--;
+      }
       start++;
     }

@@ -413,18 +444,17 @@
       // pre-tokenized spans from above.
       i = 0;
       p = answer.integer.ptr;
-      UC const* int_end = p + answer.integer.len();
-      const uint64_t minimal_nineteen_digit_integer{ 1000000000000000000 };
+      UC const *int_end = p + answer.integer.len();
+      const uint64_t minimal_nineteen_digit_integer{1000000000000000000};
       while ((i < minimal_nineteen_digit_integer) && (p != int_end)) {
         i = i * 10 + uint64_t(*p - UC('0'));
         ++p;
       }
       if (i >= minimal_nineteen_digit_integer) { // We have a big integers
         exponent = end_of_integer_part - p + exp_number;
-      }
-      else { // We have a value with a fractional component.
+      } else { // We have a value with a fractional component.
         p = answer.fraction.ptr;
-        UC const* frac_end = p + answer.fraction.len();
+        UC const *frac_end = p + answer.fraction.len();
         while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) {
           i = i * 10 + uint64_t(*p - UC('0'));
           ++p;
@@ -439,6 +469,111 @@
   return answer;
 }

+template <typename T, typename UC>
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 from_chars_result_t<UC>
+parse_int_string(UC const *p, UC const *pend, T &value, int base) {
+  from_chars_result_t<UC> answer;
+
+  UC const *const first = p;
+
+  bool negative = (*p == UC('-'));
+  if (!std::is_signed<T>::value && negative) {
+    answer.ec = std::errc::invalid_argument;
+    answer.ptr = first;
+    return answer;
+  }
+#ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default
+  if ((*p == UC('-')) || (*p == UC('+'))) {
+#else
+  if (*p == UC('-')) {
+#endif
+    ++p;
+  }
+
+  UC const *const start_num = p;
+
+  while (p != pend && *p == UC('0')) {
+    ++p;
+  }
+
+  const bool has_leading_zeros = p > start_num;
+
+  UC const *const start_digits = p;
+
+  uint64_t i = 0;
+  if (base == 10) {
+    loop_parse_if_eight_digits(p, pend, i); // use SIMD if possible
+  }
+  while (p != pend) {
+    uint8_t digit = ch_to_digit(*p);
+    if (digit >= base) {
+      break;
+    }
+    i = uint64_t(base) * i + digit; // might overflow, check this later
+    p++;
+  }
+
+  size_t digit_count = size_t(p - start_digits);
+
+  if (digit_count == 0) {
+    if (has_leading_zeros) {
+      value = 0;
+      answer.ec = std::errc();
+      answer.ptr = p;
+    } else {
+      answer.ec = std::errc::invalid_argument;
+      answer.ptr = first;
+    }
+    return answer;
+  }
+
+  answer.ptr = p;
+
+  // check u64 overflow
+  size_t max_digits = max_digits_u64(base);
+  if (digit_count > max_digits) {
+    answer.ec = std::errc::result_out_of_range;
+    return answer;
+  }
+  // this check can be eliminated for all other types, but they will all require
+  // a max_digits(base) equivalent
+  if (digit_count == max_digits && i < min_safe_u64(base)) {
+    answer.ec = std::errc::result_out_of_range;
+    return answer;
+  }
+
+  // check other types overflow
+  if (!std::is_same<T, uint64_t>::value) {
+    if (i > uint64_t(std::numeric_limits<T>::max()) + uint64_t(negative)) {
+      answer.ec = std::errc::result_out_of_range;
+      return answer;
+    }
+  }
+
+  if (negative) {
+#ifdef FASTFLOAT_VISUAL_STUDIO
+#pragma warning(push)
+#pragma warning(disable : 4146)
+#endif
+    // this weird workaround is required because:
+    // - converting unsigned to signed when its value is greater than signed max
+    //   is UB pre-C++23.
+    // - reinterpret_casting (~i + 1) would work, but it is not constexpr
+    // this is always optimized into a neg instruction (note: T is an integer
+    // type)
+    value = T(-std::numeric_limits<T>::max() -
+              T(i - uint64_t(std::numeric_limits<T>::max())));
+#ifdef FASTFLOAT_VISUAL_STUDIO
+#pragma warning(pop)
+#endif
+  } else {
+    value = T(i);
+  }
+
+  answer.ec = std::errc();
+  return answer;
+}
+
 } // namespace fast_float

 #endif
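The negation at the end of parse_int_string() above deserves a gloss: it computes -i as a signed T without casting the out-of-range magnitude in a single step. A standalone sketch of the same arithmetic (not fast_float code):

    #include <cstdint>
    #include <cstdio>
    #include <limits>

    // Mirrors the parse_int_string() trick: for the magnitude i of a negative
    // number (1 <= i <= max(T) + 1), this evaluates to -i. The subtraction is
    // split around max(T), so even the minimum value of T is reached without
    // a direct out-of-range unsigned-to-signed conversion of i itself.
    template <typename T> T negate_magnitude(uint64_t i)
    {
        return T(-std::numeric_limits<T>::max() -
                 T(i - uint64_t(std::numeric_limits<T>::max())));
    }

    int main()
    {
        printf("%d %d\n", int(negate_magnitude<int8_t>(128)),  // -128
               int(negate_magnitude<int8_t>(5)));              // -5
        return 0;
    }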
diff --git a/third_party/fast_float/bigint.h b/third_party/fast_float/bigint.h
index 5076b47cc5c9..03a5caa4a532 100644
--- a/third_party/fast_float/bigint.h
+++ b/third_party/fast_float/bigint.h
@@ -37,8 +37,7 @@ constexpr size_t bigint_limbs = bigint_bits / limb_bits;

 // vector-like type that is allocated on the stack. the entire
 // buffer is pre-allocated, and only the length changes.
-template <uint16_t size>
-struct stackvec {
+template <uint16_t size> struct stackvec {
   limb data[size];
   // we never need more than 150 limbs
   uint16_t length{0};
@@ -54,16 +53,16 @@ struct stackvec {
     FASTFLOAT_ASSERT(try_extend(s));
   }

-  FASTFLOAT_CONSTEXPR14 limb& operator[](size_t index) noexcept {
+  FASTFLOAT_CONSTEXPR14 limb &operator[](size_t index) noexcept {
     FASTFLOAT_DEBUG_ASSERT(index < length);
     return data[index];
   }
-  FASTFLOAT_CONSTEXPR14 const limb& operator[](size_t index) const noexcept {
+  FASTFLOAT_CONSTEXPR14 const limb &operator[](size_t index) const noexcept {
     FASTFLOAT_DEBUG_ASSERT(index < length);
     return data[index];
   }
   // index from the end of the container
-  FASTFLOAT_CONSTEXPR14 const limb& rindex(size_t index) const noexcept {
+  FASTFLOAT_CONSTEXPR14 const limb &rindex(size_t index) const noexcept {
     FASTFLOAT_DEBUG_ASSERT(index < length);
     size_t rindex = length - index - 1;
     return data[rindex];
@@ -73,15 +72,9 @@ struct stackvec {
   FASTFLOAT_CONSTEXPR14 void set_len(size_t len) noexcept {
     length = uint16_t(len);
   }
-  constexpr size_t len() const noexcept {
-    return length;
-  }
-  constexpr bool is_empty() const noexcept {
-    return length == 0;
-  }
-  constexpr size_t capacity() const noexcept {
-    return size;
-  }
+  constexpr size_t len() const noexcept { return length; }
+  constexpr bool is_empty() const noexcept { return length == 0; }
+  constexpr size_t capacity() const noexcept { return size; }
   // append item to vector, without bounds checking
   FASTFLOAT_CONSTEXPR14 void push_unchecked(limb value) noexcept {
     data[length] = value;
@@ -98,7 +91,7 @@ struct stackvec {
   }
   // add items to the vector, from a span, without bounds checking
   FASTFLOAT_CONSTEXPR20 void extend_unchecked(limb_span s) noexcept {
-    limb* ptr = data + length;
+    limb *ptr = data + length;
     std::copy_n(s.ptr, s.len(), ptr);
     set_len(len() + s.len());
   }
@@ -118,8 +111,8 @@ struct stackvec {
   void resize_unchecked(size_t new_len, limb value) noexcept {
     if (new_len > len()) {
       size_t count = new_len - len();
-      limb* first = data + len();
-      limb* last = first + count;
+      limb *first = data + len();
+      limb *last = first + count;
       ::std::fill(first, last, value);
       set_len(new_len);
     } else {
@@ -155,21 +148,21 @@ struct stackvec {
   }
 };

-fastfloat_really_inline FASTFLOAT_CONSTEXPR14
-uint64_t empty_hi64(bool& truncated) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t
+empty_hi64(bool &truncated) noexcept {
   truncated = false;
   return 0;
 }

-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint64_t uint64_hi64(uint64_t r0, bool& truncated) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+uint64_hi64(uint64_t r0, bool &truncated) noexcept {
   truncated = false;
   int shl = leading_zeroes(r0);
   return r0 << shl;
 }

-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint64_t uint64_hi64(uint64_t r0, uint64_t r1, bool& truncated) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+uint64_hi64(uint64_t r0, uint64_t r1, bool &truncated) noexcept {
   int shl = leading_zeroes(r0);
   if (shl == 0) {
     truncated = r1 != 0;
@@ -181,20 +174,20 @@
   }
 }

-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint64_t uint32_hi64(uint32_t r0, bool& truncated) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+uint32_hi64(uint32_t r0, bool &truncated) noexcept {
   return uint64_hi64(r0, truncated);
 }

-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint64_t uint32_hi64(uint32_t r0, uint32_t r1, bool& truncated) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+uint32_hi64(uint32_t r0, uint32_t r1, bool &truncated) noexcept {
   uint64_t x0 = r0;
   uint64_t x1 = r1;
   return uint64_hi64((x0 << 32) | x1, truncated);
 }

-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-uint64_t uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool& truncated) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t
+uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool &truncated) noexcept {
   uint64_t x0 = r0;
   uint64_t x1 = r1;
   uint64_t x2 = r2;
@@ -205,17 +198,17 @@
 // we want an efficient operation. for msvc, where
 // we don't have built-in intrinsics, this is still
 // pretty fast.
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-limb scalar_add(limb x, limb y, bool& overflow) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb
+scalar_add(limb x, limb y, bool &overflow) noexcept {
   limb z;
 // gcc and clang
 #if defined(__has_builtin)
-  #if __has_builtin(__builtin_add_overflow)
-  if (!cpp20_and_in_constexpr()) {
-    overflow = __builtin_add_overflow(x, y, &z);
-    return z;
-  }
-  #endif
+#if __has_builtin(__builtin_add_overflow)
+  if (!cpp20_and_in_constexpr()) {
+    overflow = __builtin_add_overflow(x, y, &z);
+    return z;
+  }
+#endif
 #endif

   // generic, this still optimizes correctly on MSVC.
@@ -225,24 +218,24 @@
 }

 // multiply two small integers, getting both the high and low bits.
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-limb scalar_mul(limb x, limb y, limb& carry) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb
+scalar_mul(limb x, limb y, limb &carry) noexcept {
 #ifdef FASTFLOAT_64BIT_LIMB
-  #if defined(__SIZEOF_INT128__)
+#if defined(__SIZEOF_INT128__)
   // GCC and clang both define it as an extension.
   __uint128_t z = __uint128_t(x) * __uint128_t(y) + __uint128_t(carry);
   carry = limb(z >> limb_bits);
   return limb(z);
-  #else
+#else
   // fallback, no native 128-bit integer multiplication with carry.
   // on msvc, this optimizes identically, somehow.
   value128 z = full_multiplication(x, y);
   bool overflow;
   z.low = scalar_add(z.low, carry, overflow);
-  z.high += uint64_t(overflow);  // cannot overflow
+  z.high += uint64_t(overflow); // cannot overflow
   carry = z.high;
   return z.low;
-  #endif
+#endif
 #else
   uint64_t z = uint64_t(x) * uint64_t(y) + uint64_t(carry);
   carry = limb(z >> limb_bits);
@@ -253,8 +246,8 @@
 // add scalar value to bigint starting from offset.
 // used in grade school multiplication
 template <uint16_t size>
-inline FASTFLOAT_CONSTEXPR20
-bool small_add_from(stackvec<size>& vec, limb y, size_t start) noexcept {
+inline FASTFLOAT_CONSTEXPR20 bool small_add_from(stackvec<size> &vec, limb y,
+                                                 size_t start) noexcept {
   size_t index = start;
   limb carry = y;
   bool overflow;
@@ -271,15 +264,15 @@

 // add scalar value to bigint.
 template <uint16_t size>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-bool small_add(stackvec<size>& vec, limb y) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool
+small_add(stackvec<size> &vec, limb y) noexcept {
   return small_add_from(vec, y, 0);
 }

 // multiply bigint by scalar value.
 template <uint16_t size>
-inline FASTFLOAT_CONSTEXPR20
-bool small_mul(stackvec<size>& vec, limb y) noexcept {
+inline FASTFLOAT_CONSTEXPR20 bool small_mul(stackvec<size> &vec,
+                                            limb y) noexcept {
   limb carry = 0;
   for (size_t index = 0; index < vec.len(); index++) {
     vec[index] = scalar_mul(vec[index], y, carry);
@@ -293,12 +286,12 @@
 // add bigint to bigint starting from index.
 // used in grade school multiplication
 template <uint16_t size>
-FASTFLOAT_CONSTEXPR20
-bool large_add_from(stackvec<size>& x, limb_span y, size_t start) noexcept {
+FASTFLOAT_CONSTEXPR20 bool large_add_from(stackvec<size> &x, limb_span y,
+                                          size_t start) noexcept {
   // the effective x buffer is from `xstart..x.len()`, so exit early
   // if we can't get that current range.
   if (x.len() < start || y.len() > x.len() - start) {
-    FASTFLOAT_TRY(x.try_resize(y.len() + start, 0));
+    FASTFLOAT_TRY(x.try_resize(y.len() + start, 0));
   }

   bool carry = false;
@@ -324,15 +317,14 @@

 // add bigint to bigint.
 template <uint16_t size>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-bool large_add_from(stackvec<size>& x, limb_span y) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool
+large_add_from(stackvec<size> &x, limb_span y) noexcept {
   return large_add_from(x, y, 0);
 }

 // grade-school multiplication algorithm
 template <uint16_t size>
-FASTFLOAT_CONSTEXPR20
-bool long_mul(stackvec<size>& x, limb_span y) noexcept {
+FASTFLOAT_CONSTEXPR20 bool long_mul(stackvec<size> &x, limb_span y) noexcept {
   limb_span xs = limb_span(x.data, x.len());
   stackvec<size> z(xs);
   limb_span zs = limb_span(z.data, z.len());
@@ -360,8 +352,7 @@

 // grade-school multiplication algorithm
 template <uint16_t size>
-FASTFLOAT_CONSTEXPR20
-bool large_mul(stackvec<size>& x, limb_span y) noexcept {
+FASTFLOAT_CONSTEXPR20 bool large_mul(stackvec<size> &x, limb_span y) noexcept {
   if (y.len() == 1) {
     FASTFLOAT_TRY(small_mul(x, y[0]));
   } else {
@@ -370,36 +361,58 @@
   return true;
 }

-template <typename = void>
-struct pow5_tables {
+template <typename = void> struct pow5_tables {
   static constexpr uint32_t large_step = 135;
   static constexpr uint64_t small_power_of_5[] = {
-      1UL, 5UL, 25UL, 125UL, 625UL, 3125UL, 15625UL, 78125UL, 390625UL,
-      1953125UL, 9765625UL, 48828125UL, 244140625UL, 1220703125UL,
-      6103515625UL, 30517578125UL, 152587890625UL, 762939453125UL,
-      3814697265625UL, 19073486328125UL, 95367431640625UL, 476837158203125UL,
-      2384185791015625UL, 11920928955078125UL, 59604644775390625UL,
-      298023223876953125UL, 1490116119384765625UL, 7450580596923828125UL,
+      1UL,
+      5UL,
+      25UL,
+      125UL,
+      625UL,
+      3125UL,
+      15625UL,
+      78125UL,
+      390625UL,
+      1953125UL,
+      9765625UL,
+      48828125UL,
+      244140625UL,
+      1220703125UL,
+      6103515625UL,
+      30517578125UL,
+      152587890625UL,
+      762939453125UL,
+      3814697265625UL,
+      19073486328125UL,
+      95367431640625UL,
+      476837158203125UL,
+      2384185791015625UL,
+      11920928955078125UL,
+      59604644775390625UL,
+      298023223876953125UL,
+      1490116119384765625UL,
+      7450580596923828125UL,
   };
 #ifdef FASTFLOAT_64BIT_LIMB
   constexpr static limb large_power_of_5[] = {
-      1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL,
-      10482974169319127550UL, 198276706040285095UL};
+      1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL,
+      10482974169319127550UL, 198276706040285095UL};
 #else
   constexpr static limb large_power_of_5[] = {
-      4279965485U, 329373468U, 4020270615U, 2137533757U, 4287402176U,
-      1057042919U, 1071430142U, 2440757623U, 381945767U, 46164893U};
+      4279965485U, 329373468U,  4020270615U, 2137533757U, 4287402176U,
+      1057042919U, 1071430142U, 2440757623U, 381945767U,  46164893U};
 #endif
 };

-template <typename T>
-constexpr uint32_t pow5_tables<T>::large_step;
+#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE
+
+template <typename T> constexpr uint32_t pow5_tables<T>::large_step;

-template <typename T>
-constexpr uint64_t pow5_tables<T>::small_power_of_5[];
+template <typename T> constexpr uint64_t pow5_tables<T>::small_power_of_5[];

-template <typename T>
-constexpr limb pow5_tables<T>::large_power_of_5[];
+template <typename T> constexpr limb pow5_tables<T>::large_power_of_5[];
+
+#endif

 // big integer type. implements a small subset of big integer
 // arithmetic, using simple algorithms since asymptotically
@@ -409,13 +422,13 @@ struct bigint : pow5_tables<> {
   // storage of the limbs, in little-endian order.
   stackvec<bigint_limbs> vec;

-  FASTFLOAT_CONSTEXPR20 bigint(): vec() {}
+  FASTFLOAT_CONSTEXPR20 bigint() : vec() {}
   bigint(const bigint &) = delete;
   bigint &operator=(const bigint &) = delete;
   bigint(bigint &&) = delete;
   bigint &operator=(bigint &&other) = delete;

-  FASTFLOAT_CONSTEXPR20 bigint(uint64_t value): vec() {
+  FASTFLOAT_CONSTEXPR20 bigint(uint64_t value) : vec() {
 #ifdef FASTFLOAT_64BIT_LIMB
     vec.push_unchecked(value);
 #else
@@ -427,7 +440,7 @@ struct bigint : pow5_tables<> {

   // get the high 64 bits from the vector, and if bits were truncated.
   // this is to get the significant digits for the float.
-  FASTFLOAT_CONSTEXPR20 uint64_t hi64(bool& truncated) const noexcept {
+  FASTFLOAT_CONSTEXPR20 uint64_t hi64(bool &truncated) const noexcept {
 #ifdef FASTFLOAT_64BIT_LIMB
     if (vec.len() == 0) {
       return empty_hi64(truncated);
@@ -446,7 +459,8 @@
     } else if (vec.len() == 2) {
       return uint32_hi64(vec.rindex(0), vec.rindex(1), truncated);
     } else {
-      uint64_t result = uint32_hi64(vec.rindex(0), vec.rindex(1), vec.rindex(2), truncated);
+      uint64_t result =
+          uint32_hi64(vec.rindex(0), vec.rindex(1), vec.rindex(2), truncated);
       truncated |= vec.nonzero(3);
       return result;
     }
@@ -459,7 +473,7 @@
   // positive, this is larger, otherwise they are equal.
   // the limbs are stored in little-endian order, so we
   // must compare the limbs in reverse order.
-  FASTFLOAT_CONSTEXPR20 int compare(const bigint& other) const noexcept {
+  FASTFLOAT_CONSTEXPR20 int compare(const bigint &other) const noexcept {
     if (vec.len() > other.vec.len()) {
       return 1;
     } else if (vec.len() < other.vec.len()) {
@@ -512,12 +526,12 @@
       return false;
     } else if (!vec.is_empty()) {
       // move limbs
-      limb* dst = vec.data + n;
-      const limb* src = vec.data;
+      limb *dst = vec.data + n;
+      const limb *src = vec.data;
       std::copy_backward(src, src + vec.len(), dst + vec.len());
       // fill in empty limbs
-      limb* first = vec.data;
-      limb* last = first + n;
+      limb *first = vec.data;
+      limb *last = first + n;
       ::std::fill(first, last, 0);
       vec.set_len(n + vec.len());
       return true;
@@ -560,18 +574,12 @@
     return int(limb_bits * vec.len()) - lz;
   }

-  FASTFLOAT_CONSTEXPR20 bool mul(limb y) noexcept {
-    return small_mul(vec, y);
-  }
+  FASTFLOAT_CONSTEXPR20 bool mul(limb y) noexcept { return small_mul(vec, y); }

-  FASTFLOAT_CONSTEXPR20 bool add(limb y) noexcept {
-    return small_add(vec, y);
-  }
+  FASTFLOAT_CONSTEXPR20 bool add(limb y) noexcept { return small_add(vec, y); }

   // multiply as if by 2 raised to a power.
-  FASTFLOAT_CONSTEXPR20 bool pow2(uint32_t exp) noexcept {
-    return shl(exp);
-  }
+  FASTFLOAT_CONSTEXPR20 bool pow2(uint32_t exp) noexcept { return shl(exp); }

   // multiply as if by 5 raised to a power.
   FASTFLOAT_CONSTEXPR20 bool pow5(uint32_t exp) noexcept {
@@ -597,9 +605,8 @@
       // Work around clang bug https://godbolt.org/z/zedh7rrhc
       // This is similar to https://github.com/llvm/llvm-project/issues/47746,
       // except the workaround described there don't work here
-      FASTFLOAT_TRY(
-        small_mul(vec, limb(((void)small_power_of_5[0], small_power_of_5[exp])))
-      );
+      FASTFLOAT_TRY(small_mul(
+          vec, limb(((void)small_power_of_5[0], small_power_of_5[exp]))));
     }
     return true;
diff --git a/third_party/fast_float/constexpr_feature_detect.h b/third_party/fast_float/constexpr_feature_detect.h
index ba8b65c64a16..7624beafcacf 100644
--- a/third_party/fast_float/constexpr_feature_detect.h
+++ b/third_party/fast_float/constexpr_feature_detect.h
@@ -20,16 +20,16 @@
 #define FASTFLOAT_HAS_BIT_CAST 0
 #endif

-#if defined(__cpp_lib_is_constant_evaluated) && __cpp_lib_is_constant_evaluated >= 201811L
+#if defined(__cpp_lib_is_constant_evaluated) && \
+    __cpp_lib_is_constant_evaluated >= 201811L
 #define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 1
 #else
 #define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 0
 #endif

 // Testing for relevant C++20 constexpr library features
-#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED \
-    && FASTFLOAT_HAS_BIT_CAST \
-    && __cpp_lib_constexpr_algorithms >= 201806L /*For std::copy and std::fill*/
+#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED && FASTFLOAT_HAS_BIT_CAST && \
+    __cpp_lib_constexpr_algorithms >= 201806L /*For std::copy and std::fill*/
 #define FASTFLOAT_CONSTEXPR20 constexpr
 #define FASTFLOAT_IS_CONSTEXPR 1
 #else
@@ -37,4 +37,10 @@
 #define FASTFLOAT_IS_CONSTEXPR 0
 #endif

+#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#define FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE 0
+#else
+#define FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE 1
+#endif
+
 #endif // FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H
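The new FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE guard above exists because the language rule changed in C++17: before C++17 an odr-used static constexpr data member needs an out-of-class definition, while since C++17 such members are implicitly inline. A minimal illustration (not fast_float code):

    #include <cstdio>

    struct Table {
        // In-class declaration; since C++17 this is also a definition.
        static constexpr int values[3] = {1, 2, 3};
    };

    #if __cplusplus < 201703L
    // Required pre-C++17 whenever the member is odr-used.
    constexpr int Table::values[3];
    #endif

    int main()
    {
        const int *p = &Table::values[0];  // odr-use: takes the address
        printf("%d\n", *p);
        return 0;
    }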
diff --git a/third_party/fast_float/decimal_to_binary.h b/third_party/fast_float/decimal_to_binary.h
index fec916f3a07b..70ecf73c8ea6 100644
--- a/third_party/fast_float/decimal_to_binary.h
+++ b/third_party/fast_float/decimal_to_binary.h
@@ -12,27 +12,34 @@

 namespace fast_float {

-// This will compute or rather approximate w * 5**q and return a pair of 64-bit words approximating
-// the result, with the "high" part corresponding to the most significant bits and the
-// low part corresponding to the least significant bits.
+// This will compute or rather approximate w * 5**q and return a pair of 64-bit
+// words approximating the result, with the "high" part corresponding to the
+// most significant bits and the low part corresponding to the least
+// significant bits.
 //
 template <int bit_precision>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-value128 compute_product_approximation(int64_t q, uint64_t w) {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 value128
+compute_product_approximation(int64_t q, uint64_t w) {
   const int index = 2 * int(q - powers::smallest_power_of_five);
-  // For small values of q, e.g., q in [0,27], the answer is always exact because
-  // The line value128 firstproduct = full_multiplication(w, power_of_five_128[index]);
-  // gives the exact answer.
-  value128 firstproduct = full_multiplication(w, powers::power_of_five_128[index]);
-  static_assert((bit_precision >= 0) && (bit_precision <= 64), " precision should be in (0,64]");
-  constexpr uint64_t precision_mask = (bit_precision < 64) ?
-               (uint64_t(0xFFFFFFFFFFFFFFFF) >> bit_precision)
-               : uint64_t(0xFFFFFFFFFFFFFFFF);
-  if((firstproduct.high & precision_mask) == precision_mask) { // could further guard with (lower + w < lower)
-    // regarding the second product, we only need secondproduct.high, but our expectation is that the compiler will optimize this extra work away if needed.
-    value128 secondproduct = full_multiplication(w, powers::power_of_five_128[index + 1]);
+  // For small values of q, e.g., q in [0,27], the answer is always exact
+  // because The line value128 firstproduct = full_multiplication(w,
+  // power_of_five_128[index]); gives the exact answer.
+  value128 firstproduct =
+      full_multiplication(w, powers::power_of_five_128[index]);
+  static_assert((bit_precision >= 0) && (bit_precision <= 64),
+                " precision should be in (0,64]");
+  constexpr uint64_t precision_mask =
+      (bit_precision < 64) ? (uint64_t(0xFFFFFFFFFFFFFFFF) >> bit_precision)
+                           : uint64_t(0xFFFFFFFFFFFFFFFF);
+  if ((firstproduct.high & precision_mask) ==
+      precision_mask) { // could further guard with (lower + w < lower)
+    // regarding the second product, we only need secondproduct.high, but our
+    // expectation is that the compiler will optimize this extra work away if
+    // needed.
+    value128 secondproduct =
+        full_multiplication(w, powers::power_of_five_128[index + 1]);
     firstproduct.low += secondproduct.high;
-    if(secondproduct.high > firstproduct.low) {
+    if (secondproduct.high > firstproduct.low) {
       firstproduct.high++;
     }
   }
@@ -55,43 +62,45 @@ namespace detail {
  * where
  * p = log(5**-q)/log(2) = -q * log(5)/log(2)
  */
-  constexpr fastfloat_really_inline int32_t power(int32_t q) noexcept {
-    return (((152170 + 65536) * q) >> 16) + 63;
-  }
+constexpr fastfloat_really_inline int32_t power(int32_t q) noexcept {
+  return (((152170 + 65536) * q) >> 16) + 63;
+}
 } // namespace detail

 // create an adjusted mantissa, biased by the invalid power2
 // for significant digits already multiplied by 10 ** q.
 template <typename binary>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR14
-adjusted_mantissa compute_error_scaled(int64_t q, uint64_t w, int lz) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR14 adjusted_mantissa
+compute_error_scaled(int64_t q, uint64_t w, int lz) noexcept {
   int hilz = int(w >> 63) ^ 1;
   adjusted_mantissa answer;
   answer.mantissa = w << hilz;
   int bias = binary::mantissa_explicit_bits() - binary::minimum_exponent();
-  answer.power2 = int32_t(detail::power(int32_t(q)) + bias - hilz - lz - 62 + invalid_am_bias);
+  answer.power2 = int32_t(detail::power(int32_t(q)) + bias - hilz - lz - 62 +
+                          invalid_am_bias);
   return answer;
 }

 // w * 10 ** q, without rounding the representation up.
 // the power2 in the exponent will be adjusted by invalid_am_bias.
 template <typename binary>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-adjusted_mantissa compute_error(int64_t q, uint64_t w) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa
+compute_error(int64_t q, uint64_t w) noexcept {
   int lz = leading_zeroes(w);
   w <<= lz;
-  value128 product = compute_product_approximation<binary::mantissa_explicit_bits() + 3>(q, w);
+  value128 product =
+      compute_product_approximation<binary::mantissa_explicit_bits() + 3>(q, w);
   return compute_error_scaled<binary>(q, product.high, lz);
 }

 // w * 10 ** q
-// The returned value should be a valid ieee64 number that simply need to be packed.
-// However, in some very rare cases, the computation will fail. In such cases, we
-// return an adjusted_mantissa with a negative power of 2: the caller should recompute
-// in such cases.
+// The returned value should be a valid ieee64 number that simply need to be
+// packed. However, in some very rare cases, the computation will fail. In such
+// cases, we return an adjusted_mantissa with a negative power of 2: the caller
+// should recompute in such cases.
 template <typename binary>
-fastfloat_really_inline FASTFLOAT_CONSTEXPR20
-adjusted_mantissa compute_float(int64_t q, uint64_t w) noexcept {
+fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa
+compute_float(int64_t q, uint64_t w) noexcept {
   adjusted_mantissa answer;
   if ((w == 0) || (q < binary::smallest_power_of_ten())) {
     answer.power2 = 0;
@@ -105,7 +114,8 @@ adjusted_mantissa compute_float(int64_t q, uint64_t w) noexcept {
     answer.mantissa = 0;
     return answer;
   }
-  // At this point in time q is in [powers::smallest_power_of_five, powers::largest_power_of_five].
+  // At this point in time q is in [powers::smallest_power_of_five,
+  // powers::largest_power_of_five].

   // We want the most significant bit of i to be 1. Shift if needed.
   int lz = leading_zeroes(w);
@@ -114,26 +124,32 @@
   // The required precision is binary::mantissa_explicit_bits() + 3 because
   // 1. We need the implicit bit
   // 2. We need an extra bit for rounding purposes
-  // 3. We might lose a bit due to the "upperbit" routine (result too small, requiring a shift)
+  // 3. We might lose a bit due to the "upperbit" routine (result too small,
+  //    requiring a shift)

-  value128 product = compute_product_approximation<binary::mantissa_explicit_bits() + 3>(q, w);
+  value128 product =
+      compute_product_approximation<binary::mantissa_explicit_bits() + 3>(q, w);
   // The computed 'product' is always sufficient.
   // Mathematical proof:
-  // Noble Mushtak and Daniel Lemire, Fast Number Parsing Without Fallback (to appear)
-  // See script/mushtak_lemire.py
+  // Noble Mushtak and Daniel Lemire, Fast Number Parsing Without Fallback (to
+  // appear) See script/mushtak_lemire.py

-  // The "compute_product_approximation" function can be slightly slower than a branchless approach:
-  // value128 product = compute_product(q, w);
-  // but in practice, we can win big with the compute_product_approximation if its additional branch
-  // is easily predicted. Which is best is data specific.
+  // The "compute_product_approximation" function can be slightly slower than a
+  // branchless approach: value128 product = compute_product(q, w); but in
+  // practice, we can win big with the compute_product_approximation if its
+  // additional branch is easily predicted. Which is best is data specific.
   int upperbit = int(product.high >> 63);
+  int shift = upperbit + 64 - binary::mantissa_explicit_bits() - 3;

-  answer.mantissa = product.high >> (upperbit + 64 - binary::mantissa_explicit_bits() - 3);
+  answer.mantissa = product.high >> shift;

-  answer.power2 = int32_t(detail::power(int32_t(q)) + upperbit - lz - binary::minimum_exponent());
+  answer.power2 = int32_t(detail::power(int32_t(q)) + upperbit - lz -
+                          binary::minimum_exponent());
   if (answer.power2 <= 0) { // we have a subnormal?
     // Here have that answer.power2 <= 0 so -answer.power2 >= 0
-    if(-answer.power2 + 1 >= 64) { // if we have more than 64 bits below the minimum exponent, you have a zero for sure.
+    if (-answer.power2 + 1 >=
+        64) { // if we have more than 64 bits below the minimum exponent, you
+              // have a zero for sure.
       answer.power2 = 0;
       answer.mantissa = 0;
       // result should be zero
@@ -152,20 +168,26 @@
     // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer
     // subnormal, but we can only know this after rounding.
     // So we only declare a subnormal if we are smaller than the threshold.
-    answer.power2 = (answer.mantissa < (uint64_t(1) << binary::mantissa_explicit_bits())) ? 0 : 1;
+    answer.power2 =
+        (answer.mantissa < (uint64_t(1) << binary::mantissa_explicit_bits()))
+            ? 0
+            : 1;
     return answer;
   }

   // usually, we round *up*, but if we fall right in between and and we have an
   // even basis, we need to round down
   // We are only concerned with the cases where 5**q fits in single 64-bit word.
-  if ((product.low <= 1) && (q >= binary::min_exponent_round_to_even()) && (q <= binary::max_exponent_round_to_even()) &&
-      ((answer.mantissa & 3) == 1) ) { // we may fall between two floats!
+  if ((product.low <= 1) && (q >= binary::min_exponent_round_to_even()) &&
+      (q <= binary::max_exponent_round_to_even()) &&
+      ((answer.mantissa & 3) == 1)) { // we may fall between two floats!
     // To be in-between two floats we need that in doing
-    // answer.mantissa = product.high >> (upperbit + 64 - binary::mantissa_explicit_bits() - 3);
-    // ... we dropped out only zeroes. But if this happened, then we can go back!!!
-    if((answer.mantissa << (upperbit + 64 - binary::mantissa_explicit_bits() - 3)) == product.high) {
-      answer.mantissa &= ~uint64_t(1);          // flip it so that we do not round up
+    //   answer.mantissa = product.high >> (upperbit + 64 -
+    //   binary::mantissa_explicit_bits() - 3);
+    // ... we dropped out only zeroes. But if this happened, then we can go
+    // back!!!
+    if ((answer.mantissa << shift) == product.high) {
+      answer.mantissa &= ~uint64_t(1); // flip it so that we do not round up
     }
   }
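The compute_float() change above computes `shift` once and reuses it in the round-to-even test. The 128-bit products it relies on come from full_multiplication(); a standalone sketch of that primitive using the __uint128_t extension (fast_float itself selects among several portable implementations):

    #include <cstdint>
    #include <cstdio>

    struct value128 { uint64_t low, high; };

    static value128 full_multiplication(uint64_t a, uint64_t b)
    {
        __uint128_t p = __uint128_t(a) * b;  // exact 64x64 -> 128 product
        return {uint64_t(p), uint64_t(p >> 64)};
    }

    int main()
    {
        value128 r = full_multiplication(0xFFFFFFFFFFFFFFFFULL, 2);
        // Expect high=1, low=0xFFFFFFFFFFFFFFFE.
        printf("high=%llu low=%llu\n",
               (unsigned long long)r.high, (unsigned long long)r.low);
        return 0;
    }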
+ if ((answer.mantissa << shift) == product.high) { + answer.mantissa &= ~uint64_t(1); // flip it so that we do not round up } } diff --git a/third_party/fast_float/digit_comparison.h b/third_party/fast_float/digit_comparison.h index 512a27f5a5f4..303fff91eb1f 100644 --- a/third_party/fast_float/digit_comparison.h +++ b/third_party/fast_float/digit_comparison.h @@ -13,19 +13,34 @@ namespace fast_float { // 1e0 to 1e19 -constexpr static uint64_t powers_of_ten_uint64[] = { - 1UL, 10UL, 100UL, 1000UL, 10000UL, 100000UL, 1000000UL, 10000000UL, 100000000UL, - 1000000000UL, 10000000000UL, 100000000000UL, 1000000000000UL, 10000000000000UL, - 100000000000000UL, 1000000000000000UL, 10000000000000000UL, 100000000000000000UL, - 1000000000000000000UL, 10000000000000000000UL}; +constexpr static uint64_t powers_of_ten_uint64[] = {1UL, + 10UL, + 100UL, + 1000UL, + 10000UL, + 100000UL, + 1000000UL, + 10000000UL, + 100000000UL, + 1000000000UL, + 10000000000UL, + 100000000000UL, + 1000000000000UL, + 10000000000000UL, + 100000000000000UL, + 1000000000000000UL, + 10000000000000000UL, + 100000000000000000UL, + 1000000000000000000UL, + 10000000000000000000UL}; // calculate the exponent, in scientific notation, of the number. // this algorithm is not even close to optimized, but it has no practical // effect on performance: in order to have a faster algorithm, we'd need // to slow down performance for faster algorithms, and this is still fast. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -int32_t scientific_exponent(parsed_number_string_t & num) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int32_t +scientific_exponent(parsed_number_string_t &num) noexcept { uint64_t mantissa = num.mantissa; int32_t exponent = int32_t(num.exponent); while (mantissa >= 10000) { @@ -45,15 +60,16 @@ int32_t scientific_exponent(parsed_number_string_t & num) noexcept { // this converts a native floating-point number to an extended-precision float. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -adjusted_mantissa to_extended(T value) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +to_extended(T value) noexcept { using equiv_uint = typename binary_format::equiv_uint; constexpr equiv_uint exponent_mask = binary_format::exponent_mask(); constexpr equiv_uint mantissa_mask = binary_format::mantissa_mask(); constexpr equiv_uint hidden_bit_mask = binary_format::hidden_bit_mask(); adjusted_mantissa am; - int32_t bias = binary_format::mantissa_explicit_bits() - binary_format::minimum_exponent(); + int32_t bias = binary_format::mantissa_explicit_bits() - + binary_format::minimum_exponent(); equiv_uint bits; #if FASTFLOAT_HAS_BIT_CAST bits = std::bit_cast(value); @@ -66,7 +82,8 @@ adjusted_mantissa to_extended(T value) noexcept { am.mantissa = bits & mantissa_mask; } else { // normal - am.power2 = int32_t((bits & exponent_mask) >> binary_format::mantissa_explicit_bits()); + am.power2 = int32_t((bits & exponent_mask) >> + binary_format::mantissa_explicit_bits()); am.power2 -= bias; am.mantissa = (bits & mantissa_mask) | hidden_bit_mask; } @@ -78,8 +95,8 @@ adjusted_mantissa to_extended(T value) noexcept { // we are given a native float that represents b, so we need to adjust it // halfway between b and b+u. 
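For the binary64 case, the decomposition that to_extended performs above works out to the following, with the format constants written in by hand (a sketch under that assumption only; the library abstracts float and double behind binary_format<T>, and the names here are ours):

    #include <cstdint>
    #include <cstring>

    struct decomposed {
      uint64_t mantissa;
      int32_t power2; // value == mantissa * 2^power2
    };

    inline decomposed to_extended_sketch(double value) {
      const uint64_t exponent_mask = 0x7FF0000000000000ULL;
      const uint64_t mantissa_mask = 0x000FFFFFFFFFFFFFULL;
      const uint64_t hidden_bit = 0x0010000000000000ULL;
      const int32_t bias = 52 + 1023; // mantissa bits minus minimum exponent
      uint64_t bits;
      std::memcpy(&bits, &value, sizeof(bits)); // stand-in for std::bit_cast
      decomposed d;
      if ((bits & exponent_mask) == 0) { // subnormal: no hidden bit
        d.power2 = 1 - bias;
        d.mantissa = bits & mantissa_mask;
      } else { // normal: re-attach the hidden bit
        d.power2 = int32_t((bits & exponent_mask) >> 52) - bias;
        d.mantissa = (bits & mantissa_mask) | hidden_bit;
      }
      return d;
    }

For example, 1.0 decomposes to mantissa 2^52 and power2 -52.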
template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -adjusted_mantissa to_extended_halfway(T value) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +to_extended_halfway(T value) noexcept { adjusted_mantissa am = to_extended(value); am.mantissa <<= 1; am.mantissa += 1; @@ -89,15 +106,18 @@ adjusted_mantissa to_extended_halfway(T value) noexcept { // round an extended-precision float to the nearest machine float. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -void round(adjusted_mantissa& am, callback cb) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void round(adjusted_mantissa &am, + callback cb) noexcept { int32_t mantissa_shift = 64 - binary_format::mantissa_explicit_bits() - 1; if (-am.power2 >= mantissa_shift) { // have a denormal float int32_t shift = -am.power2 + 1; cb(am, std::min(shift, 64)); // check for round-up: if rounding-nearest carried us to the hidden bit. - am.power2 = (am.mantissa < (uint64_t(1) << binary_format::mantissa_explicit_bits())) ? 0 : 1; + am.power2 = (am.mantissa < + (uint64_t(1) << binary_format::mantissa_explicit_bits())) + ? 0 + : 1; return; } @@ -105,7 +125,8 @@ void round(adjusted_mantissa& am, callback cb) noexcept { cb(am, mantissa_shift); // check for carry - if (am.mantissa >= (uint64_t(2) << binary_format::mantissa_explicit_bits())) { + if (am.mantissa >= + (uint64_t(2) << binary_format::mantissa_explicit_bits())) { am.mantissa = (uint64_t(1) << binary_format::mantissa_explicit_bits()); am.power2++; } @@ -119,16 +140,11 @@ void round(adjusted_mantissa& am, callback cb) noexcept { } template -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -void round_nearest_tie_even(adjusted_mantissa& am, int32_t shift, callback cb) noexcept { - const uint64_t mask - = (shift == 64) - ? UINT64_MAX - : (uint64_t(1) << shift) - 1; - const uint64_t halfway - = (shift == 0) - ? 0 - : uint64_t(1) << (shift - 1); +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +round_nearest_tie_even(adjusted_mantissa &am, int32_t shift, + callback cb) noexcept { + const uint64_t mask = (shift == 64) ? UINT64_MAX : (uint64_t(1) << shift) - 1; + const uint64_t halfway = (shift == 0) ? 0 : uint64_t(1) << (shift - 1); uint64_t truncated_bits = am.mantissa & mask; bool is_above = truncated_bits > halfway; bool is_halfway = truncated_bits == halfway; @@ -145,8 +161,8 @@ void round_nearest_tie_even(adjusted_mantissa& am, int32_t shift, callback cb) n am.mantissa += uint64_t(cb(is_odd, is_halfway, is_above)); } -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -void round_down(adjusted_mantissa& am, int32_t shift) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +round_down(adjusted_mantissa &am, int32_t shift) noexcept { if (shift == 64) { am.mantissa = 0; } else { @@ -155,10 +171,11 @@ void round_down(adjusted_mantissa& am, int32_t shift) noexcept { am.power2 += shift; } template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void skip_zeros(UC const * & first, UC const * last) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +skip_zeros(UC const *&first, UC const *last) noexcept { uint64_t val; - while (!cpp20_and_in_constexpr() && std::distance(first, last) >= int_cmp_len()) { + while (!cpp20_and_in_constexpr() && + std::distance(first, last) >= int_cmp_len()) { ::memcpy(&val, first, sizeof(uint64_t)); if (val != int_cmp_zeros()) { break; @@ -176,11 +193,12 @@ void skip_zeros(UC const * & first, UC const * last) noexcept { // determine if any non-zero digits were truncated. 
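The mask/halfway arithmetic in round_nearest_tie_even above is easier to see in a flattened form: a plain round-to-nearest, ties-to-even right shift with no callback indirection (shift restricted to 1..63; the helper name is ours):

    #include <cstdint>

    // Shift v right by `shift` bits, rounding to nearest, ties to even.
    inline uint64_t round_shift_tie_even(uint64_t v, int shift) {
      const uint64_t mask = (uint64_t(1) << shift) - 1;
      const uint64_t halfway = uint64_t(1) << (shift - 1);
      const uint64_t truncated = v & mask;
      v >>= shift;
      const bool is_above = truncated > halfway;
      const bool is_halfway = truncated == halfway;
      const bool is_odd = (v & 1) != 0;
      return v + uint64_t(is_above || (is_halfway && is_odd));
    }

    // round_shift_tie_even(5, 1) == 2 (2.5 ties to even 2)
    // round_shift_tie_even(7, 1) == 4 (3.5 ties to even 4)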
// all characters must be valid digits. template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -bool is_truncated(UC const * first, UC const * last) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +is_truncated(UC const *first, UC const *last) noexcept { // do 8-bit optimizations, can just compare to 8 literal 0s. uint64_t val; - while (!cpp20_and_in_constexpr() && std::distance(first, last) >= int_cmp_len()) { + while (!cpp20_and_in_constexpr() && + std::distance(first, last) >= int_cmp_len()) { ::memcpy(&val, first, sizeof(uint64_t)); if (val != int_cmp_zeros()) { return true; @@ -196,15 +214,15 @@ bool is_truncated(UC const * first, UC const * last) noexcept { return false; } template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -bool is_truncated(span s) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +is_truncated(span s) noexcept { return is_truncated(s.ptr, s.ptr + s.len()); } - template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void parse_eight_digits(const UC*& p, limb& value, size_t& counter, size_t& count) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +parse_eight_digits(const UC *&p, limb &value, size_t &counter, + size_t &count) noexcept { value = value * 100000000 + parse_eight_digits_unrolled(p); p += 8; counter += 8; @@ -212,22 +230,23 @@ void parse_eight_digits(const UC*& p, limb& value, size_t& counter, size_t& coun } template -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -void parse_one_digit(UC const *& p, limb& value, size_t& counter, size_t& count) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +parse_one_digit(UC const *&p, limb &value, size_t &counter, + size_t &count) noexcept { value = value * 10 + limb(*p - UC('0')); p++; counter++; count++; } -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void add_native(bigint& big, limb power, limb value) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +add_native(bigint &big, limb power, limb value) noexcept { big.mul(power); big.add(value); } -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void round_up_bigint(bigint& big, size_t& count) noexcept { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +round_up_bigint(bigint &big, size_t &count) noexcept { // need to round-up the digits, but need to avoid rounding // ....9999 to ...10000, which could cause a false halfway point. add_native(big, 10, 1); @@ -236,8 +255,9 @@ void round_up_bigint(bigint& big, size_t& count) noexcept { // parse the significant digits into a big integer template -inline FASTFLOAT_CONSTEXPR20 -void parse_mantissa(bigint& result, parsed_number_string_t& num, size_t max_digits, size_t& digits) noexcept { +inline FASTFLOAT_CONSTEXPR20 void +parse_mantissa(bigint &result, parsed_number_string_t &num, + size_t max_digits, size_t &digits) noexcept { // try to minimize the number of big integer and scalar multiplication. // therefore, try to parse 8 digits at a time, and multiply by the largest // scalar value (9 or 19 digits) for each step. @@ -251,12 +271,13 @@ void parse_mantissa(bigint& result, parsed_number_string_t& num, size_t max_ #endif // process all integer digits. 
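parse_eight_digits above delegates to parse_eight_digits_unrolled, the SWAR trick that converts eight ASCII digits with three multiplications instead of a per-character loop. An equivalent standalone version (assuming a little-endian host and that all eight bytes are in '0'..'9'; the _sketch name is ours):

    #include <cstdint>
    #include <cstring>

    inline uint32_t parse_eight_digits_sketch(const char *chars) {
      uint64_t val;
      std::memcpy(&val, chars, 8);
      // combine neighbouring digits pairwise: byte, 16-bit, then 32-bit lanes
      val = (val & 0x0F0F0F0F0F0F0F0FULL) * 2561 >> 8;
      val = (val & 0x00FF00FF00FF00FFULL) * 6553601 >> 16;
      return uint32_t((val & 0x0000FFFF0000FFFFULL) * 42949672960001ULL >> 32);
    }

    // parse_eight_digits_sketch("12345678") == 12345678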
- UC const * p = num.integer.ptr; - UC const * pend = p + num.integer.len(); + UC const *p = num.integer.ptr; + UC const *pend = p + num.integer.len(); skip_zeros(p, pend); // process all digits, in increments of step per loop while (p != pend) { - while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) { + while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && + (max_digits - digits >= 8)) { parse_eight_digits(p, value, counter, digits); } while (counter < step && p != pend && digits < max_digits) { @@ -289,7 +310,8 @@ void parse_mantissa(bigint& result, parsed_number_string_t& num, size_t max_ } // process all digits, in increments of step per loop while (p != pend) { - while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && (max_digits - digits >= 8)) { + while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && + (max_digits - digits >= 8)) { parse_eight_digits(p, value, counter, digits); } while (counter < step && p != pend && digits < max_digits) { @@ -317,19 +339,23 @@ void parse_mantissa(bigint& result, parsed_number_string_t& num, size_t max_ } template -inline FASTFLOAT_CONSTEXPR20 -adjusted_mantissa positive_digit_comp(bigint& bigmant, int32_t exponent) noexcept { +inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +positive_digit_comp(bigint &bigmant, int32_t exponent) noexcept { FASTFLOAT_ASSERT(bigmant.pow10(uint32_t(exponent))); adjusted_mantissa answer; bool truncated; answer.mantissa = bigmant.hi64(truncated); - int bias = binary_format::mantissa_explicit_bits() - binary_format::minimum_exponent(); + int bias = binary_format::mantissa_explicit_bits() - + binary_format::minimum_exponent(); answer.power2 = bigmant.bit_length() - 64 + bias; - round(answer, [truncated](adjusted_mantissa& a, int32_t shift) { - round_nearest_tie_even(a, shift, [truncated](bool is_odd, bool is_halfway, bool is_above) -> bool { - return is_above || (is_halfway && truncated) || (is_odd && is_halfway); - }); + round(answer, [truncated](adjusted_mantissa &a, int32_t shift) { + round_nearest_tie_even( + a, shift, + [truncated](bool is_odd, bool is_halfway, bool is_above) -> bool { + return is_above || (is_halfway && truncated) || + (is_odd && is_halfway); + }); }); return answer; @@ -341,15 +367,17 @@ adjusted_mantissa positive_digit_comp(bigint& bigmant, int32_t exponent) noexcep // we then need to scale by `2^(f - e)`, and then the two significant digits // are of the same magnitude. template -inline FASTFLOAT_CONSTEXPR20 -adjusted_mantissa negative_digit_comp(bigint& bigmant, adjusted_mantissa am, int32_t exponent) noexcept { - bigint& real_digits = bigmant; +inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa negative_digit_comp( + bigint &bigmant, adjusted_mantissa am, int32_t exponent) noexcept { + bigint &real_digits = bigmant; int32_t real_exp = exponent; // get the value of `b`, rounded down, and get a bigint representation of b+h adjusted_mantissa am_b = am; - // gcc7 buf: use a lambda to remove the noexcept qualifier bug with -Wnoexcept-type. - round(am_b, [](adjusted_mantissa&a, int32_t shift) { round_down(a, shift); }); + // gcc7 bug: use a lambda to remove the noexcept qualifier bug with + // -Wnoexcept-type.
+ round(am_b, + [](adjusted_mantissa &a, int32_t shift) { round_down(a, shift); }); T b; to_float(false, am_b, b); adjusted_mantissa theor = to_extended_halfway(b); @@ -371,18 +399,19 @@ adjusted_mantissa negative_digit_comp(bigint& bigmant, adjusted_mantissa am, int // compare digits, and use it to direct rounding int ord = real_digits.compare(theor_digits); adjusted_mantissa answer = am; - round(answer, [ord](adjusted_mantissa& a, int32_t shift) { - round_nearest_tie_even(a, shift, [ord](bool is_odd, bool _, bool __) -> bool { - (void)_; // not needed, since we've done our comparison - (void)__; // not needed, since we've done our comparison - if (ord > 0) { - return true; - } else if (ord < 0) { - return false; - } else { - return is_odd; - } - }); + round(answer, [ord](adjusted_mantissa &a, int32_t shift) { + round_nearest_tie_even( + a, shift, [ord](bool is_odd, bool _, bool __) -> bool { + (void)_; // not needed, since we've done our comparison + (void)__; // not needed, since we've done our comparison + if (ord > 0) { + return true; + } else if (ord < 0) { + return false; + } else { + return is_odd; + } + }); }); return answer; @@ -402,8 +431,8 @@ adjusted_mantissa negative_digit_comp(bigint& bigmant, adjusted_mantissa am, int // the actual digits. we then compare the big integer representations // of both, and use that to direct rounding. template -inline FASTFLOAT_CONSTEXPR20 -adjusted_mantissa digit_comp(parsed_number_string_t& num, adjusted_mantissa am) noexcept { +inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +digit_comp(parsed_number_string_t &num, adjusted_mantissa am) noexcept { // remove the invalid exponent bias am.power2 -= invalid_am_bias; diff --git a/third_party/fast_float/fast_float.h b/third_party/fast_float/fast_float.h index 04efa877ee7b..42a3cdfaf4c4 100644 --- a/third_party/fast_float/fast_float.h +++ b/third_party/fast_float/fast_float.h @@ -6,36 +6,50 @@ namespace fast_float { /** - * This function parses the character sequence [first,last) for a number. It parses floating-point numbers expecting - * a locale-indepent format equivalent to what is used by std::strtod in the default ("C") locale. - * The resulting floating-point value is the closest floating-point values (using either float or double), - * using the "round to even" convention for values that would otherwise fall right in-between two values. - * That is, we provide exact parsing according to the IEEE standard. + * This function parses the character sequence [first,last) for a number. It + * parses floating-point numbers expecting a locale-independent format equivalent + * to what is used by std::strtod in the default ("C") locale. The resulting + * floating-point value is the closest floating-point value (using either float + * or double), using the "round to even" convention for values that would + * otherwise fall right in-between two values. That is, we provide exact parsing + * according to the IEEE standard. * - * Given a successful parse, the pointer (`ptr`) in the returned value is set to point right after the - * parsed number, and the `value` referenced is set to the parsed value. In case of error, the returned - * `ec` contains a representative error, otherwise the default (`std::errc()`) value is stored. + * Given a successful parse, the pointer (`ptr`) in the returned value is set to + * point right after the parsed number, and the `value` referenced is set to the + * parsed value.
In case of error, the returned `ec` contains a representative + * error, otherwise the default (`std::errc()`) value is stored. * - * The implementation does not throw and does not allocate memory (e.g., with `new` or `malloc`). + * The implementation does not throw and does not allocate memory (e.g., with + * `new` or `malloc`). * - * Like the C++17 standard, the `fast_float::from_chars` functions take an optional last argument of - * the type `fast_float::chars_format`. It is a bitset value: we check whether - * `fmt & fast_float::chars_format::fixed` and `fmt & fast_float::chars_format::scientific` are set - * to determine whether we allow the fixed point and scientific notation respectively. - * The default is `fast_float::chars_format::general` which allows both `fixed` and `scientific`. + * Like the C++17 standard, the `fast_float::from_chars` functions take an + * optional last argument of the type `fast_float::chars_format`. It is a bitset + * value: we check whether `fmt & fast_float::chars_format::fixed` and `fmt & + * fast_float::chars_format::scientific` are set to determine whether we allow + * the fixed point and scientific notation respectively. The default is + * `fast_float::chars_format::general` which allows both `fixed` and + * `scientific`. */ -template -FASTFLOAT_CONSTEXPR20 -from_chars_result_t from_chars(UC const * first, UC const * last, - T &value, chars_format fmt = chars_format::general) noexcept; +template ())> +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, UC const *last, T &value, + chars_format fmt = chars_format::general) noexcept; /** * Like from_chars, but accepts an `options` argument to govern number parsing. */ -template -FASTFLOAT_CONSTEXPR20 -from_chars_result_t from_chars_advanced(UC const * first, UC const * last, - T &value, parse_options_t options) noexcept; +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars_advanced(UC const *first, UC const *last, T &value, + parse_options_t options) noexcept; +/** + * from_chars for integer types. + */ +template ())> +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, UC const *last, T &value, int base = 10) noexcept; } // namespace fast_float #include "parse_number.h" diff --git a/third_party/fast_float/fast_table.h b/third_party/fast_float/fast_table.h index d8dc5690517c..69f9b2c9245f 100644 --- a/third_party/fast_float/fast_table.h +++ b/third_party/fast_float/fast_table.h @@ -29,669 +29,677 @@ namespace fast_float { * infinite in binary64 so we never need to worry about powers * of 5 greater than 308. */ -template -struct powers_template { +template struct powers_template { -constexpr static int smallest_power_of_five = binary_format::smallest_power_of_ten(); -constexpr static int largest_power_of_five = binary_format::largest_power_of_ten(); -constexpr static int number_of_entries = 2 * (largest_power_of_five - smallest_power_of_five + 1); -// Powers of five from 5^-342 all the way to 5^308 rounded toward one. 
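As a usage note on the API whose documentation was rewrapped above: the entry point is consumed exactly as in the upstream README (a minimal sketch, assuming the header is reachable as fast_float/fast_float.h):

    #include <cstdio>
    #include <cstring>
    #include <system_error>

    #include "fast_float/fast_float.h"

    int main() {
      const char *input = "3.1416 xyz";
      double result;
      auto answer =
          fast_float::from_chars(input, input + std::strlen(input), result);
      if (answer.ec != std::errc()) {
        std::puts("parsing failure");
        return 1;
      }
      std::printf("parsed %f, stopped before '%s'\n", result, answer.ptr);
      return 0;
    }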
-constexpr static uint64_t power_of_five_128[number_of_entries] = { - 0xeef453d6923bd65a,0x113faa2906a13b3f, - 0x9558b4661b6565f8,0x4ac7ca59a424c507, - 0xbaaee17fa23ebf76,0x5d79bcf00d2df649, - 0xe95a99df8ace6f53,0xf4d82c2c107973dc, - 0x91d8a02bb6c10594,0x79071b9b8a4be869, - 0xb64ec836a47146f9,0x9748e2826cdee284, - 0xe3e27a444d8d98b7,0xfd1b1b2308169b25, - 0x8e6d8c6ab0787f72,0xfe30f0f5e50e20f7, - 0xb208ef855c969f4f,0xbdbd2d335e51a935, - 0xde8b2b66b3bc4723,0xad2c788035e61382, - 0x8b16fb203055ac76,0x4c3bcb5021afcc31, - 0xaddcb9e83c6b1793,0xdf4abe242a1bbf3d, - 0xd953e8624b85dd78,0xd71d6dad34a2af0d, - 0x87d4713d6f33aa6b,0x8672648c40e5ad68, - 0xa9c98d8ccb009506,0x680efdaf511f18c2, - 0xd43bf0effdc0ba48,0x212bd1b2566def2, - 0x84a57695fe98746d,0x14bb630f7604b57, - 0xa5ced43b7e3e9188,0x419ea3bd35385e2d, - 0xcf42894a5dce35ea,0x52064cac828675b9, - 0x818995ce7aa0e1b2,0x7343efebd1940993, - 0xa1ebfb4219491a1f,0x1014ebe6c5f90bf8, - 0xca66fa129f9b60a6,0xd41a26e077774ef6, - 0xfd00b897478238d0,0x8920b098955522b4, - 0x9e20735e8cb16382,0x55b46e5f5d5535b0, - 0xc5a890362fddbc62,0xeb2189f734aa831d, - 0xf712b443bbd52b7b,0xa5e9ec7501d523e4, - 0x9a6bb0aa55653b2d,0x47b233c92125366e, - 0xc1069cd4eabe89f8,0x999ec0bb696e840a, - 0xf148440a256e2c76,0xc00670ea43ca250d, - 0x96cd2a865764dbca,0x380406926a5e5728, - 0xbc807527ed3e12bc,0xc605083704f5ecf2, - 0xeba09271e88d976b,0xf7864a44c633682e, - 0x93445b8731587ea3,0x7ab3ee6afbe0211d, - 0xb8157268fdae9e4c,0x5960ea05bad82964, - 0xe61acf033d1a45df,0x6fb92487298e33bd, - 0x8fd0c16206306bab,0xa5d3b6d479f8e056, - 0xb3c4f1ba87bc8696,0x8f48a4899877186c, - 0xe0b62e2929aba83c,0x331acdabfe94de87, - 0x8c71dcd9ba0b4925,0x9ff0c08b7f1d0b14, - 0xaf8e5410288e1b6f,0x7ecf0ae5ee44dd9, - 0xdb71e91432b1a24a,0xc9e82cd9f69d6150, - 0x892731ac9faf056e,0xbe311c083a225cd2, - 0xab70fe17c79ac6ca,0x6dbd630a48aaf406, - 0xd64d3d9db981787d,0x92cbbccdad5b108, - 0x85f0468293f0eb4e,0x25bbf56008c58ea5, - 0xa76c582338ed2621,0xaf2af2b80af6f24e, - 0xd1476e2c07286faa,0x1af5af660db4aee1, - 0x82cca4db847945ca,0x50d98d9fc890ed4d, - 0xa37fce126597973c,0xe50ff107bab528a0, - 0xcc5fc196fefd7d0c,0x1e53ed49a96272c8, - 0xff77b1fcbebcdc4f,0x25e8e89c13bb0f7a, - 0x9faacf3df73609b1,0x77b191618c54e9ac, - 0xc795830d75038c1d,0xd59df5b9ef6a2417, - 0xf97ae3d0d2446f25,0x4b0573286b44ad1d, - 0x9becce62836ac577,0x4ee367f9430aec32, - 0xc2e801fb244576d5,0x229c41f793cda73f, - 0xf3a20279ed56d48a,0x6b43527578c1110f, - 0x9845418c345644d6,0x830a13896b78aaa9, - 0xbe5691ef416bd60c,0x23cc986bc656d553, - 0xedec366b11c6cb8f,0x2cbfbe86b7ec8aa8, - 0x94b3a202eb1c3f39,0x7bf7d71432f3d6a9, - 0xb9e08a83a5e34f07,0xdaf5ccd93fb0cc53, - 0xe858ad248f5c22c9,0xd1b3400f8f9cff68, - 0x91376c36d99995be,0x23100809b9c21fa1, - 0xb58547448ffffb2d,0xabd40a0c2832a78a, - 0xe2e69915b3fff9f9,0x16c90c8f323f516c, - 0x8dd01fad907ffc3b,0xae3da7d97f6792e3, - 0xb1442798f49ffb4a,0x99cd11cfdf41779c, - 0xdd95317f31c7fa1d,0x40405643d711d583, - 0x8a7d3eef7f1cfc52,0x482835ea666b2572, - 0xad1c8eab5ee43b66,0xda3243650005eecf, - 0xd863b256369d4a40,0x90bed43e40076a82, - 0x873e4f75e2224e68,0x5a7744a6e804a291, - 0xa90de3535aaae202,0x711515d0a205cb36, - 0xd3515c2831559a83,0xd5a5b44ca873e03, - 0x8412d9991ed58091,0xe858790afe9486c2, - 0xa5178fff668ae0b6,0x626e974dbe39a872, - 0xce5d73ff402d98e3,0xfb0a3d212dc8128f, - 0x80fa687f881c7f8e,0x7ce66634bc9d0b99, - 0xa139029f6a239f72,0x1c1fffc1ebc44e80, - 0xc987434744ac874e,0xa327ffb266b56220, - 0xfbe9141915d7a922,0x4bf1ff9f0062baa8, - 0x9d71ac8fada6c9b5,0x6f773fc3603db4a9, - 0xc4ce17b399107c22,0xcb550fb4384d21d3, - 0xf6019da07f549b2b,0x7e2a53a146606a48, - 
0x99c102844f94e0fb,0x2eda7444cbfc426d, - 0xc0314325637a1939,0xfa911155fefb5308, - 0xf03d93eebc589f88,0x793555ab7eba27ca, - 0x96267c7535b763b5,0x4bc1558b2f3458de, - 0xbbb01b9283253ca2,0x9eb1aaedfb016f16, - 0xea9c227723ee8bcb,0x465e15a979c1cadc, - 0x92a1958a7675175f,0xbfacd89ec191ec9, - 0xb749faed14125d36,0xcef980ec671f667b, - 0xe51c79a85916f484,0x82b7e12780e7401a, - 0x8f31cc0937ae58d2,0xd1b2ecb8b0908810, - 0xb2fe3f0b8599ef07,0x861fa7e6dcb4aa15, - 0xdfbdcece67006ac9,0x67a791e093e1d49a, - 0x8bd6a141006042bd,0xe0c8bb2c5c6d24e0, - 0xaecc49914078536d,0x58fae9f773886e18, - 0xda7f5bf590966848,0xaf39a475506a899e, - 0x888f99797a5e012d,0x6d8406c952429603, - 0xaab37fd7d8f58178,0xc8e5087ba6d33b83, - 0xd5605fcdcf32e1d6,0xfb1e4a9a90880a64, - 0x855c3be0a17fcd26,0x5cf2eea09a55067f, - 0xa6b34ad8c9dfc06f,0xf42faa48c0ea481e, - 0xd0601d8efc57b08b,0xf13b94daf124da26, - 0x823c12795db6ce57,0x76c53d08d6b70858, - 0xa2cb1717b52481ed,0x54768c4b0c64ca6e, - 0xcb7ddcdda26da268,0xa9942f5dcf7dfd09, - 0xfe5d54150b090b02,0xd3f93b35435d7c4c, - 0x9efa548d26e5a6e1,0xc47bc5014a1a6daf, - 0xc6b8e9b0709f109a,0x359ab6419ca1091b, - 0xf867241c8cc6d4c0,0xc30163d203c94b62, - 0x9b407691d7fc44f8,0x79e0de63425dcf1d, - 0xc21094364dfb5636,0x985915fc12f542e4, - 0xf294b943e17a2bc4,0x3e6f5b7b17b2939d, - 0x979cf3ca6cec5b5a,0xa705992ceecf9c42, - 0xbd8430bd08277231,0x50c6ff782a838353, - 0xece53cec4a314ebd,0xa4f8bf5635246428, - 0x940f4613ae5ed136,0x871b7795e136be99, - 0xb913179899f68584,0x28e2557b59846e3f, - 0xe757dd7ec07426e5,0x331aeada2fe589cf, - 0x9096ea6f3848984f,0x3ff0d2c85def7621, - 0xb4bca50b065abe63,0xfed077a756b53a9, - 0xe1ebce4dc7f16dfb,0xd3e8495912c62894, - 0x8d3360f09cf6e4bd,0x64712dd7abbbd95c, - 0xb080392cc4349dec,0xbd8d794d96aacfb3, - 0xdca04777f541c567,0xecf0d7a0fc5583a0, - 0x89e42caaf9491b60,0xf41686c49db57244, - 0xac5d37d5b79b6239,0x311c2875c522ced5, - 0xd77485cb25823ac7,0x7d633293366b828b, - 0x86a8d39ef77164bc,0xae5dff9c02033197, - 0xa8530886b54dbdeb,0xd9f57f830283fdfc, - 0xd267caa862a12d66,0xd072df63c324fd7b, - 0x8380dea93da4bc60,0x4247cb9e59f71e6d, - 0xa46116538d0deb78,0x52d9be85f074e608, - 0xcd795be870516656,0x67902e276c921f8b, - 0x806bd9714632dff6,0xba1cd8a3db53b6, - 0xa086cfcd97bf97f3,0x80e8a40eccd228a4, - 0xc8a883c0fdaf7df0,0x6122cd128006b2cd, - 0xfad2a4b13d1b5d6c,0x796b805720085f81, - 0x9cc3a6eec6311a63,0xcbe3303674053bb0, - 0xc3f490aa77bd60fc,0xbedbfc4411068a9c, - 0xf4f1b4d515acb93b,0xee92fb5515482d44, - 0x991711052d8bf3c5,0x751bdd152d4d1c4a, - 0xbf5cd54678eef0b6,0xd262d45a78a0635d, - 0xef340a98172aace4,0x86fb897116c87c34, - 0x9580869f0e7aac0e,0xd45d35e6ae3d4da0, - 0xbae0a846d2195712,0x8974836059cca109, - 0xe998d258869facd7,0x2bd1a438703fc94b, - 0x91ff83775423cc06,0x7b6306a34627ddcf, - 0xb67f6455292cbf08,0x1a3bc84c17b1d542, - 0xe41f3d6a7377eeca,0x20caba5f1d9e4a93, - 0x8e938662882af53e,0x547eb47b7282ee9c, - 0xb23867fb2a35b28d,0xe99e619a4f23aa43, - 0xdec681f9f4c31f31,0x6405fa00e2ec94d4, - 0x8b3c113c38f9f37e,0xde83bc408dd3dd04, - 0xae0b158b4738705e,0x9624ab50b148d445, - 0xd98ddaee19068c76,0x3badd624dd9b0957, - 0x87f8a8d4cfa417c9,0xe54ca5d70a80e5d6, - 0xa9f6d30a038d1dbc,0x5e9fcf4ccd211f4c, - 0xd47487cc8470652b,0x7647c3200069671f, - 0x84c8d4dfd2c63f3b,0x29ecd9f40041e073, - 0xa5fb0a17c777cf09,0xf468107100525890, - 0xcf79cc9db955c2cc,0x7182148d4066eeb4, - 0x81ac1fe293d599bf,0xc6f14cd848405530, - 0xa21727db38cb002f,0xb8ada00e5a506a7c, - 0xca9cf1d206fdc03b,0xa6d90811f0e4851c, - 0xfd442e4688bd304a,0x908f4a166d1da663, - 0x9e4a9cec15763e2e,0x9a598e4e043287fe, - 0xc5dd44271ad3cdba,0x40eff1e1853f29fd, - 
0xf7549530e188c128,0xd12bee59e68ef47c, - 0x9a94dd3e8cf578b9,0x82bb74f8301958ce, - 0xc13a148e3032d6e7,0xe36a52363c1faf01, - 0xf18899b1bc3f8ca1,0xdc44e6c3cb279ac1, - 0x96f5600f15a7b7e5,0x29ab103a5ef8c0b9, - 0xbcb2b812db11a5de,0x7415d448f6b6f0e7, - 0xebdf661791d60f56,0x111b495b3464ad21, - 0x936b9fcebb25c995,0xcab10dd900beec34, - 0xb84687c269ef3bfb,0x3d5d514f40eea742, - 0xe65829b3046b0afa,0xcb4a5a3112a5112, - 0x8ff71a0fe2c2e6dc,0x47f0e785eaba72ab, - 0xb3f4e093db73a093,0x59ed216765690f56, - 0xe0f218b8d25088b8,0x306869c13ec3532c, - 0x8c974f7383725573,0x1e414218c73a13fb, - 0xafbd2350644eeacf,0xe5d1929ef90898fa, - 0xdbac6c247d62a583,0xdf45f746b74abf39, - 0x894bc396ce5da772,0x6b8bba8c328eb783, - 0xab9eb47c81f5114f,0x66ea92f3f326564, - 0xd686619ba27255a2,0xc80a537b0efefebd, - 0x8613fd0145877585,0xbd06742ce95f5f36, - 0xa798fc4196e952e7,0x2c48113823b73704, - 0xd17f3b51fca3a7a0,0xf75a15862ca504c5, - 0x82ef85133de648c4,0x9a984d73dbe722fb, - 0xa3ab66580d5fdaf5,0xc13e60d0d2e0ebba, - 0xcc963fee10b7d1b3,0x318df905079926a8, - 0xffbbcfe994e5c61f,0xfdf17746497f7052, - 0x9fd561f1fd0f9bd3,0xfeb6ea8bedefa633, - 0xc7caba6e7c5382c8,0xfe64a52ee96b8fc0, - 0xf9bd690a1b68637b,0x3dfdce7aa3c673b0, - 0x9c1661a651213e2d,0x6bea10ca65c084e, - 0xc31bfa0fe5698db8,0x486e494fcff30a62, - 0xf3e2f893dec3f126,0x5a89dba3c3efccfa, - 0x986ddb5c6b3a76b7,0xf89629465a75e01c, - 0xbe89523386091465,0xf6bbb397f1135823, - 0xee2ba6c0678b597f,0x746aa07ded582e2c, - 0x94db483840b717ef,0xa8c2a44eb4571cdc, - 0xba121a4650e4ddeb,0x92f34d62616ce413, - 0xe896a0d7e51e1566,0x77b020baf9c81d17, - 0x915e2486ef32cd60,0xace1474dc1d122e, - 0xb5b5ada8aaff80b8,0xd819992132456ba, - 0xe3231912d5bf60e6,0x10e1fff697ed6c69, - 0x8df5efabc5979c8f,0xca8d3ffa1ef463c1, - 0xb1736b96b6fd83b3,0xbd308ff8a6b17cb2, - 0xddd0467c64bce4a0,0xac7cb3f6d05ddbde, - 0x8aa22c0dbef60ee4,0x6bcdf07a423aa96b, - 0xad4ab7112eb3929d,0x86c16c98d2c953c6, - 0xd89d64d57a607744,0xe871c7bf077ba8b7, - 0x87625f056c7c4a8b,0x11471cd764ad4972, - 0xa93af6c6c79b5d2d,0xd598e40d3dd89bcf, - 0xd389b47879823479,0x4aff1d108d4ec2c3, - 0x843610cb4bf160cb,0xcedf722a585139ba, - 0xa54394fe1eedb8fe,0xc2974eb4ee658828, - 0xce947a3da6a9273e,0x733d226229feea32, - 0x811ccc668829b887,0x806357d5a3f525f, - 0xa163ff802a3426a8,0xca07c2dcb0cf26f7, - 0xc9bcff6034c13052,0xfc89b393dd02f0b5, - 0xfc2c3f3841f17c67,0xbbac2078d443ace2, - 0x9d9ba7832936edc0,0xd54b944b84aa4c0d, - 0xc5029163f384a931,0xa9e795e65d4df11, - 0xf64335bcf065d37d,0x4d4617b5ff4a16d5, - 0x99ea0196163fa42e,0x504bced1bf8e4e45, - 0xc06481fb9bcf8d39,0xe45ec2862f71e1d6, - 0xf07da27a82c37088,0x5d767327bb4e5a4c, - 0x964e858c91ba2655,0x3a6a07f8d510f86f, - 0xbbe226efb628afea,0x890489f70a55368b, - 0xeadab0aba3b2dbe5,0x2b45ac74ccea842e, - 0x92c8ae6b464fc96f,0x3b0b8bc90012929d, - 0xb77ada0617e3bbcb,0x9ce6ebb40173744, - 0xe55990879ddcaabd,0xcc420a6a101d0515, - 0x8f57fa54c2a9eab6,0x9fa946824a12232d, - 0xb32df8e9f3546564,0x47939822dc96abf9, - 0xdff9772470297ebd,0x59787e2b93bc56f7, - 0x8bfbea76c619ef36,0x57eb4edb3c55b65a, - 0xaefae51477a06b03,0xede622920b6b23f1, - 0xdab99e59958885c4,0xe95fab368e45eced, - 0x88b402f7fd75539b,0x11dbcb0218ebb414, - 0xaae103b5fcd2a881,0xd652bdc29f26a119, - 0xd59944a37c0752a2,0x4be76d3346f0495f, - 0x857fcae62d8493a5,0x6f70a4400c562ddb, - 0xa6dfbd9fb8e5b88e,0xcb4ccd500f6bb952, - 0xd097ad07a71f26b2,0x7e2000a41346a7a7, - 0x825ecc24c873782f,0x8ed400668c0c28c8, - 0xa2f67f2dfa90563b,0x728900802f0f32fa, - 0xcbb41ef979346bca,0x4f2b40a03ad2ffb9, - 0xfea126b7d78186bc,0xe2f610c84987bfa8, - 0x9f24b832e6b0f436,0xdd9ca7d2df4d7c9, - 
0xc6ede63fa05d3143,0x91503d1c79720dbb, - 0xf8a95fcf88747d94,0x75a44c6397ce912a, - 0x9b69dbe1b548ce7c,0xc986afbe3ee11aba, - 0xc24452da229b021b,0xfbe85badce996168, - 0xf2d56790ab41c2a2,0xfae27299423fb9c3, - 0x97c560ba6b0919a5,0xdccd879fc967d41a, - 0xbdb6b8e905cb600f,0x5400e987bbc1c920, - 0xed246723473e3813,0x290123e9aab23b68, - 0x9436c0760c86e30b,0xf9a0b6720aaf6521, - 0xb94470938fa89bce,0xf808e40e8d5b3e69, - 0xe7958cb87392c2c2,0xb60b1d1230b20e04, - 0x90bd77f3483bb9b9,0xb1c6f22b5e6f48c2, - 0xb4ecd5f01a4aa828,0x1e38aeb6360b1af3, - 0xe2280b6c20dd5232,0x25c6da63c38de1b0, - 0x8d590723948a535f,0x579c487e5a38ad0e, - 0xb0af48ec79ace837,0x2d835a9df0c6d851, - 0xdcdb1b2798182244,0xf8e431456cf88e65, - 0x8a08f0f8bf0f156b,0x1b8e9ecb641b58ff, - 0xac8b2d36eed2dac5,0xe272467e3d222f3f, - 0xd7adf884aa879177,0x5b0ed81dcc6abb0f, - 0x86ccbb52ea94baea,0x98e947129fc2b4e9, - 0xa87fea27a539e9a5,0x3f2398d747b36224, - 0xd29fe4b18e88640e,0x8eec7f0d19a03aad, - 0x83a3eeeef9153e89,0x1953cf68300424ac, - 0xa48ceaaab75a8e2b,0x5fa8c3423c052dd7, - 0xcdb02555653131b6,0x3792f412cb06794d, - 0x808e17555f3ebf11,0xe2bbd88bbee40bd0, - 0xa0b19d2ab70e6ed6,0x5b6aceaeae9d0ec4, - 0xc8de047564d20a8b,0xf245825a5a445275, - 0xfb158592be068d2e,0xeed6e2f0f0d56712, - 0x9ced737bb6c4183d,0x55464dd69685606b, - 0xc428d05aa4751e4c,0xaa97e14c3c26b886, - 0xf53304714d9265df,0xd53dd99f4b3066a8, - 0x993fe2c6d07b7fab,0xe546a8038efe4029, - 0xbf8fdb78849a5f96,0xde98520472bdd033, - 0xef73d256a5c0f77c,0x963e66858f6d4440, - 0x95a8637627989aad,0xdde7001379a44aa8, - 0xbb127c53b17ec159,0x5560c018580d5d52, - 0xe9d71b689dde71af,0xaab8f01e6e10b4a6, - 0x9226712162ab070d,0xcab3961304ca70e8, - 0xb6b00d69bb55c8d1,0x3d607b97c5fd0d22, - 0xe45c10c42a2b3b05,0x8cb89a7db77c506a, - 0x8eb98a7a9a5b04e3,0x77f3608e92adb242, - 0xb267ed1940f1c61c,0x55f038b237591ed3, - 0xdf01e85f912e37a3,0x6b6c46dec52f6688, - 0x8b61313bbabce2c6,0x2323ac4b3b3da015, - 0xae397d8aa96c1b77,0xabec975e0a0d081a, - 0xd9c7dced53c72255,0x96e7bd358c904a21, - 0x881cea14545c7575,0x7e50d64177da2e54, - 0xaa242499697392d2,0xdde50bd1d5d0b9e9, - 0xd4ad2dbfc3d07787,0x955e4ec64b44e864, - 0x84ec3c97da624ab4,0xbd5af13bef0b113e, - 0xa6274bbdd0fadd61,0xecb1ad8aeacdd58e, - 0xcfb11ead453994ba,0x67de18eda5814af2, - 0x81ceb32c4b43fcf4,0x80eacf948770ced7, - 0xa2425ff75e14fc31,0xa1258379a94d028d, - 0xcad2f7f5359a3b3e,0x96ee45813a04330, - 0xfd87b5f28300ca0d,0x8bca9d6e188853fc, - 0x9e74d1b791e07e48,0x775ea264cf55347e, - 0xc612062576589dda,0x95364afe032a819e, - 0xf79687aed3eec551,0x3a83ddbd83f52205, - 0x9abe14cd44753b52,0xc4926a9672793543, - 0xc16d9a0095928a27,0x75b7053c0f178294, - 0xf1c90080baf72cb1,0x5324c68b12dd6339, - 0x971da05074da7bee,0xd3f6fc16ebca5e04, - 0xbce5086492111aea,0x88f4bb1ca6bcf585, - 0xec1e4a7db69561a5,0x2b31e9e3d06c32e6, - 0x9392ee8e921d5d07,0x3aff322e62439fd0, - 0xb877aa3236a4b449,0x9befeb9fad487c3, - 0xe69594bec44de15b,0x4c2ebe687989a9b4, - 0x901d7cf73ab0acd9,0xf9d37014bf60a11, - 0xb424dc35095cd80f,0x538484c19ef38c95, - 0xe12e13424bb40e13,0x2865a5f206b06fba, - 0x8cbccc096f5088cb,0xf93f87b7442e45d4, - 0xafebff0bcb24aafe,0xf78f69a51539d749, - 0xdbe6fecebdedd5be,0xb573440e5a884d1c, - 0x89705f4136b4a597,0x31680a88f8953031, - 0xabcc77118461cefc,0xfdc20d2b36ba7c3e, - 0xd6bf94d5e57a42bc,0x3d32907604691b4d, - 0x8637bd05af6c69b5,0xa63f9a49c2c1b110, - 0xa7c5ac471b478423,0xfcf80dc33721d54, - 0xd1b71758e219652b,0xd3c36113404ea4a9, - 0x83126e978d4fdf3b,0x645a1cac083126ea, - 0xa3d70a3d70a3d70a,0x3d70a3d70a3d70a4, - 0xcccccccccccccccc,0xcccccccccccccccd, - 0x8000000000000000,0x0, - 0xa000000000000000,0x0, - 
0xc800000000000000,0x0, - 0xfa00000000000000,0x0, - 0x9c40000000000000,0x0, - 0xc350000000000000,0x0, - 0xf424000000000000,0x0, - 0x9896800000000000,0x0, - 0xbebc200000000000,0x0, - 0xee6b280000000000,0x0, - 0x9502f90000000000,0x0, - 0xba43b74000000000,0x0, - 0xe8d4a51000000000,0x0, - 0x9184e72a00000000,0x0, - 0xb5e620f480000000,0x0, - 0xe35fa931a0000000,0x0, - 0x8e1bc9bf04000000,0x0, - 0xb1a2bc2ec5000000,0x0, - 0xde0b6b3a76400000,0x0, - 0x8ac7230489e80000,0x0, - 0xad78ebc5ac620000,0x0, - 0xd8d726b7177a8000,0x0, - 0x878678326eac9000,0x0, - 0xa968163f0a57b400,0x0, - 0xd3c21bcecceda100,0x0, - 0x84595161401484a0,0x0, - 0xa56fa5b99019a5c8,0x0, - 0xcecb8f27f4200f3a,0x0, - 0x813f3978f8940984,0x4000000000000000, - 0xa18f07d736b90be5,0x5000000000000000, - 0xc9f2c9cd04674ede,0xa400000000000000, - 0xfc6f7c4045812296,0x4d00000000000000, - 0x9dc5ada82b70b59d,0xf020000000000000, - 0xc5371912364ce305,0x6c28000000000000, - 0xf684df56c3e01bc6,0xc732000000000000, - 0x9a130b963a6c115c,0x3c7f400000000000, - 0xc097ce7bc90715b3,0x4b9f100000000000, - 0xf0bdc21abb48db20,0x1e86d40000000000, - 0x96769950b50d88f4,0x1314448000000000, - 0xbc143fa4e250eb31,0x17d955a000000000, - 0xeb194f8e1ae525fd,0x5dcfab0800000000, - 0x92efd1b8d0cf37be,0x5aa1cae500000000, - 0xb7abc627050305ad,0xf14a3d9e40000000, - 0xe596b7b0c643c719,0x6d9ccd05d0000000, - 0x8f7e32ce7bea5c6f,0xe4820023a2000000, - 0xb35dbf821ae4f38b,0xdda2802c8a800000, - 0xe0352f62a19e306e,0xd50b2037ad200000, - 0x8c213d9da502de45,0x4526f422cc340000, - 0xaf298d050e4395d6,0x9670b12b7f410000, - 0xdaf3f04651d47b4c,0x3c0cdd765f114000, - 0x88d8762bf324cd0f,0xa5880a69fb6ac800, - 0xab0e93b6efee0053,0x8eea0d047a457a00, - 0xd5d238a4abe98068,0x72a4904598d6d880, - 0x85a36366eb71f041,0x47a6da2b7f864750, - 0xa70c3c40a64e6c51,0x999090b65f67d924, - 0xd0cf4b50cfe20765,0xfff4b4e3f741cf6d, - 0x82818f1281ed449f,0xbff8f10e7a8921a4, - 0xa321f2d7226895c7,0xaff72d52192b6a0d, - 0xcbea6f8ceb02bb39,0x9bf4f8a69f764490, - 0xfee50b7025c36a08,0x2f236d04753d5b4, - 0x9f4f2726179a2245,0x1d762422c946590, - 0xc722f0ef9d80aad6,0x424d3ad2b7b97ef5, - 0xf8ebad2b84e0d58b,0xd2e0898765a7deb2, - 0x9b934c3b330c8577,0x63cc55f49f88eb2f, - 0xc2781f49ffcfa6d5,0x3cbf6b71c76b25fb, - 0xf316271c7fc3908a,0x8bef464e3945ef7a, - 0x97edd871cfda3a56,0x97758bf0e3cbb5ac, - 0xbde94e8e43d0c8ec,0x3d52eeed1cbea317, - 0xed63a231d4c4fb27,0x4ca7aaa863ee4bdd, - 0x945e455f24fb1cf8,0x8fe8caa93e74ef6a, - 0xb975d6b6ee39e436,0xb3e2fd538e122b44, - 0xe7d34c64a9c85d44,0x60dbbca87196b616, - 0x90e40fbeea1d3a4a,0xbc8955e946fe31cd, - 0xb51d13aea4a488dd,0x6babab6398bdbe41, - 0xe264589a4dcdab14,0xc696963c7eed2dd1, - 0x8d7eb76070a08aec,0xfc1e1de5cf543ca2, - 0xb0de65388cc8ada8,0x3b25a55f43294bcb, - 0xdd15fe86affad912,0x49ef0eb713f39ebe, - 0x8a2dbf142dfcc7ab,0x6e3569326c784337, - 0xacb92ed9397bf996,0x49c2c37f07965404, - 0xd7e77a8f87daf7fb,0xdc33745ec97be906, - 0x86f0ac99b4e8dafd,0x69a028bb3ded71a3, - 0xa8acd7c0222311bc,0xc40832ea0d68ce0c, - 0xd2d80db02aabd62b,0xf50a3fa490c30190, - 0x83c7088e1aab65db,0x792667c6da79e0fa, - 0xa4b8cab1a1563f52,0x577001b891185938, - 0xcde6fd5e09abcf26,0xed4c0226b55e6f86, - 0x80b05e5ac60b6178,0x544f8158315b05b4, - 0xa0dc75f1778e39d6,0x696361ae3db1c721, - 0xc913936dd571c84c,0x3bc3a19cd1e38e9, - 0xfb5878494ace3a5f,0x4ab48a04065c723, - 0x9d174b2dcec0e47b,0x62eb0d64283f9c76, - 0xc45d1df942711d9a,0x3ba5d0bd324f8394, - 0xf5746577930d6500,0xca8f44ec7ee36479, - 0x9968bf6abbe85f20,0x7e998b13cf4e1ecb, - 0xbfc2ef456ae276e8,0x9e3fedd8c321a67e, - 0xefb3ab16c59b14a2,0xc5cfe94ef3ea101e, - 0x95d04aee3b80ece5,0xbba1f1d158724a12, - 
0xbb445da9ca61281f,0x2a8a6e45ae8edc97, - 0xea1575143cf97226,0xf52d09d71a3293bd, - 0x924d692ca61be758,0x593c2626705f9c56, - 0xb6e0c377cfa2e12e,0x6f8b2fb00c77836c, - 0xe498f455c38b997a,0xb6dfb9c0f956447, - 0x8edf98b59a373fec,0x4724bd4189bd5eac, - 0xb2977ee300c50fe7,0x58edec91ec2cb657, - 0xdf3d5e9bc0f653e1,0x2f2967b66737e3ed, - 0x8b865b215899f46c,0xbd79e0d20082ee74, - 0xae67f1e9aec07187,0xecd8590680a3aa11, - 0xda01ee641a708de9,0xe80e6f4820cc9495, - 0x884134fe908658b2,0x3109058d147fdcdd, - 0xaa51823e34a7eede,0xbd4b46f0599fd415, - 0xd4e5e2cdc1d1ea96,0x6c9e18ac7007c91a, - 0x850fadc09923329e,0x3e2cf6bc604ddb0, - 0xa6539930bf6bff45,0x84db8346b786151c, - 0xcfe87f7cef46ff16,0xe612641865679a63, - 0x81f14fae158c5f6e,0x4fcb7e8f3f60c07e, - 0xa26da3999aef7749,0xe3be5e330f38f09d, - 0xcb090c8001ab551c,0x5cadf5bfd3072cc5, - 0xfdcb4fa002162a63,0x73d9732fc7c8f7f6, - 0x9e9f11c4014dda7e,0x2867e7fddcdd9afa, - 0xc646d63501a1511d,0xb281e1fd541501b8, - 0xf7d88bc24209a565,0x1f225a7ca91a4226, - 0x9ae757596946075f,0x3375788de9b06958, - 0xc1a12d2fc3978937,0x52d6b1641c83ae, - 0xf209787bb47d6b84,0xc0678c5dbd23a49a, - 0x9745eb4d50ce6332,0xf840b7ba963646e0, - 0xbd176620a501fbff,0xb650e5a93bc3d898, - 0xec5d3fa8ce427aff,0xa3e51f138ab4cebe, - 0x93ba47c980e98cdf,0xc66f336c36b10137, - 0xb8a8d9bbe123f017,0xb80b0047445d4184, - 0xe6d3102ad96cec1d,0xa60dc059157491e5, - 0x9043ea1ac7e41392,0x87c89837ad68db2f, - 0xb454e4a179dd1877,0x29babe4598c311fb, - 0xe16a1dc9d8545e94,0xf4296dd6fef3d67a, - 0x8ce2529e2734bb1d,0x1899e4a65f58660c, - 0xb01ae745b101e9e4,0x5ec05dcff72e7f8f, - 0xdc21a1171d42645d,0x76707543f4fa1f73, - 0x899504ae72497eba,0x6a06494a791c53a8, - 0xabfa45da0edbde69,0x487db9d17636892, - 0xd6f8d7509292d603,0x45a9d2845d3c42b6, - 0x865b86925b9bc5c2,0xb8a2392ba45a9b2, - 0xa7f26836f282b732,0x8e6cac7768d7141e, - 0xd1ef0244af2364ff,0x3207d795430cd926, - 0x8335616aed761f1f,0x7f44e6bd49e807b8, - 0xa402b9c5a8d3a6e7,0x5f16206c9c6209a6, - 0xcd036837130890a1,0x36dba887c37a8c0f, - 0x802221226be55a64,0xc2494954da2c9789, - 0xa02aa96b06deb0fd,0xf2db9baa10b7bd6c, - 0xc83553c5c8965d3d,0x6f92829494e5acc7, - 0xfa42a8b73abbf48c,0xcb772339ba1f17f9, - 0x9c69a97284b578d7,0xff2a760414536efb, - 0xc38413cf25e2d70d,0xfef5138519684aba, - 0xf46518c2ef5b8cd1,0x7eb258665fc25d69, - 0x98bf2f79d5993802,0xef2f773ffbd97a61, - 0xbeeefb584aff8603,0xaafb550ffacfd8fa, - 0xeeaaba2e5dbf6784,0x95ba2a53f983cf38, - 0x952ab45cfa97a0b2,0xdd945a747bf26183, - 0xba756174393d88df,0x94f971119aeef9e4, - 0xe912b9d1478ceb17,0x7a37cd5601aab85d, - 0x91abb422ccb812ee,0xac62e055c10ab33a, - 0xb616a12b7fe617aa,0x577b986b314d6009, - 0xe39c49765fdf9d94,0xed5a7e85fda0b80b, - 0x8e41ade9fbebc27d,0x14588f13be847307, - 0xb1d219647ae6b31c,0x596eb2d8ae258fc8, - 0xde469fbd99a05fe3,0x6fca5f8ed9aef3bb, - 0x8aec23d680043bee,0x25de7bb9480d5854, - 0xada72ccc20054ae9,0xaf561aa79a10ae6a, - 0xd910f7ff28069da4,0x1b2ba1518094da04, - 0x87aa9aff79042286,0x90fb44d2f05d0842, - 0xa99541bf57452b28,0x353a1607ac744a53, - 0xd3fa922f2d1675f2,0x42889b8997915ce8, - 0x847c9b5d7c2e09b7,0x69956135febada11, - 0xa59bc234db398c25,0x43fab9837e699095, - 0xcf02b2c21207ef2e,0x94f967e45e03f4bb, - 0x8161afb94b44f57d,0x1d1be0eebac278f5, - 0xa1ba1ba79e1632dc,0x6462d92a69731732, - 0xca28a291859bbf93,0x7d7b8f7503cfdcfe, - 0xfcb2cb35e702af78,0x5cda735244c3d43e, - 0x9defbf01b061adab,0x3a0888136afa64a7, - 0xc56baec21c7a1916,0x88aaa1845b8fdd0, - 0xf6c69a72a3989f5b,0x8aad549e57273d45, - 0x9a3c2087a63f6399,0x36ac54e2f678864b, - 0xc0cb28a98fcf3c7f,0x84576a1bb416a7dd, - 0xf0fdf2d3f3c30b9f,0x656d44a2a11c51d5, - 
0x969eb7c47859e743,0x9f644ae5a4b1b325, - 0xbc4665b596706114,0x873d5d9f0dde1fee, - 0xeb57ff22fc0c7959,0xa90cb506d155a7ea, - 0x9316ff75dd87cbd8,0x9a7f12442d588f2, - 0xb7dcbf5354e9bece,0xc11ed6d538aeb2f, - 0xe5d3ef282a242e81,0x8f1668c8a86da5fa, - 0x8fa475791a569d10,0xf96e017d694487bc, - 0xb38d92d760ec4455,0x37c981dcc395a9ac, - 0xe070f78d3927556a,0x85bbe253f47b1417, - 0x8c469ab843b89562,0x93956d7478ccec8e, - 0xaf58416654a6babb,0x387ac8d1970027b2, - 0xdb2e51bfe9d0696a,0x6997b05fcc0319e, - 0x88fcf317f22241e2,0x441fece3bdf81f03, - 0xab3c2fddeeaad25a,0xd527e81cad7626c3, - 0xd60b3bd56a5586f1,0x8a71e223d8d3b074, - 0x85c7056562757456,0xf6872d5667844e49, - 0xa738c6bebb12d16c,0xb428f8ac016561db, - 0xd106f86e69d785c7,0xe13336d701beba52, - 0x82a45b450226b39c,0xecc0024661173473, - 0xa34d721642b06084,0x27f002d7f95d0190, - 0xcc20ce9bd35c78a5,0x31ec038df7b441f4, - 0xff290242c83396ce,0x7e67047175a15271, - 0x9f79a169bd203e41,0xf0062c6e984d386, - 0xc75809c42c684dd1,0x52c07b78a3e60868, - 0xf92e0c3537826145,0xa7709a56ccdf8a82, - 0x9bbcc7a142b17ccb,0x88a66076400bb691, - 0xc2abf989935ddbfe,0x6acff893d00ea435, - 0xf356f7ebf83552fe,0x583f6b8c4124d43, - 0x98165af37b2153de,0xc3727a337a8b704a, - 0xbe1bf1b059e9a8d6,0x744f18c0592e4c5c, - 0xeda2ee1c7064130c,0x1162def06f79df73, - 0x9485d4d1c63e8be7,0x8addcb5645ac2ba8, - 0xb9a74a0637ce2ee1,0x6d953e2bd7173692, - 0xe8111c87c5c1ba99,0xc8fa8db6ccdd0437, - 0x910ab1d4db9914a0,0x1d9c9892400a22a2, - 0xb54d5e4a127f59c8,0x2503beb6d00cab4b, - 0xe2a0b5dc971f303a,0x2e44ae64840fd61d, - 0x8da471a9de737e24,0x5ceaecfed289e5d2, - 0xb10d8e1456105dad,0x7425a83e872c5f47, - 0xdd50f1996b947518,0xd12f124e28f77719, - 0x8a5296ffe33cc92f,0x82bd6b70d99aaa6f, - 0xace73cbfdc0bfb7b,0x636cc64d1001550b, - 0xd8210befd30efa5a,0x3c47f7e05401aa4e, - 0x8714a775e3e95c78,0x65acfaec34810a71, - 0xa8d9d1535ce3b396,0x7f1839a741a14d0d, - 0xd31045a8341ca07c,0x1ede48111209a050, - 0x83ea2b892091e44d,0x934aed0aab460432, - 0xa4e4b66b68b65d60,0xf81da84d5617853f, - 0xce1de40642e3f4b9,0x36251260ab9d668e, - 0x80d2ae83e9ce78f3,0xc1d72b7c6b426019, - 0xa1075a24e4421730,0xb24cf65b8612f81f, - 0xc94930ae1d529cfc,0xdee033f26797b627, - 0xfb9b7cd9a4a7443c,0x169840ef017da3b1, - 0x9d412e0806e88aa5,0x8e1f289560ee864e, - 0xc491798a08a2ad4e,0xf1a6f2bab92a27e2, - 0xf5b5d7ec8acb58a2,0xae10af696774b1db, - 0x9991a6f3d6bf1765,0xacca6da1e0a8ef29, - 0xbff610b0cc6edd3f,0x17fd090a58d32af3, - 0xeff394dcff8a948e,0xddfc4b4cef07f5b0, - 0x95f83d0a1fb69cd9,0x4abdaf101564f98e, - 0xbb764c4ca7a4440f,0x9d6d1ad41abe37f1, - 0xea53df5fd18d5513,0x84c86189216dc5ed, - 0x92746b9be2f8552c,0x32fd3cf5b4e49bb4, - 0xb7118682dbb66a77,0x3fbc8c33221dc2a1, - 0xe4d5e82392a40515,0xfabaf3feaa5334a, - 0x8f05b1163ba6832d,0x29cb4d87f2a7400e, - 0xb2c71d5bca9023f8,0x743e20e9ef511012, - 0xdf78e4b2bd342cf6,0x914da9246b255416, - 0x8bab8eefb6409c1a,0x1ad089b6c2f7548e, - 0xae9672aba3d0c320,0xa184ac2473b529b1, - 0xda3c0f568cc4f3e8,0xc9e5d72d90a2741e, - 0x8865899617fb1871,0x7e2fa67c7a658892, - 0xaa7eebfb9df9de8d,0xddbb901b98feeab7, - 0xd51ea6fa85785631,0x552a74227f3ea565, - 0x8533285c936b35de,0xd53a88958f87275f, - 0xa67ff273b8460356,0x8a892abaf368f137, - 0xd01fef10a657842c,0x2d2b7569b0432d85, - 0x8213f56a67f6b29b,0x9c3b29620e29fc73, - 0xa298f2c501f45f42,0x8349f3ba91b47b8f, - 0xcb3f2f7642717713,0x241c70a936219a73, - 0xfe0efb53d30dd4d7,0xed238cd383aa0110, - 0x9ec95d1463e8a506,0xf4363804324a40aa, - 0xc67bb4597ce2ce48,0xb143c6053edcd0d5, - 0xf81aa16fdc1b81da,0xdd94b7868e94050a, - 0x9b10a4e5e9913128,0xca7cf2b4191c8326, - 0xc1d4ce1f63f57d72,0xfd1c2f611f63a3f0, - 
0xf24a01a73cf2dccf,0xbc633b39673c8cec, - 0x976e41088617ca01,0xd5be0503e085d813, - 0xbd49d14aa79dbc82,0x4b2d8644d8a74e18, - 0xec9c459d51852ba2,0xddf8e7d60ed1219e, - 0x93e1ab8252f33b45,0xcabb90e5c942b503, - 0xb8da1662e7b00a17,0x3d6a751f3b936243, - 0xe7109bfba19c0c9d,0xcc512670a783ad4, - 0x906a617d450187e2,0x27fb2b80668b24c5, - 0xb484f9dc9641e9da,0xb1f9f660802dedf6, - 0xe1a63853bbd26451,0x5e7873f8a0396973, - 0x8d07e33455637eb2,0xdb0b487b6423e1e8, - 0xb049dc016abc5e5f,0x91ce1a9a3d2cda62, - 0xdc5c5301c56b75f7,0x7641a140cc7810fb, - 0x89b9b3e11b6329ba,0xa9e904c87fcb0a9d, - 0xac2820d9623bf429,0x546345fa9fbdcd44, - 0xd732290fbacaf133,0xa97c177947ad4095, - 0x867f59a9d4bed6c0,0x49ed8eabcccc485d, - 0xa81f301449ee8c70,0x5c68f256bfff5a74, - 0xd226fc195c6a2f8c,0x73832eec6fff3111, - 0x83585d8fd9c25db7,0xc831fd53c5ff7eab, - 0xa42e74f3d032f525,0xba3e7ca8b77f5e55, - 0xcd3a1230c43fb26f,0x28ce1bd2e55f35eb, - 0x80444b5e7aa7cf85,0x7980d163cf5b81b3, - 0xa0555e361951c366,0xd7e105bcc332621f, - 0xc86ab5c39fa63440,0x8dd9472bf3fefaa7, - 0xfa856334878fc150,0xb14f98f6f0feb951, - 0x9c935e00d4b9d8d2,0x6ed1bf9a569f33d3, - 0xc3b8358109e84f07,0xa862f80ec4700c8, - 0xf4a642e14c6262c8,0xcd27bb612758c0fa, - 0x98e7e9cccfbd7dbd,0x8038d51cb897789c, - 0xbf21e44003acdd2c,0xe0470a63e6bd56c3, - 0xeeea5d5004981478,0x1858ccfce06cac74, - 0x95527a5202df0ccb,0xf37801e0c43ebc8, - 0xbaa718e68396cffd,0xd30560258f54e6ba, - 0xe950df20247c83fd,0x47c6b82ef32a2069, - 0x91d28b7416cdd27e,0x4cdc331d57fa5441, - 0xb6472e511c81471d,0xe0133fe4adf8e952, - 0xe3d8f9e563a198e5,0x58180fddd97723a6, - 0x8e679c2f5e44ff8f,0x570f09eaa7ea7648,}; + constexpr static int smallest_power_of_five = + binary_format::smallest_power_of_ten(); + constexpr static int largest_power_of_five = + binary_format::largest_power_of_ten(); + constexpr static int number_of_entries = + 2 * (largest_power_of_five - smallest_power_of_five + 1); + // Powers of five from 5^-342 all the way to 5^308 rounded toward one. 
+ constexpr static uint64_t power_of_five_128[number_of_entries] = { + 0xeef453d6923bd65a, 0x113faa2906a13b3f, + 0x9558b4661b6565f8, 0x4ac7ca59a424c507, + 0xbaaee17fa23ebf76, 0x5d79bcf00d2df649, + 0xe95a99df8ace6f53, 0xf4d82c2c107973dc, + 0x91d8a02bb6c10594, 0x79071b9b8a4be869, + 0xb64ec836a47146f9, 0x9748e2826cdee284, + 0xe3e27a444d8d98b7, 0xfd1b1b2308169b25, + 0x8e6d8c6ab0787f72, 0xfe30f0f5e50e20f7, + 0xb208ef855c969f4f, 0xbdbd2d335e51a935, + 0xde8b2b66b3bc4723, 0xad2c788035e61382, + 0x8b16fb203055ac76, 0x4c3bcb5021afcc31, + 0xaddcb9e83c6b1793, 0xdf4abe242a1bbf3d, + 0xd953e8624b85dd78, 0xd71d6dad34a2af0d, + 0x87d4713d6f33aa6b, 0x8672648c40e5ad68, + 0xa9c98d8ccb009506, 0x680efdaf511f18c2, + 0xd43bf0effdc0ba48, 0x212bd1b2566def2, + 0x84a57695fe98746d, 0x14bb630f7604b57, + 0xa5ced43b7e3e9188, 0x419ea3bd35385e2d, + 0xcf42894a5dce35ea, 0x52064cac828675b9, + 0x818995ce7aa0e1b2, 0x7343efebd1940993, + 0xa1ebfb4219491a1f, 0x1014ebe6c5f90bf8, + 0xca66fa129f9b60a6, 0xd41a26e077774ef6, + 0xfd00b897478238d0, 0x8920b098955522b4, + 0x9e20735e8cb16382, 0x55b46e5f5d5535b0, + 0xc5a890362fddbc62, 0xeb2189f734aa831d, + 0xf712b443bbd52b7b, 0xa5e9ec7501d523e4, + 0x9a6bb0aa55653b2d, 0x47b233c92125366e, + 0xc1069cd4eabe89f8, 0x999ec0bb696e840a, + 0xf148440a256e2c76, 0xc00670ea43ca250d, + 0x96cd2a865764dbca, 0x380406926a5e5728, + 0xbc807527ed3e12bc, 0xc605083704f5ecf2, + 0xeba09271e88d976b, 0xf7864a44c633682e, + 0x93445b8731587ea3, 0x7ab3ee6afbe0211d, + 0xb8157268fdae9e4c, 0x5960ea05bad82964, + 0xe61acf033d1a45df, 0x6fb92487298e33bd, + 0x8fd0c16206306bab, 0xa5d3b6d479f8e056, + 0xb3c4f1ba87bc8696, 0x8f48a4899877186c, + 0xe0b62e2929aba83c, 0x331acdabfe94de87, + 0x8c71dcd9ba0b4925, 0x9ff0c08b7f1d0b14, + 0xaf8e5410288e1b6f, 0x7ecf0ae5ee44dd9, + 0xdb71e91432b1a24a, 0xc9e82cd9f69d6150, + 0x892731ac9faf056e, 0xbe311c083a225cd2, + 0xab70fe17c79ac6ca, 0x6dbd630a48aaf406, + 0xd64d3d9db981787d, 0x92cbbccdad5b108, + 0x85f0468293f0eb4e, 0x25bbf56008c58ea5, + 0xa76c582338ed2621, 0xaf2af2b80af6f24e, + 0xd1476e2c07286faa, 0x1af5af660db4aee1, + 0x82cca4db847945ca, 0x50d98d9fc890ed4d, + 0xa37fce126597973c, 0xe50ff107bab528a0, + 0xcc5fc196fefd7d0c, 0x1e53ed49a96272c8, + 0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7a, + 0x9faacf3df73609b1, 0x77b191618c54e9ac, + 0xc795830d75038c1d, 0xd59df5b9ef6a2417, + 0xf97ae3d0d2446f25, 0x4b0573286b44ad1d, + 0x9becce62836ac577, 0x4ee367f9430aec32, + 0xc2e801fb244576d5, 0x229c41f793cda73f, + 0xf3a20279ed56d48a, 0x6b43527578c1110f, + 0x9845418c345644d6, 0x830a13896b78aaa9, + 0xbe5691ef416bd60c, 0x23cc986bc656d553, + 0xedec366b11c6cb8f, 0x2cbfbe86b7ec8aa8, + 0x94b3a202eb1c3f39, 0x7bf7d71432f3d6a9, + 0xb9e08a83a5e34f07, 0xdaf5ccd93fb0cc53, + 0xe858ad248f5c22c9, 0xd1b3400f8f9cff68, + 0x91376c36d99995be, 0x23100809b9c21fa1, + 0xb58547448ffffb2d, 0xabd40a0c2832a78a, + 0xe2e69915b3fff9f9, 0x16c90c8f323f516c, + 0x8dd01fad907ffc3b, 0xae3da7d97f6792e3, + 0xb1442798f49ffb4a, 0x99cd11cfdf41779c, + 0xdd95317f31c7fa1d, 0x40405643d711d583, + 0x8a7d3eef7f1cfc52, 0x482835ea666b2572, + 0xad1c8eab5ee43b66, 0xda3243650005eecf, + 0xd863b256369d4a40, 0x90bed43e40076a82, + 0x873e4f75e2224e68, 0x5a7744a6e804a291, + 0xa90de3535aaae202, 0x711515d0a205cb36, + 0xd3515c2831559a83, 0xd5a5b44ca873e03, + 0x8412d9991ed58091, 0xe858790afe9486c2, + 0xa5178fff668ae0b6, 0x626e974dbe39a872, + 0xce5d73ff402d98e3, 0xfb0a3d212dc8128f, + 0x80fa687f881c7f8e, 0x7ce66634bc9d0b99, + 0xa139029f6a239f72, 0x1c1fffc1ebc44e80, + 0xc987434744ac874e, 0xa327ffb266b56220, + 0xfbe9141915d7a922, 0x4bf1ff9f0062baa8, + 0x9d71ac8fada6c9b5, 0x6f773fc3603db4a9, + 
0xc4ce17b399107c22, 0xcb550fb4384d21d3, + 0xf6019da07f549b2b, 0x7e2a53a146606a48, + 0x99c102844f94e0fb, 0x2eda7444cbfc426d, + 0xc0314325637a1939, 0xfa911155fefb5308, + 0xf03d93eebc589f88, 0x793555ab7eba27ca, + 0x96267c7535b763b5, 0x4bc1558b2f3458de, + 0xbbb01b9283253ca2, 0x9eb1aaedfb016f16, + 0xea9c227723ee8bcb, 0x465e15a979c1cadc, + 0x92a1958a7675175f, 0xbfacd89ec191ec9, + 0xb749faed14125d36, 0xcef980ec671f667b, + 0xe51c79a85916f484, 0x82b7e12780e7401a, + 0x8f31cc0937ae58d2, 0xd1b2ecb8b0908810, + 0xb2fe3f0b8599ef07, 0x861fa7e6dcb4aa15, + 0xdfbdcece67006ac9, 0x67a791e093e1d49a, + 0x8bd6a141006042bd, 0xe0c8bb2c5c6d24e0, + 0xaecc49914078536d, 0x58fae9f773886e18, + 0xda7f5bf590966848, 0xaf39a475506a899e, + 0x888f99797a5e012d, 0x6d8406c952429603, + 0xaab37fd7d8f58178, 0xc8e5087ba6d33b83, + 0xd5605fcdcf32e1d6, 0xfb1e4a9a90880a64, + 0x855c3be0a17fcd26, 0x5cf2eea09a55067f, + 0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481e, + 0xd0601d8efc57b08b, 0xf13b94daf124da26, + 0x823c12795db6ce57, 0x76c53d08d6b70858, + 0xa2cb1717b52481ed, 0x54768c4b0c64ca6e, + 0xcb7ddcdda26da268, 0xa9942f5dcf7dfd09, + 0xfe5d54150b090b02, 0xd3f93b35435d7c4c, + 0x9efa548d26e5a6e1, 0xc47bc5014a1a6daf, + 0xc6b8e9b0709f109a, 0x359ab6419ca1091b, + 0xf867241c8cc6d4c0, 0xc30163d203c94b62, + 0x9b407691d7fc44f8, 0x79e0de63425dcf1d, + 0xc21094364dfb5636, 0x985915fc12f542e4, + 0xf294b943e17a2bc4, 0x3e6f5b7b17b2939d, + 0x979cf3ca6cec5b5a, 0xa705992ceecf9c42, + 0xbd8430bd08277231, 0x50c6ff782a838353, + 0xece53cec4a314ebd, 0xa4f8bf5635246428, + 0x940f4613ae5ed136, 0x871b7795e136be99, + 0xb913179899f68584, 0x28e2557b59846e3f, + 0xe757dd7ec07426e5, 0x331aeada2fe589cf, + 0x9096ea6f3848984f, 0x3ff0d2c85def7621, + 0xb4bca50b065abe63, 0xfed077a756b53a9, + 0xe1ebce4dc7f16dfb, 0xd3e8495912c62894, + 0x8d3360f09cf6e4bd, 0x64712dd7abbbd95c, + 0xb080392cc4349dec, 0xbd8d794d96aacfb3, + 0xdca04777f541c567, 0xecf0d7a0fc5583a0, + 0x89e42caaf9491b60, 0xf41686c49db57244, + 0xac5d37d5b79b6239, 0x311c2875c522ced5, + 0xd77485cb25823ac7, 0x7d633293366b828b, + 0x86a8d39ef77164bc, 0xae5dff9c02033197, + 0xa8530886b54dbdeb, 0xd9f57f830283fdfc, + 0xd267caa862a12d66, 0xd072df63c324fd7b, + 0x8380dea93da4bc60, 0x4247cb9e59f71e6d, + 0xa46116538d0deb78, 0x52d9be85f074e608, + 0xcd795be870516656, 0x67902e276c921f8b, + 0x806bd9714632dff6, 0xba1cd8a3db53b6, + 0xa086cfcd97bf97f3, 0x80e8a40eccd228a4, + 0xc8a883c0fdaf7df0, 0x6122cd128006b2cd, + 0xfad2a4b13d1b5d6c, 0x796b805720085f81, + 0x9cc3a6eec6311a63, 0xcbe3303674053bb0, + 0xc3f490aa77bd60fc, 0xbedbfc4411068a9c, + 0xf4f1b4d515acb93b, 0xee92fb5515482d44, + 0x991711052d8bf3c5, 0x751bdd152d4d1c4a, + 0xbf5cd54678eef0b6, 0xd262d45a78a0635d, + 0xef340a98172aace4, 0x86fb897116c87c34, + 0x9580869f0e7aac0e, 0xd45d35e6ae3d4da0, + 0xbae0a846d2195712, 0x8974836059cca109, + 0xe998d258869facd7, 0x2bd1a438703fc94b, + 0x91ff83775423cc06, 0x7b6306a34627ddcf, + 0xb67f6455292cbf08, 0x1a3bc84c17b1d542, + 0xe41f3d6a7377eeca, 0x20caba5f1d9e4a93, + 0x8e938662882af53e, 0x547eb47b7282ee9c, + 0xb23867fb2a35b28d, 0xe99e619a4f23aa43, + 0xdec681f9f4c31f31, 0x6405fa00e2ec94d4, + 0x8b3c113c38f9f37e, 0xde83bc408dd3dd04, + 0xae0b158b4738705e, 0x9624ab50b148d445, + 0xd98ddaee19068c76, 0x3badd624dd9b0957, + 0x87f8a8d4cfa417c9, 0xe54ca5d70a80e5d6, + 0xa9f6d30a038d1dbc, 0x5e9fcf4ccd211f4c, + 0xd47487cc8470652b, 0x7647c3200069671f, + 0x84c8d4dfd2c63f3b, 0x29ecd9f40041e073, + 0xa5fb0a17c777cf09, 0xf468107100525890, + 0xcf79cc9db955c2cc, 0x7182148d4066eeb4, + 0x81ac1fe293d599bf, 0xc6f14cd848405530, + 0xa21727db38cb002f, 0xb8ada00e5a506a7c, + 0xca9cf1d206fdc03b, 
0xa6d90811f0e4851c, + 0xfd442e4688bd304a, 0x908f4a166d1da663, + 0x9e4a9cec15763e2e, 0x9a598e4e043287fe, + 0xc5dd44271ad3cdba, 0x40eff1e1853f29fd, + 0xf7549530e188c128, 0xd12bee59e68ef47c, + 0x9a94dd3e8cf578b9, 0x82bb74f8301958ce, + 0xc13a148e3032d6e7, 0xe36a52363c1faf01, + 0xf18899b1bc3f8ca1, 0xdc44e6c3cb279ac1, + 0x96f5600f15a7b7e5, 0x29ab103a5ef8c0b9, + 0xbcb2b812db11a5de, 0x7415d448f6b6f0e7, + 0xebdf661791d60f56, 0x111b495b3464ad21, + 0x936b9fcebb25c995, 0xcab10dd900beec34, + 0xb84687c269ef3bfb, 0x3d5d514f40eea742, + 0xe65829b3046b0afa, 0xcb4a5a3112a5112, + 0x8ff71a0fe2c2e6dc, 0x47f0e785eaba72ab, + 0xb3f4e093db73a093, 0x59ed216765690f56, + 0xe0f218b8d25088b8, 0x306869c13ec3532c, + 0x8c974f7383725573, 0x1e414218c73a13fb, + 0xafbd2350644eeacf, 0xe5d1929ef90898fa, + 0xdbac6c247d62a583, 0xdf45f746b74abf39, + 0x894bc396ce5da772, 0x6b8bba8c328eb783, + 0xab9eb47c81f5114f, 0x66ea92f3f326564, + 0xd686619ba27255a2, 0xc80a537b0efefebd, + 0x8613fd0145877585, 0xbd06742ce95f5f36, + 0xa798fc4196e952e7, 0x2c48113823b73704, + 0xd17f3b51fca3a7a0, 0xf75a15862ca504c5, + 0x82ef85133de648c4, 0x9a984d73dbe722fb, + 0xa3ab66580d5fdaf5, 0xc13e60d0d2e0ebba, + 0xcc963fee10b7d1b3, 0x318df905079926a8, + 0xffbbcfe994e5c61f, 0xfdf17746497f7052, + 0x9fd561f1fd0f9bd3, 0xfeb6ea8bedefa633, + 0xc7caba6e7c5382c8, 0xfe64a52ee96b8fc0, + 0xf9bd690a1b68637b, 0x3dfdce7aa3c673b0, + 0x9c1661a651213e2d, 0x6bea10ca65c084e, + 0xc31bfa0fe5698db8, 0x486e494fcff30a62, + 0xf3e2f893dec3f126, 0x5a89dba3c3efccfa, + 0x986ddb5c6b3a76b7, 0xf89629465a75e01c, + 0xbe89523386091465, 0xf6bbb397f1135823, + 0xee2ba6c0678b597f, 0x746aa07ded582e2c, + 0x94db483840b717ef, 0xa8c2a44eb4571cdc, + 0xba121a4650e4ddeb, 0x92f34d62616ce413, + 0xe896a0d7e51e1566, 0x77b020baf9c81d17, + 0x915e2486ef32cd60, 0xace1474dc1d122e, + 0xb5b5ada8aaff80b8, 0xd819992132456ba, + 0xe3231912d5bf60e6, 0x10e1fff697ed6c69, + 0x8df5efabc5979c8f, 0xca8d3ffa1ef463c1, + 0xb1736b96b6fd83b3, 0xbd308ff8a6b17cb2, + 0xddd0467c64bce4a0, 0xac7cb3f6d05ddbde, + 0x8aa22c0dbef60ee4, 0x6bcdf07a423aa96b, + 0xad4ab7112eb3929d, 0x86c16c98d2c953c6, + 0xd89d64d57a607744, 0xe871c7bf077ba8b7, + 0x87625f056c7c4a8b, 0x11471cd764ad4972, + 0xa93af6c6c79b5d2d, 0xd598e40d3dd89bcf, + 0xd389b47879823479, 0x4aff1d108d4ec2c3, + 0x843610cb4bf160cb, 0xcedf722a585139ba, + 0xa54394fe1eedb8fe, 0xc2974eb4ee658828, + 0xce947a3da6a9273e, 0x733d226229feea32, + 0x811ccc668829b887, 0x806357d5a3f525f, + 0xa163ff802a3426a8, 0xca07c2dcb0cf26f7, + 0xc9bcff6034c13052, 0xfc89b393dd02f0b5, + 0xfc2c3f3841f17c67, 0xbbac2078d443ace2, + 0x9d9ba7832936edc0, 0xd54b944b84aa4c0d, + 0xc5029163f384a931, 0xa9e795e65d4df11, + 0xf64335bcf065d37d, 0x4d4617b5ff4a16d5, + 0x99ea0196163fa42e, 0x504bced1bf8e4e45, + 0xc06481fb9bcf8d39, 0xe45ec2862f71e1d6, + 0xf07da27a82c37088, 0x5d767327bb4e5a4c, + 0x964e858c91ba2655, 0x3a6a07f8d510f86f, + 0xbbe226efb628afea, 0x890489f70a55368b, + 0xeadab0aba3b2dbe5, 0x2b45ac74ccea842e, + 0x92c8ae6b464fc96f, 0x3b0b8bc90012929d, + 0xb77ada0617e3bbcb, 0x9ce6ebb40173744, + 0xe55990879ddcaabd, 0xcc420a6a101d0515, + 0x8f57fa54c2a9eab6, 0x9fa946824a12232d, + 0xb32df8e9f3546564, 0x47939822dc96abf9, + 0xdff9772470297ebd, 0x59787e2b93bc56f7, + 0x8bfbea76c619ef36, 0x57eb4edb3c55b65a, + 0xaefae51477a06b03, 0xede622920b6b23f1, + 0xdab99e59958885c4, 0xe95fab368e45eced, + 0x88b402f7fd75539b, 0x11dbcb0218ebb414, + 0xaae103b5fcd2a881, 0xd652bdc29f26a119, + 0xd59944a37c0752a2, 0x4be76d3346f0495f, + 0x857fcae62d8493a5, 0x6f70a4400c562ddb, + 0xa6dfbd9fb8e5b88e, 0xcb4ccd500f6bb952, + 0xd097ad07a71f26b2, 0x7e2000a41346a7a7, + 
0x825ecc24c873782f, 0x8ed400668c0c28c8, + 0xa2f67f2dfa90563b, 0x728900802f0f32fa, + 0xcbb41ef979346bca, 0x4f2b40a03ad2ffb9, + 0xfea126b7d78186bc, 0xe2f610c84987bfa8, + 0x9f24b832e6b0f436, 0xdd9ca7d2df4d7c9, + 0xc6ede63fa05d3143, 0x91503d1c79720dbb, + 0xf8a95fcf88747d94, 0x75a44c6397ce912a, + 0x9b69dbe1b548ce7c, 0xc986afbe3ee11aba, + 0xc24452da229b021b, 0xfbe85badce996168, + 0xf2d56790ab41c2a2, 0xfae27299423fb9c3, + 0x97c560ba6b0919a5, 0xdccd879fc967d41a, + 0xbdb6b8e905cb600f, 0x5400e987bbc1c920, + 0xed246723473e3813, 0x290123e9aab23b68, + 0x9436c0760c86e30b, 0xf9a0b6720aaf6521, + 0xb94470938fa89bce, 0xf808e40e8d5b3e69, + 0xe7958cb87392c2c2, 0xb60b1d1230b20e04, + 0x90bd77f3483bb9b9, 0xb1c6f22b5e6f48c2, + 0xb4ecd5f01a4aa828, 0x1e38aeb6360b1af3, + 0xe2280b6c20dd5232, 0x25c6da63c38de1b0, + 0x8d590723948a535f, 0x579c487e5a38ad0e, + 0xb0af48ec79ace837, 0x2d835a9df0c6d851, + 0xdcdb1b2798182244, 0xf8e431456cf88e65, + 0x8a08f0f8bf0f156b, 0x1b8e9ecb641b58ff, + 0xac8b2d36eed2dac5, 0xe272467e3d222f3f, + 0xd7adf884aa879177, 0x5b0ed81dcc6abb0f, + 0x86ccbb52ea94baea, 0x98e947129fc2b4e9, + 0xa87fea27a539e9a5, 0x3f2398d747b36224, + 0xd29fe4b18e88640e, 0x8eec7f0d19a03aad, + 0x83a3eeeef9153e89, 0x1953cf68300424ac, + 0xa48ceaaab75a8e2b, 0x5fa8c3423c052dd7, + 0xcdb02555653131b6, 0x3792f412cb06794d, + 0x808e17555f3ebf11, 0xe2bbd88bbee40bd0, + 0xa0b19d2ab70e6ed6, 0x5b6aceaeae9d0ec4, + 0xc8de047564d20a8b, 0xf245825a5a445275, + 0xfb158592be068d2e, 0xeed6e2f0f0d56712, + 0x9ced737bb6c4183d, 0x55464dd69685606b, + 0xc428d05aa4751e4c, 0xaa97e14c3c26b886, + 0xf53304714d9265df, 0xd53dd99f4b3066a8, + 0x993fe2c6d07b7fab, 0xe546a8038efe4029, + 0xbf8fdb78849a5f96, 0xde98520472bdd033, + 0xef73d256a5c0f77c, 0x963e66858f6d4440, + 0x95a8637627989aad, 0xdde7001379a44aa8, + 0xbb127c53b17ec159, 0x5560c018580d5d52, + 0xe9d71b689dde71af, 0xaab8f01e6e10b4a6, + 0x9226712162ab070d, 0xcab3961304ca70e8, + 0xb6b00d69bb55c8d1, 0x3d607b97c5fd0d22, + 0xe45c10c42a2b3b05, 0x8cb89a7db77c506a, + 0x8eb98a7a9a5b04e3, 0x77f3608e92adb242, + 0xb267ed1940f1c61c, 0x55f038b237591ed3, + 0xdf01e85f912e37a3, 0x6b6c46dec52f6688, + 0x8b61313bbabce2c6, 0x2323ac4b3b3da015, + 0xae397d8aa96c1b77, 0xabec975e0a0d081a, + 0xd9c7dced53c72255, 0x96e7bd358c904a21, + 0x881cea14545c7575, 0x7e50d64177da2e54, + 0xaa242499697392d2, 0xdde50bd1d5d0b9e9, + 0xd4ad2dbfc3d07787, 0x955e4ec64b44e864, + 0x84ec3c97da624ab4, 0xbd5af13bef0b113e, + 0xa6274bbdd0fadd61, 0xecb1ad8aeacdd58e, + 0xcfb11ead453994ba, 0x67de18eda5814af2, + 0x81ceb32c4b43fcf4, 0x80eacf948770ced7, + 0xa2425ff75e14fc31, 0xa1258379a94d028d, + 0xcad2f7f5359a3b3e, 0x96ee45813a04330, + 0xfd87b5f28300ca0d, 0x8bca9d6e188853fc, + 0x9e74d1b791e07e48, 0x775ea264cf55347e, + 0xc612062576589dda, 0x95364afe032a819e, + 0xf79687aed3eec551, 0x3a83ddbd83f52205, + 0x9abe14cd44753b52, 0xc4926a9672793543, + 0xc16d9a0095928a27, 0x75b7053c0f178294, + 0xf1c90080baf72cb1, 0x5324c68b12dd6339, + 0x971da05074da7bee, 0xd3f6fc16ebca5e04, + 0xbce5086492111aea, 0x88f4bb1ca6bcf585, + 0xec1e4a7db69561a5, 0x2b31e9e3d06c32e6, + 0x9392ee8e921d5d07, 0x3aff322e62439fd0, + 0xb877aa3236a4b449, 0x9befeb9fad487c3, + 0xe69594bec44de15b, 0x4c2ebe687989a9b4, + 0x901d7cf73ab0acd9, 0xf9d37014bf60a11, + 0xb424dc35095cd80f, 0x538484c19ef38c95, + 0xe12e13424bb40e13, 0x2865a5f206b06fba, + 0x8cbccc096f5088cb, 0xf93f87b7442e45d4, + 0xafebff0bcb24aafe, 0xf78f69a51539d749, + 0xdbe6fecebdedd5be, 0xb573440e5a884d1c, + 0x89705f4136b4a597, 0x31680a88f8953031, + 0xabcc77118461cefc, 0xfdc20d2b36ba7c3e, + 0xd6bf94d5e57a42bc, 0x3d32907604691b4d, + 0x8637bd05af6c69b5, 
0xa63f9a49c2c1b110, + 0xa7c5ac471b478423, 0xfcf80dc33721d54, + 0xd1b71758e219652b, 0xd3c36113404ea4a9, + 0x83126e978d4fdf3b, 0x645a1cac083126ea, + 0xa3d70a3d70a3d70a, 0x3d70a3d70a3d70a4, + 0xcccccccccccccccc, 0xcccccccccccccccd, + 0x8000000000000000, 0x0, + 0xa000000000000000, 0x0, + 0xc800000000000000, 0x0, + 0xfa00000000000000, 0x0, + 0x9c40000000000000, 0x0, + 0xc350000000000000, 0x0, + 0xf424000000000000, 0x0, + 0x9896800000000000, 0x0, + 0xbebc200000000000, 0x0, + 0xee6b280000000000, 0x0, + 0x9502f90000000000, 0x0, + 0xba43b74000000000, 0x0, + 0xe8d4a51000000000, 0x0, + 0x9184e72a00000000, 0x0, + 0xb5e620f480000000, 0x0, + 0xe35fa931a0000000, 0x0, + 0x8e1bc9bf04000000, 0x0, + 0xb1a2bc2ec5000000, 0x0, + 0xde0b6b3a76400000, 0x0, + 0x8ac7230489e80000, 0x0, + 0xad78ebc5ac620000, 0x0, + 0xd8d726b7177a8000, 0x0, + 0x878678326eac9000, 0x0, + 0xa968163f0a57b400, 0x0, + 0xd3c21bcecceda100, 0x0, + 0x84595161401484a0, 0x0, + 0xa56fa5b99019a5c8, 0x0, + 0xcecb8f27f4200f3a, 0x0, + 0x813f3978f8940984, 0x4000000000000000, + 0xa18f07d736b90be5, 0x5000000000000000, + 0xc9f2c9cd04674ede, 0xa400000000000000, + 0xfc6f7c4045812296, 0x4d00000000000000, + 0x9dc5ada82b70b59d, 0xf020000000000000, + 0xc5371912364ce305, 0x6c28000000000000, + 0xf684df56c3e01bc6, 0xc732000000000000, + 0x9a130b963a6c115c, 0x3c7f400000000000, + 0xc097ce7bc90715b3, 0x4b9f100000000000, + 0xf0bdc21abb48db20, 0x1e86d40000000000, + 0x96769950b50d88f4, 0x1314448000000000, + 0xbc143fa4e250eb31, 0x17d955a000000000, + 0xeb194f8e1ae525fd, 0x5dcfab0800000000, + 0x92efd1b8d0cf37be, 0x5aa1cae500000000, + 0xb7abc627050305ad, 0xf14a3d9e40000000, + 0xe596b7b0c643c719, 0x6d9ccd05d0000000, + 0x8f7e32ce7bea5c6f, 0xe4820023a2000000, + 0xb35dbf821ae4f38b, 0xdda2802c8a800000, + 0xe0352f62a19e306e, 0xd50b2037ad200000, + 0x8c213d9da502de45, 0x4526f422cc340000, + 0xaf298d050e4395d6, 0x9670b12b7f410000, + 0xdaf3f04651d47b4c, 0x3c0cdd765f114000, + 0x88d8762bf324cd0f, 0xa5880a69fb6ac800, + 0xab0e93b6efee0053, 0x8eea0d047a457a00, + 0xd5d238a4abe98068, 0x72a4904598d6d880, + 0x85a36366eb71f041, 0x47a6da2b7f864750, + 0xa70c3c40a64e6c51, 0x999090b65f67d924, + 0xd0cf4b50cfe20765, 0xfff4b4e3f741cf6d, + 0x82818f1281ed449f, 0xbff8f10e7a8921a4, + 0xa321f2d7226895c7, 0xaff72d52192b6a0d, + 0xcbea6f8ceb02bb39, 0x9bf4f8a69f764490, + 0xfee50b7025c36a08, 0x2f236d04753d5b4, + 0x9f4f2726179a2245, 0x1d762422c946590, + 0xc722f0ef9d80aad6, 0x424d3ad2b7b97ef5, + 0xf8ebad2b84e0d58b, 0xd2e0898765a7deb2, + 0x9b934c3b330c8577, 0x63cc55f49f88eb2f, + 0xc2781f49ffcfa6d5, 0x3cbf6b71c76b25fb, + 0xf316271c7fc3908a, 0x8bef464e3945ef7a, + 0x97edd871cfda3a56, 0x97758bf0e3cbb5ac, + 0xbde94e8e43d0c8ec, 0x3d52eeed1cbea317, + 0xed63a231d4c4fb27, 0x4ca7aaa863ee4bdd, + 0x945e455f24fb1cf8, 0x8fe8caa93e74ef6a, + 0xb975d6b6ee39e436, 0xb3e2fd538e122b44, + 0xe7d34c64a9c85d44, 0x60dbbca87196b616, + 0x90e40fbeea1d3a4a, 0xbc8955e946fe31cd, + 0xb51d13aea4a488dd, 0x6babab6398bdbe41, + 0xe264589a4dcdab14, 0xc696963c7eed2dd1, + 0x8d7eb76070a08aec, 0xfc1e1de5cf543ca2, + 0xb0de65388cc8ada8, 0x3b25a55f43294bcb, + 0xdd15fe86affad912, 0x49ef0eb713f39ebe, + 0x8a2dbf142dfcc7ab, 0x6e3569326c784337, + 0xacb92ed9397bf996, 0x49c2c37f07965404, + 0xd7e77a8f87daf7fb, 0xdc33745ec97be906, + 0x86f0ac99b4e8dafd, 0x69a028bb3ded71a3, + 0xa8acd7c0222311bc, 0xc40832ea0d68ce0c, + 0xd2d80db02aabd62b, 0xf50a3fa490c30190, + 0x83c7088e1aab65db, 0x792667c6da79e0fa, + 0xa4b8cab1a1563f52, 0x577001b891185938, + 0xcde6fd5e09abcf26, 0xed4c0226b55e6f86, + 0x80b05e5ac60b6178, 0x544f8158315b05b4, + 0xa0dc75f1778e39d6, 0x696361ae3db1c721, + 
0xc913936dd571c84c, 0x3bc3a19cd1e38e9, + 0xfb5878494ace3a5f, 0x4ab48a04065c723, + 0x9d174b2dcec0e47b, 0x62eb0d64283f9c76, + 0xc45d1df942711d9a, 0x3ba5d0bd324f8394, + 0xf5746577930d6500, 0xca8f44ec7ee36479, + 0x9968bf6abbe85f20, 0x7e998b13cf4e1ecb, + 0xbfc2ef456ae276e8, 0x9e3fedd8c321a67e, + 0xefb3ab16c59b14a2, 0xc5cfe94ef3ea101e, + 0x95d04aee3b80ece5, 0xbba1f1d158724a12, + 0xbb445da9ca61281f, 0x2a8a6e45ae8edc97, + 0xea1575143cf97226, 0xf52d09d71a3293bd, + 0x924d692ca61be758, 0x593c2626705f9c56, + 0xb6e0c377cfa2e12e, 0x6f8b2fb00c77836c, + 0xe498f455c38b997a, 0xb6dfb9c0f956447, + 0x8edf98b59a373fec, 0x4724bd4189bd5eac, + 0xb2977ee300c50fe7, 0x58edec91ec2cb657, + 0xdf3d5e9bc0f653e1, 0x2f2967b66737e3ed, + 0x8b865b215899f46c, 0xbd79e0d20082ee74, + 0xae67f1e9aec07187, 0xecd8590680a3aa11, + 0xda01ee641a708de9, 0xe80e6f4820cc9495, + 0x884134fe908658b2, 0x3109058d147fdcdd, + 0xaa51823e34a7eede, 0xbd4b46f0599fd415, + 0xd4e5e2cdc1d1ea96, 0x6c9e18ac7007c91a, + 0x850fadc09923329e, 0x3e2cf6bc604ddb0, + 0xa6539930bf6bff45, 0x84db8346b786151c, + 0xcfe87f7cef46ff16, 0xe612641865679a63, + 0x81f14fae158c5f6e, 0x4fcb7e8f3f60c07e, + 0xa26da3999aef7749, 0xe3be5e330f38f09d, + 0xcb090c8001ab551c, 0x5cadf5bfd3072cc5, + 0xfdcb4fa002162a63, 0x73d9732fc7c8f7f6, + 0x9e9f11c4014dda7e, 0x2867e7fddcdd9afa, + 0xc646d63501a1511d, 0xb281e1fd541501b8, + 0xf7d88bc24209a565, 0x1f225a7ca91a4226, + 0x9ae757596946075f, 0x3375788de9b06958, + 0xc1a12d2fc3978937, 0x52d6b1641c83ae, + 0xf209787bb47d6b84, 0xc0678c5dbd23a49a, + 0x9745eb4d50ce6332, 0xf840b7ba963646e0, + 0xbd176620a501fbff, 0xb650e5a93bc3d898, + 0xec5d3fa8ce427aff, 0xa3e51f138ab4cebe, + 0x93ba47c980e98cdf, 0xc66f336c36b10137, + 0xb8a8d9bbe123f017, 0xb80b0047445d4184, + 0xe6d3102ad96cec1d, 0xa60dc059157491e5, + 0x9043ea1ac7e41392, 0x87c89837ad68db2f, + 0xb454e4a179dd1877, 0x29babe4598c311fb, + 0xe16a1dc9d8545e94, 0xf4296dd6fef3d67a, + 0x8ce2529e2734bb1d, 0x1899e4a65f58660c, + 0xb01ae745b101e9e4, 0x5ec05dcff72e7f8f, + 0xdc21a1171d42645d, 0x76707543f4fa1f73, + 0x899504ae72497eba, 0x6a06494a791c53a8, + 0xabfa45da0edbde69, 0x487db9d17636892, + 0xd6f8d7509292d603, 0x45a9d2845d3c42b6, + 0x865b86925b9bc5c2, 0xb8a2392ba45a9b2, + 0xa7f26836f282b732, 0x8e6cac7768d7141e, + 0xd1ef0244af2364ff, 0x3207d795430cd926, + 0x8335616aed761f1f, 0x7f44e6bd49e807b8, + 0xa402b9c5a8d3a6e7, 0x5f16206c9c6209a6, + 0xcd036837130890a1, 0x36dba887c37a8c0f, + 0x802221226be55a64, 0xc2494954da2c9789, + 0xa02aa96b06deb0fd, 0xf2db9baa10b7bd6c, + 0xc83553c5c8965d3d, 0x6f92829494e5acc7, + 0xfa42a8b73abbf48c, 0xcb772339ba1f17f9, + 0x9c69a97284b578d7, 0xff2a760414536efb, + 0xc38413cf25e2d70d, 0xfef5138519684aba, + 0xf46518c2ef5b8cd1, 0x7eb258665fc25d69, + 0x98bf2f79d5993802, 0xef2f773ffbd97a61, + 0xbeeefb584aff8603, 0xaafb550ffacfd8fa, + 0xeeaaba2e5dbf6784, 0x95ba2a53f983cf38, + 0x952ab45cfa97a0b2, 0xdd945a747bf26183, + 0xba756174393d88df, 0x94f971119aeef9e4, + 0xe912b9d1478ceb17, 0x7a37cd5601aab85d, + 0x91abb422ccb812ee, 0xac62e055c10ab33a, + 0xb616a12b7fe617aa, 0x577b986b314d6009, + 0xe39c49765fdf9d94, 0xed5a7e85fda0b80b, + 0x8e41ade9fbebc27d, 0x14588f13be847307, + 0xb1d219647ae6b31c, 0x596eb2d8ae258fc8, + 0xde469fbd99a05fe3, 0x6fca5f8ed9aef3bb, + 0x8aec23d680043bee, 0x25de7bb9480d5854, + 0xada72ccc20054ae9, 0xaf561aa79a10ae6a, + 0xd910f7ff28069da4, 0x1b2ba1518094da04, + 0x87aa9aff79042286, 0x90fb44d2f05d0842, + 0xa99541bf57452b28, 0x353a1607ac744a53, + 0xd3fa922f2d1675f2, 0x42889b8997915ce8, + 0x847c9b5d7c2e09b7, 0x69956135febada11, + 0xa59bc234db398c25, 0x43fab9837e699095, + 0xcf02b2c21207ef2e, 
0x94f967e45e03f4bb, + 0x8161afb94b44f57d, 0x1d1be0eebac278f5, + 0xa1ba1ba79e1632dc, 0x6462d92a69731732, + 0xca28a291859bbf93, 0x7d7b8f7503cfdcfe, + 0xfcb2cb35e702af78, 0x5cda735244c3d43e, + 0x9defbf01b061adab, 0x3a0888136afa64a7, + 0xc56baec21c7a1916, 0x88aaa1845b8fdd0, + 0xf6c69a72a3989f5b, 0x8aad549e57273d45, + 0x9a3c2087a63f6399, 0x36ac54e2f678864b, + 0xc0cb28a98fcf3c7f, 0x84576a1bb416a7dd, + 0xf0fdf2d3f3c30b9f, 0x656d44a2a11c51d5, + 0x969eb7c47859e743, 0x9f644ae5a4b1b325, + 0xbc4665b596706114, 0x873d5d9f0dde1fee, + 0xeb57ff22fc0c7959, 0xa90cb506d155a7ea, + 0x9316ff75dd87cbd8, 0x9a7f12442d588f2, + 0xb7dcbf5354e9bece, 0xc11ed6d538aeb2f, + 0xe5d3ef282a242e81, 0x8f1668c8a86da5fa, + 0x8fa475791a569d10, 0xf96e017d694487bc, + 0xb38d92d760ec4455, 0x37c981dcc395a9ac, + 0xe070f78d3927556a, 0x85bbe253f47b1417, + 0x8c469ab843b89562, 0x93956d7478ccec8e, + 0xaf58416654a6babb, 0x387ac8d1970027b2, + 0xdb2e51bfe9d0696a, 0x6997b05fcc0319e, + 0x88fcf317f22241e2, 0x441fece3bdf81f03, + 0xab3c2fddeeaad25a, 0xd527e81cad7626c3, + 0xd60b3bd56a5586f1, 0x8a71e223d8d3b074, + 0x85c7056562757456, 0xf6872d5667844e49, + 0xa738c6bebb12d16c, 0xb428f8ac016561db, + 0xd106f86e69d785c7, 0xe13336d701beba52, + 0x82a45b450226b39c, 0xecc0024661173473, + 0xa34d721642b06084, 0x27f002d7f95d0190, + 0xcc20ce9bd35c78a5, 0x31ec038df7b441f4, + 0xff290242c83396ce, 0x7e67047175a15271, + 0x9f79a169bd203e41, 0xf0062c6e984d386, + 0xc75809c42c684dd1, 0x52c07b78a3e60868, + 0xf92e0c3537826145, 0xa7709a56ccdf8a82, + 0x9bbcc7a142b17ccb, 0x88a66076400bb691, + 0xc2abf989935ddbfe, 0x6acff893d00ea435, + 0xf356f7ebf83552fe, 0x583f6b8c4124d43, + 0x98165af37b2153de, 0xc3727a337a8b704a, + 0xbe1bf1b059e9a8d6, 0x744f18c0592e4c5c, + 0xeda2ee1c7064130c, 0x1162def06f79df73, + 0x9485d4d1c63e8be7, 0x8addcb5645ac2ba8, + 0xb9a74a0637ce2ee1, 0x6d953e2bd7173692, + 0xe8111c87c5c1ba99, 0xc8fa8db6ccdd0437, + 0x910ab1d4db9914a0, 0x1d9c9892400a22a2, + 0xb54d5e4a127f59c8, 0x2503beb6d00cab4b, + 0xe2a0b5dc971f303a, 0x2e44ae64840fd61d, + 0x8da471a9de737e24, 0x5ceaecfed289e5d2, + 0xb10d8e1456105dad, 0x7425a83e872c5f47, + 0xdd50f1996b947518, 0xd12f124e28f77719, + 0x8a5296ffe33cc92f, 0x82bd6b70d99aaa6f, + 0xace73cbfdc0bfb7b, 0x636cc64d1001550b, + 0xd8210befd30efa5a, 0x3c47f7e05401aa4e, + 0x8714a775e3e95c78, 0x65acfaec34810a71, + 0xa8d9d1535ce3b396, 0x7f1839a741a14d0d, + 0xd31045a8341ca07c, 0x1ede48111209a050, + 0x83ea2b892091e44d, 0x934aed0aab460432, + 0xa4e4b66b68b65d60, 0xf81da84d5617853f, + 0xce1de40642e3f4b9, 0x36251260ab9d668e, + 0x80d2ae83e9ce78f3, 0xc1d72b7c6b426019, + 0xa1075a24e4421730, 0xb24cf65b8612f81f, + 0xc94930ae1d529cfc, 0xdee033f26797b627, + 0xfb9b7cd9a4a7443c, 0x169840ef017da3b1, + 0x9d412e0806e88aa5, 0x8e1f289560ee864e, + 0xc491798a08a2ad4e, 0xf1a6f2bab92a27e2, + 0xf5b5d7ec8acb58a2, 0xae10af696774b1db, + 0x9991a6f3d6bf1765, 0xacca6da1e0a8ef29, + 0xbff610b0cc6edd3f, 0x17fd090a58d32af3, + 0xeff394dcff8a948e, 0xddfc4b4cef07f5b0, + 0x95f83d0a1fb69cd9, 0x4abdaf101564f98e, + 0xbb764c4ca7a4440f, 0x9d6d1ad41abe37f1, + 0xea53df5fd18d5513, 0x84c86189216dc5ed, + 0x92746b9be2f8552c, 0x32fd3cf5b4e49bb4, + 0xb7118682dbb66a77, 0x3fbc8c33221dc2a1, + 0xe4d5e82392a40515, 0xfabaf3feaa5334a, + 0x8f05b1163ba6832d, 0x29cb4d87f2a7400e, + 0xb2c71d5bca9023f8, 0x743e20e9ef511012, + 0xdf78e4b2bd342cf6, 0x914da9246b255416, + 0x8bab8eefb6409c1a, 0x1ad089b6c2f7548e, + 0xae9672aba3d0c320, 0xa184ac2473b529b1, + 0xda3c0f568cc4f3e8, 0xc9e5d72d90a2741e, + 0x8865899617fb1871, 0x7e2fa67c7a658892, + 0xaa7eebfb9df9de8d, 0xddbb901b98feeab7, + 0xd51ea6fa85785631, 0x552a74227f3ea565, + 
0x8533285c936b35de, 0xd53a88958f87275f, + 0xa67ff273b8460356, 0x8a892abaf368f137, + 0xd01fef10a657842c, 0x2d2b7569b0432d85, + 0x8213f56a67f6b29b, 0x9c3b29620e29fc73, + 0xa298f2c501f45f42, 0x8349f3ba91b47b8f, + 0xcb3f2f7642717713, 0x241c70a936219a73, + 0xfe0efb53d30dd4d7, 0xed238cd383aa0110, + 0x9ec95d1463e8a506, 0xf4363804324a40aa, + 0xc67bb4597ce2ce48, 0xb143c6053edcd0d5, + 0xf81aa16fdc1b81da, 0xdd94b7868e94050a, + 0x9b10a4e5e9913128, 0xca7cf2b4191c8326, + 0xc1d4ce1f63f57d72, 0xfd1c2f611f63a3f0, + 0xf24a01a73cf2dccf, 0xbc633b39673c8cec, + 0x976e41088617ca01, 0xd5be0503e085d813, + 0xbd49d14aa79dbc82, 0x4b2d8644d8a74e18, + 0xec9c459d51852ba2, 0xddf8e7d60ed1219e, + 0x93e1ab8252f33b45, 0xcabb90e5c942b503, + 0xb8da1662e7b00a17, 0x3d6a751f3b936243, + 0xe7109bfba19c0c9d, 0xcc512670a783ad4, + 0x906a617d450187e2, 0x27fb2b80668b24c5, + 0xb484f9dc9641e9da, 0xb1f9f660802dedf6, + 0xe1a63853bbd26451, 0x5e7873f8a0396973, + 0x8d07e33455637eb2, 0xdb0b487b6423e1e8, + 0xb049dc016abc5e5f, 0x91ce1a9a3d2cda62, + 0xdc5c5301c56b75f7, 0x7641a140cc7810fb, + 0x89b9b3e11b6329ba, 0xa9e904c87fcb0a9d, + 0xac2820d9623bf429, 0x546345fa9fbdcd44, + 0xd732290fbacaf133, 0xa97c177947ad4095, + 0x867f59a9d4bed6c0, 0x49ed8eabcccc485d, + 0xa81f301449ee8c70, 0x5c68f256bfff5a74, + 0xd226fc195c6a2f8c, 0x73832eec6fff3111, + 0x83585d8fd9c25db7, 0xc831fd53c5ff7eab, + 0xa42e74f3d032f525, 0xba3e7ca8b77f5e55, + 0xcd3a1230c43fb26f, 0x28ce1bd2e55f35eb, + 0x80444b5e7aa7cf85, 0x7980d163cf5b81b3, + 0xa0555e361951c366, 0xd7e105bcc332621f, + 0xc86ab5c39fa63440, 0x8dd9472bf3fefaa7, + 0xfa856334878fc150, 0xb14f98f6f0feb951, + 0x9c935e00d4b9d8d2, 0x6ed1bf9a569f33d3, + 0xc3b8358109e84f07, 0xa862f80ec4700c8, + 0xf4a642e14c6262c8, 0xcd27bb612758c0fa, + 0x98e7e9cccfbd7dbd, 0x8038d51cb897789c, + 0xbf21e44003acdd2c, 0xe0470a63e6bd56c3, + 0xeeea5d5004981478, 0x1858ccfce06cac74, + 0x95527a5202df0ccb, 0xf37801e0c43ebc8, + 0xbaa718e68396cffd, 0xd30560258f54e6ba, + 0xe950df20247c83fd, 0x47c6b82ef32a2069, + 0x91d28b7416cdd27e, 0x4cdc331d57fa5441, + 0xb6472e511c81471d, 0xe0133fe4adf8e952, + 0xe3d8f9e563a198e5, 0x58180fddd97723a6, + 0x8e679c2f5e44ff8f, 0x570f09eaa7ea7648, + }; }; +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + template -constexpr uint64_t powers_template::power_of_five_128[number_of_entries]; +constexpr uint64_t + powers_template::power_of_five_128[number_of_entries]; + +#endif using powers = powers_template<>; diff --git a/third_party/fast_float/float_common.h b/third_party/fast_float/float_common.h index bee882152025..edc163cb472e 100644 --- a/third_party/fast_float/float_common.h +++ b/third_party/fast_float/float_common.h @@ -7,7 +7,11 @@ #include #include #include - +#ifdef __has_include +#if __has_include() && (__cplusplus > 202002L || _MSVC_LANG > 202002L) +#include +#endif +#endif #include "constexpr_feature_detect.h" namespace fast_float { @@ -28,18 +32,16 @@ enum chars_format { general = fixed | scientific }; -template -struct from_chars_result_t { - UC const* ptr; +template struct from_chars_result_t { + UC const *ptr; std::errc ec; }; using from_chars_result = from_chars_result_t; -template -struct parse_options_t { +template struct parse_options_t { constexpr explicit parse_options_t(chars_format fmt = chars_format::general, - UC dot = UC('.')) - : format(fmt), decimal_point(dot) {} + UC dot = UC('.')) + : format(fmt), decimal_point(dot) {} /** Which number formats are accepted */ chars_format format; @@ -48,39 +50,41 @@ struct parse_options_t { }; using parse_options = parse_options_t; -} +} // namespace fast_float #if 
FASTFLOAT_HAS_BIT_CAST #include #endif -#if (defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) \ - || defined(__amd64) || defined(__aarch64__) || defined(_M_ARM64) \ - || defined(__MINGW64__) \ - || defined(__s390x__) \ - || (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)) \ - || defined(__loongarch64) ) +#if (defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) || \ + defined(__amd64) || defined(__aarch64__) || defined(_M_ARM64) || \ + defined(__MINGW64__) || defined(__s390x__) || \ + (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || \ + defined(__PPC64LE__)) || \ + defined(__loongarch64)) #define FASTFLOAT_64BIT 1 -#elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) \ - || defined(__arm__) || defined(_M_ARM) || defined(__ppc__) \ - || defined(__MINGW32__) || defined(__EMSCRIPTEN__)) +#elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + defined(__arm__) || defined(_M_ARM) || defined(__ppc__) || \ + defined(__MINGW32__) || defined(__EMSCRIPTEN__)) #define FASTFLOAT_32BIT 1 #else // Need to check incrementally, since SIZE_MAX is a size_t, avoid overflow. - // We can never tell the register width, but the SIZE_MAX is a good approximation. - // UINTPTR_MAX and INTPTR_MAX are optional, so avoid them for max portability. - #if SIZE_MAX == 0xffff - #error Unknown platform (16-bit, unsupported) - #elif SIZE_MAX == 0xffffffff - #define FASTFLOAT_32BIT 1 - #elif SIZE_MAX == 0xffffffffffffffff - #define FASTFLOAT_64BIT 1 - #else - #error Unknown platform (not 32-bit, not 64-bit?) - #endif -#endif - -#if ((defined(_WIN32) || defined(_WIN64)) && !defined(__clang__)) +// We can never tell the register width, but the SIZE_MAX is a good +// approximation. UINTPTR_MAX and INTPTR_MAX are optional, so avoid them for max +// portability. +#if SIZE_MAX == 0xffff +#error Unknown platform (16-bit, unsupported) +#elif SIZE_MAX == 0xffffffff +#define FASTFLOAT_32BIT 1 +#elif SIZE_MAX == 0xffffffffffffffff +#define FASTFLOAT_64BIT 1 +#else +#error Unknown platform (not 32-bit, not 64-bit?) 
+#endif +#endif + +#if ((defined(_WIN32) || defined(_WIN64)) && !defined(__clang__)) || \ + (defined(_M_ARM64) && !defined(__MINGW32__)) #include #endif @@ -124,9 +128,9 @@ using parse_options = parse_options_t; #endif #endif -#if defined(__SSE2__) || \ - (defined(FASTFLOAT_VISUAL_STUDIO) && \ - (defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP == 2))) +#if defined(__SSE2__) || (defined(FASTFLOAT_VISUAL_STUDIO) && \ + (defined(_M_AMD64) || defined(_M_X64) || \ + (defined(_M_IX86_FP) && _M_IX86_FP == 2))) #define FASTFLOAT_SSE2 1 #endif @@ -134,28 +138,25 @@ using parse_options = parse_options_t; #define FASTFLOAT_NEON 1 #endif -#if defined(FASTFLOAT_SSE2) || defined(FASTFLOAT_ARM64) +#if defined(FASTFLOAT_SSE2) || defined(FASTFLOAT_NEON) #define FASTFLOAT_HAS_SIMD 1 #endif #if defined(__GNUC__) // disable -Wcast-align=strict (GCC only) -#define FASTFLOAT_SIMD_DISABLE_WARNINGS \ - _Pragma("GCC diagnostic push") \ - _Pragma("GCC diagnostic ignored \"-Wcast-align\"") +#define FASTFLOAT_SIMD_DISABLE_WARNINGS \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wcast-align\"") #else #define FASTFLOAT_SIMD_DISABLE_WARNINGS #endif #if defined(__GNUC__) -#define FASTFLOAT_SIMD_RESTORE_WARNINGS \ - _Pragma("GCC diagnostic pop") +#define FASTFLOAT_SIMD_RESTORE_WARNINGS _Pragma("GCC diagnostic pop") #else #define FASTFLOAT_SIMD_RESTORE_WARNINGS #endif - - #ifdef FASTFLOAT_VISUAL_STUDIO #define fastfloat_really_inline __forceinline #else @@ -163,18 +164,24 @@ using parse_options = parse_options_t; #endif #ifndef FASTFLOAT_ASSERT -#define FASTFLOAT_ASSERT(x) { ((void)(x)); } +#define FASTFLOAT_ASSERT(x) \ + { ((void)(x)); } #endif #ifndef FASTFLOAT_DEBUG_ASSERT -#define FASTFLOAT_DEBUG_ASSERT(x) { ((void)(x)); } +#define FASTFLOAT_DEBUG_ASSERT(x) \ + { ((void)(x)); } #endif // rust style `try!()` macro, or `?` operator -#define FASTFLOAT_TRY(x) { if (!(x)) return false; } - -#define FASTFLOAT_ENABLE_IF(...) typename std::enable_if<(__VA_ARGS__), int>::type = 0 +#define FASTFLOAT_TRY(x) \ + { \ + if (!(x)) \ + return false; \ + } +#define FASTFLOAT_ENABLE_IF(...) \ + typename std::enable_if<(__VA_ARGS__), int>::type namespace fast_float { @@ -186,10 +193,28 @@ fastfloat_really_inline constexpr bool cpp20_and_in_constexpr() { #endif } +template +fastfloat_really_inline constexpr bool is_supported_float_type() { + return std::is_same::value || std::is_same::value +#if __STDCPP_FLOAT32_T__ + || std::is_same::value +#endif +#if __STDCPP_FLOAT64_T__ + || std::is_same::value +#endif + ; +} + +template +fastfloat_really_inline constexpr bool is_supported_char_type() { + return std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value; +} + // Compares two ASCII strings in a case insensitive manner. 
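// It relies on bit 0x20 being the only difference between the upper- and
// lower-case forms of an ASCII letter: the OR of all per-character XORs is
// 0 for an exact match and 0x20 for a match up to letter case.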
template inline FASTFLOAT_CONSTEXPR14 bool -fastfloat_strncasecmp(UC const * input1, UC const * input2, size_t length) { +fastfloat_strncasecmp(UC const *input1, UC const *input2, size_t length) { char running_diff{0}; for (size_t i = 0; i < length; ++i) { running_diff |= (char(input1[i]) ^ char(input2[i])); @@ -202,18 +227,15 @@ fastfloat_strncasecmp(UC const * input1, UC const * input2, size_t length) { #endif // a pointer and a length to a contiguous block of memory -template -struct span { - const T* ptr; +template struct span { + const T *ptr; size_t length; - constexpr span(const T* _ptr, size_t _length) : ptr(_ptr), length(_length) {} + constexpr span(const T *_ptr, size_t _length) : ptr(_ptr), length(_length) {} constexpr span() : ptr(nullptr), length(0) {} - constexpr size_t len() const noexcept { - return length; - } + constexpr size_t len() const noexcept { return length; } - FASTFLOAT_CONSTEXPR14 const T& operator[](size_t index) const noexcept { + FASTFLOAT_CONSTEXPR14 const T &operator[](size_t index) const noexcept { FASTFLOAT_DEBUG_ASSERT(index < length); return ptr[index]; } @@ -227,34 +249,51 @@ struct value128 { }; /* Helper C++14 constexpr generic implementation of leading_zeroes */ -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -int leading_zeroes_generic(uint64_t input_num, int last_bit = 0) { - if(input_num & uint64_t(0xffffffff00000000)) { input_num >>= 32; last_bit |= 32; } - if(input_num & uint64_t( 0xffff0000)) { input_num >>= 16; last_bit |= 16; } - if(input_num & uint64_t( 0xff00)) { input_num >>= 8; last_bit |= 8; } - if(input_num & uint64_t( 0xf0)) { input_num >>= 4; last_bit |= 4; } - if(input_num & uint64_t( 0xc)) { input_num >>= 2; last_bit |= 2; } - if(input_num & uint64_t( 0x2)) { input_num >>= 1; last_bit |= 1; } - return 63 - last_bit; +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int +leading_zeroes_generic(uint64_t input_num, int last_bit = 0) { + if (input_num & uint64_t(0xffffffff00000000)) { + input_num >>= 32; + last_bit |= 32; + } + if (input_num & uint64_t(0xffff0000)) { + input_num >>= 16; + last_bit |= 16; + } + if (input_num & uint64_t(0xff00)) { + input_num >>= 8; + last_bit |= 8; + } + if (input_num & uint64_t(0xf0)) { + input_num >>= 4; + last_bit |= 4; + } + if (input_num & uint64_t(0xc)) { + input_num >>= 2; + last_bit |= 2; + } + if (input_num & uint64_t(0x2)) { /* input_num >>= 1; */ + last_bit |= 1; + } + return 63 - last_bit; } /* result might be undefined when input_num is zero */ -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -int leading_zeroes(uint64_t input_num) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 int +leading_zeroes(uint64_t input_num) { assert(input_num > 0); if (cpp20_and_in_constexpr()) { return leading_zeroes_generic(input_num); } #ifdef FASTFLOAT_VISUAL_STUDIO - #if defined(_M_X64) || defined(_M_ARM64) +#if defined(_M_X64) || defined(_M_ARM64) unsigned long leading_zero = 0; // Search the mask data from most significant bit (MSB) // to least significant bit (LSB) for a set bit (1). 
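// _BitScanReverse64 writes the zero-based index of the highest set bit,
// so the leading-zero count is 63 minus that index.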
_BitScanReverse64(&leading_zero, input_num); return (int)(63 - leading_zero); - #else +#else return leading_zeroes_generic(input_num); - #endif +#endif #else return __builtin_clzll(input_num); #endif @@ -262,18 +301,18 @@ int leading_zeroes(uint64_t input_num) { // slow emulation routine for 32-bit fastfloat_really_inline constexpr uint64_t emulu(uint32_t x, uint32_t y) { - return x * (uint64_t)y; + return x * (uint64_t)y; } -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -uint64_t umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t +umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) { uint64_t ad = emulu((uint32_t)(ab >> 32), (uint32_t)cd); uint64_t bd = emulu((uint32_t)ab, (uint32_t)cd); uint64_t adbc = ad + emulu((uint32_t)ab, (uint32_t)(cd >> 32)); - uint64_t adbc_carry = !!(adbc < ad); + uint64_t adbc_carry = (uint64_t)(adbc < ad); uint64_t lo = bd + (adbc << 32); *hi = emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) + - (adbc_carry << 32) + !!(lo < bd); + (adbc_carry << 32) + (uint64_t)(lo < bd); return lo; } @@ -281,18 +320,18 @@ uint64_t umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) { // slow emulation routine for 32-bit #if !defined(__MINGW64__) -fastfloat_really_inline FASTFLOAT_CONSTEXPR14 -uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t _umul128(uint64_t ab, + uint64_t cd, + uint64_t *hi) { return umul128_generic(ab, cd, hi); } #endif // !__MINGW64__ #endif // FASTFLOAT_32BIT - // compute 64-bit a*b -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -value128 full_multiplication(uint64_t a, uint64_t b) { +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 value128 +full_multiplication(uint64_t a, uint64_t b) { if (cpp20_and_in_constexpr()) { value128 answer; answer.low = umul128_generic(a, b, &answer.high); @@ -304,9 +343,10 @@ value128 full_multiplication(uint64_t a, uint64_t b) { // But MinGW on ARM64 doesn't have native support for 64-bit multiplications answer.high = __umulh(a, b); answer.low = a * b; -#elif defined(FASTFLOAT_32BIT) || (defined(_WIN64) && !defined(__clang__)) +#elif defined(FASTFLOAT_32BIT) || \ + (defined(_WIN64) && !defined(__clang__) && !defined(_M_ARM64)) answer.low = _umul128(a, b, &answer.high); // _umul128 not available on ARM64 -#elif defined(FASTFLOAT_64BIT) +#elif defined(FASTFLOAT_64BIT) && defined(__SIZEOF_INT128__) __uint128_t r = ((__uint128_t)a) * b; answer.low = uint64_t(r); answer.high = uint64_t(r >> 64); @@ -334,22 +374,24 @@ constexpr static int32_t invalid_am_bias = -0x8000; // used for binary_format_lookup_tables::max_mantissa constexpr uint64_t constant_55555 = 5 * 5 * 5 * 5 * 5; -template -struct binary_format_lookup_tables; +template struct binary_format_lookup_tables; template struct binary_format : binary_format_lookup_tables { - using equiv_uint = typename std::conditional::type; + using equiv_uint = + typename std::conditional::type; static inline constexpr int mantissa_explicit_bits(); static inline constexpr int minimum_exponent(); static inline constexpr int infinite_power(); static inline constexpr int sign_index(); - static inline constexpr int min_exponent_fast_path(); // used when fegetround() == FE_TONEAREST + static inline constexpr int + min_exponent_fast_path(); // used when fegetround() == FE_TONEAREST static inline constexpr int max_exponent_fast_path(); static inline constexpr int max_exponent_round_to_even(); static inline constexpr int 
min_exponent_round_to_even(); static inline constexpr uint64_t max_mantissa_fast_path(int64_t power); - static inline constexpr uint64_t max_mantissa_fast_path(); // used when fegetround() == FE_TONEAREST + static inline constexpr uint64_t + max_mantissa_fast_path(); // used when fegetround() == FE_TONEAREST static inline constexpr int largest_power_of_ten(); static inline constexpr int smallest_power_of_ten(); static inline constexpr T exact_power_of_ten(int64_t power); @@ -359,76 +401,91 @@ template struct binary_format : binary_format_lookup_tables { static inline constexpr equiv_uint hidden_bit_mask(); }; -template -struct binary_format_lookup_tables { +template struct binary_format_lookup_tables { static constexpr double powers_of_ten[] = { 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22}; // Largest integer value v so that (5**index * v) <= 1<<53. - // 0x10000000000000 == 1 << 53 + // 0x20000000000000 == 1 << 53 static constexpr uint64_t max_mantissa[] = { - 0x10000000000000, - 0x10000000000000 / 5, - 0x10000000000000 / (5 * 5), - 0x10000000000000 / (5 * 5 * 5), - 0x10000000000000 / (5 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555), - 0x10000000000000 / (constant_55555 * 5), - 0x10000000000000 / (constant_55555 * 5 * 5), - 0x10000000000000 / (constant_55555 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * 5 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555), - 0x10000000000000 / (constant_55555 * constant_55555 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * constant_55555), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * constant_55555 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * constant_55555 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5), - 0x10000000000000 / (constant_55555 * constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5 * 5)}; + 0x20000000000000, + 0x20000000000000 / 5, + 0x20000000000000 / (5 * 5), + 0x20000000000000 / (5 * 5 * 5), + 0x20000000000000 / (5 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555), + 0x20000000000000 / (constant_55555 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * 5 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5), + 0x20000000000000 / + 
(constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5 * 5 * 5)}; }; +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + template constexpr double binary_format_lookup_tables::powers_of_ten[]; template constexpr uint64_t binary_format_lookup_tables::max_mantissa[]; -template -struct binary_format_lookup_tables { +#endif + +template struct binary_format_lookup_tables { static constexpr float powers_of_ten[] = {1e0f, 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, - 1e6f, 1e7f, 1e8f, 1e9f, 1e10f}; + 1e6f, 1e7f, 1e8f, 1e9f, 1e10f}; // Largest integer value v so that (5**index * v) <= 1<<24. // 0x1000000 == 1<<24 static constexpr uint64_t max_mantissa[] = { - 0x1000000, - 0x1000000 / 5, - 0x1000000 / (5 * 5), - 0x1000000 / (5 * 5 * 5), - 0x1000000 / (5 * 5 * 5 * 5), - 0x1000000 / (constant_55555), - 0x1000000 / (constant_55555 * 5), - 0x1000000 / (constant_55555 * 5 * 5), - 0x1000000 / (constant_55555 * 5 * 5 * 5), - 0x1000000 / (constant_55555 * 5 * 5 * 5 * 5), - 0x1000000 / (constant_55555 * constant_55555), - 0x1000000 / (constant_55555 * constant_55555 * 5)}; + 0x1000000, + 0x1000000 / 5, + 0x1000000 / (5 * 5), + 0x1000000 / (5 * 5 * 5), + 0x1000000 / (5 * 5 * 5 * 5), + 0x1000000 / (constant_55555), + 0x1000000 / (constant_55555 * 5), + 0x1000000 / (constant_55555 * 5 * 5), + 0x1000000 / (constant_55555 * 5 * 5 * 5), + 0x1000000 / (constant_55555 * 5 * 5 * 5 * 5), + 0x1000000 / (constant_55555 * constant_55555), + 0x1000000 / (constant_55555 * constant_55555 * 5)}; }; +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + template constexpr float binary_format_lookup_tables::powers_of_ten[]; template constexpr uint64_t binary_format_lookup_tables::max_mantissa[]; -template <> inline constexpr int binary_format::min_exponent_fast_path() { +#endif + +template <> +inline constexpr int binary_format::min_exponent_fast_path() { #if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) return 0; #else @@ -436,7 +493,8 @@ template <> inline constexpr int binary_format::min_exponent_fast_path() #endif } -template <> inline constexpr int binary_format::min_exponent_fast_path() { +template <> +inline constexpr int binary_format::min_exponent_fast_path() { #if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) return 0; #else @@ -444,26 +502,32 @@ template <> inline constexpr int binary_format::min_exponent_fast_path() #endif } -template <> inline constexpr int binary_format::mantissa_explicit_bits() { +template <> +inline constexpr int binary_format::mantissa_explicit_bits() { return 52; } -template <> inline constexpr int binary_format::mantissa_explicit_bits() { +template <> +inline constexpr int binary_format::mantissa_explicit_bits() { return 23; } -template <> inline constexpr int binary_format::max_exponent_round_to_even() { +template <> +inline constexpr int binary_format::max_exponent_round_to_even() { return 23; } -template <> inline constexpr int binary_format::max_exponent_round_to_even() { +template <> +inline constexpr int binary_format::max_exponent_round_to_even() { return 10; } -template <> inline constexpr int 
binary_format::min_exponent_round_to_even() { +template <> +inline constexpr int binary_format::min_exponent_round_to_even() { return -4; } -template <> inline constexpr int binary_format::min_exponent_round_to_even() { +template <> +inline constexpr int binary_format::min_exponent_round_to_even() { return -17; } @@ -481,30 +545,42 @@ template <> inline constexpr int binary_format::infinite_power() { return 0xFF; } -template <> inline constexpr int binary_format::sign_index() { return 63; } -template <> inline constexpr int binary_format::sign_index() { return 31; } +template <> inline constexpr int binary_format::sign_index() { + return 63; +} +template <> inline constexpr int binary_format::sign_index() { + return 31; +} -template <> inline constexpr int binary_format::max_exponent_fast_path() { +template <> +inline constexpr int binary_format::max_exponent_fast_path() { return 22; } -template <> inline constexpr int binary_format::max_exponent_fast_path() { +template <> +inline constexpr int binary_format::max_exponent_fast_path() { return 10; } -template <> inline constexpr uint64_t binary_format::max_mantissa_fast_path() { +template <> +inline constexpr uint64_t binary_format::max_mantissa_fast_path() { return uint64_t(2) << mantissa_explicit_bits(); } -template <> inline constexpr uint64_t binary_format::max_mantissa_fast_path(int64_t power) { +template <> +inline constexpr uint64_t +binary_format::max_mantissa_fast_path(int64_t power) { // caller is responsible to ensure that // power >= 0 && power <= 22 // // Work around clang bug https://godbolt.org/z/zedh7rrhc return (void)max_mantissa[0], max_mantissa[power]; } -template <> inline constexpr uint64_t binary_format::max_mantissa_fast_path() { +template <> +inline constexpr uint64_t binary_format::max_mantissa_fast_path() { return uint64_t(2) << mantissa_explicit_bits(); } -template <> inline constexpr uint64_t binary_format::max_mantissa_fast_path(int64_t power) { +template <> +inline constexpr uint64_t +binary_format::max_mantissa_fast_path(int64_t power) { // caller is responsible to ensure that // power >= 0 && power <= 10 // @@ -513,7 +589,8 @@ template <> inline constexpr uint64_t binary_format::max_mantissa_fast_pa } template <> -inline constexpr double binary_format::exact_power_of_ten(int64_t power) { +inline constexpr double +binary_format::exact_power_of_ten(int64_t power) { // Work around clang bug https://godbolt.org/z/zedh7rrhc return (void)powers_of_ten[0], powers_of_ten[power]; } @@ -523,13 +600,10 @@ inline constexpr float binary_format::exact_power_of_ten(int64_t power) { return (void)powers_of_ten[0], powers_of_ten[power]; } - -template <> -inline constexpr int binary_format::largest_power_of_ten() { +template <> inline constexpr int binary_format::largest_power_of_ten() { return 308; } -template <> -inline constexpr int binary_format::largest_power_of_ten() { +template <> inline constexpr int binary_format::largest_power_of_ten() { return 38; } @@ -537,9 +611,8 @@ template <> inline constexpr int binary_format::smallest_power_of_ten() { return -342; } -template <> -inline constexpr int binary_format::smallest_power_of_ten() { - return -65; +template <> inline constexpr int binary_format::smallest_power_of_ten() { + return -64; } template <> inline constexpr size_t binary_format::max_digits() { @@ -549,39 +622,46 @@ template <> inline constexpr size_t binary_format::max_digits() { return 114; } -template <> inline constexpr binary_format::equiv_uint - binary_format::exponent_mask() { +template <> +inline 
constexpr binary_format::equiv_uint +binary_format::exponent_mask() { return 0x7F800000; } -template <> inline constexpr binary_format::equiv_uint - binary_format::exponent_mask() { +template <> +inline constexpr binary_format::equiv_uint +binary_format::exponent_mask() { return 0x7FF0000000000000; } -template <> inline constexpr binary_format::equiv_uint - binary_format::mantissa_mask() { +template <> +inline constexpr binary_format::equiv_uint +binary_format::mantissa_mask() { return 0x007FFFFF; } -template <> inline constexpr binary_format::equiv_uint - binary_format::mantissa_mask() { +template <> +inline constexpr binary_format::equiv_uint +binary_format::mantissa_mask() { return 0x000FFFFFFFFFFFFF; } -template <> inline constexpr binary_format::equiv_uint - binary_format::hidden_bit_mask() { +template <> +inline constexpr binary_format::equiv_uint +binary_format::hidden_bit_mask() { return 0x00800000; } -template <> inline constexpr binary_format::equiv_uint - binary_format::hidden_bit_mask() { +template <> +inline constexpr binary_format::equiv_uint +binary_format::hidden_bit_mask() { return 0x0010000000000000; } -template -fastfloat_really_inline FASTFLOAT_CONSTEXPR20 -void to_float(bool negative, adjusted_mantissa am, T &value) { +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +to_float(bool negative, adjusted_mantissa am, T &value) { using fastfloat_uint = typename binary_format::equiv_uint; fastfloat_uint word = (fastfloat_uint)am.mantissa; - word |= fastfloat_uint(am.power2) << binary_format::mantissa_explicit_bits(); + word |= fastfloat_uint(am.power2) + << binary_format::mantissa_explicit_bits(); word |= fastfloat_uint(negative) << binary_format::sign_index(); #if FASTFLOAT_HAS_BIT_CAST value = std::bit_cast(word); @@ -591,89 +671,132 @@ void to_float(bool negative, adjusted_mantissa am, T &value) { } #ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default -template -struct space_lut { +template struct space_lut { static constexpr bool value[] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; }; -template -constexpr bool space_lut::value[]; +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template constexpr bool space_lut::value[]; + +#endif inline constexpr bool is_space(uint8_t c) { return space_lut<>::value[c]; } #endif -template -static constexpr uint64_t int_cmp_zeros() -{ - static_assert((sizeof(UC) == 1) || (sizeof(UC) == 2) || (sizeof(UC) == 4), "Unsupported character size"); - return (sizeof(UC) == 1) ? 0x3030303030303030 : (sizeof(UC) == 2) ? (uint64_t(UC('0')) << 48 | uint64_t(UC('0')) << 32 | uint64_t(UC('0')) << 16 | UC('0')) : (uint64_t(UC('0')) << 32 | UC('0')); -} -template -static constexpr int int_cmp_len() -{ - return sizeof(uint64_t) / sizeof(UC); -} -template -static constexpr UC const * str_const_nan() -{ - return nullptr; -} -template<> -constexpr char const * str_const_nan() -{ - return "nan"; -} -template<> -constexpr wchar_t const * str_const_nan() -{ - return L"nan"; -} -template<> -constexpr char16_t const * str_const_nan() -{ - return u"nan"; -} -template<> -constexpr char32_t const * str_const_nan() -{ - return U"nan"; -} -template -static constexpr UC const * str_const_inf() -{ - return nullptr; -} -template<> -constexpr char const * str_const_inf() -{ - return "infinity"; -} -template<> -constexpr wchar_t const * str_const_inf() -{ - return L"infinity"; -} -template<> -constexpr char16_t const * str_const_inf() -{ - return u"infinity"; -} -template<> -constexpr char32_t const * str_const_inf() -{ - return U"infinity"; +template static constexpr uint64_t int_cmp_zeros() { + static_assert((sizeof(UC) == 1) || (sizeof(UC) == 2) || (sizeof(UC) == 4), + "Unsupported character size"); + return (sizeof(UC) == 1) ? 0x3030303030303030 + : (sizeof(UC) == 2) + ? (uint64_t(UC('0')) << 48 | uint64_t(UC('0')) << 32 | + uint64_t(UC('0')) << 16 | UC('0')) + : (uint64_t(UC('0')) << 32 | UC('0')); +} +template static constexpr int int_cmp_len() { + return sizeof(uint64_t) / sizeof(UC); +} +template static constexpr UC const *str_const_nan() { + return nullptr; +} +template <> constexpr char const *str_const_nan() { return "nan"; } +template <> constexpr wchar_t const *str_const_nan() { return L"nan"; } +template <> constexpr char16_t const *str_const_nan() { + return u"nan"; +} +template <> constexpr char32_t const *str_const_nan() { + return U"nan"; +} +template static constexpr UC const *str_const_inf() { + return nullptr; +} +template <> constexpr char const *str_const_inf() { return "infinity"; } +template <> constexpr wchar_t const *str_const_inf() { + return L"infinity"; +} +template <> constexpr char16_t const *str_const_inf() { + return u"infinity"; +} +template <> constexpr char32_t const *str_const_inf() { + return U"infinity"; +} + +template struct int_luts { + static constexpr uint8_t chdigit[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, + 35, 255, 255, 255, 255, 255, 255, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255}; + + static constexpr size_t maxdigits_u64[] = { + 64, 41, 32, 28, 25, 23, 22, 21, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, + 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13}; + + static constexpr uint64_t min_safe_u64[] = { + 9223372036854775808ull, 12157665459056928801ull, 4611686018427387904, + 7450580596923828125, 4738381338321616896, 3909821048582988049, + 9223372036854775808ull, 12157665459056928801ull, 10000000000000000000ull, + 5559917313492231481, 2218611106740436992, 8650415919381337933, + 2177953337809371136, 6568408355712890625, 1152921504606846976, + 2862423051509815793, 6746640616477458432, 15181127029874798299ull, + 1638400000000000000, 3243919932521508681, 6221821273427820544, + 11592836324538749809ull, 876488338465357824, 1490116119384765625, + 2481152873203736576, 4052555153018976267, 6502111422497947648, + 10260628712958602189ull, 15943230000000000000ull, 787662783788549761, + 1152921504606846976, 1667889514952984961, 2386420683693101056, + 3379220508056640625, 4738381338321616896}; +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template constexpr uint8_t int_luts::chdigit[]; + +template constexpr size_t int_luts::maxdigits_u64[]; + +template constexpr uint64_t int_luts::min_safe_u64[]; + +#endif + +template +fastfloat_really_inline constexpr uint8_t ch_to_digit(UC c) { + return int_luts<>::chdigit[static_cast(c)]; } + +fastfloat_really_inline constexpr size_t max_digits_u64(int base) { + return int_luts<>::maxdigits_u64[base - 2]; +} + +// If a u64 is exactly max_digits_u64() in length, this is +// the value below which it has definitely overflowed. +fastfloat_really_inline constexpr uint64_t min_safe_u64(int base) { + return int_luts<>::min_safe_u64[base - 2]; +} + } // namespace fast_float #endif diff --git a/third_party/fast_float/parse_number.h b/third_party/fast_float/parse_number.h index a011a8cbf4df..6d883fb96ea1 100644 --- a/third_party/fast_float/parse_number.h +++ b/third_party/fast_float/parse_number.h @@ -10,10 +10,8 @@ #include #include #include - namespace fast_float { - namespace detail { /** * Special case +inf, -inf, nan, infinity, -infinity. @@ -21,45 +19,53 @@ namespace detail { * strings a null-free and fixed. 
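 * (Matching is case-insensitive; "inf" may be followed by "inity", and "nan"
 * by a parenthesized n-char sequence such as the "nan(ind)" that MSVC
 * produces.)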
**/ template -from_chars_result_t FASTFLOAT_CONSTEXPR14 -parse_infnan(UC const * first, UC const * last, T &value) noexcept { +from_chars_result_t FASTFLOAT_CONSTEXPR14 parse_infnan(UC const *first, + UC const *last, + T &value) noexcept { from_chars_result_t answer{}; answer.ptr = first; answer.ec = std::errc(); // be optimistic bool minusSign = false; - if (*first == UC('-')) { // assume first < last, so dereference without checks; C++17 20.19.3.(7.1) explicitly forbids '+' here - minusSign = true; - ++first; + if (*first == + UC('-')) { // assume first < last, so dereference without checks; + // C++17 20.19.3.(7.1) explicitly forbids '+' here + minusSign = true; + ++first; } #ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default if (*first == UC('+')) { - ++first; + ++first; } #endif if (last - first >= 3) { if (fastfloat_strncasecmp(first, str_const_nan(), 3)) { answer.ptr = (first += 3); - value = minusSign ? -std::numeric_limits::quiet_NaN() : std::numeric_limits::quiet_NaN(); - // Check for possible nan(n-char-seq-opt), C++17 20.19.3.7, C11 7.20.1.3.3. At least MSVC produces nan(ind) and nan(snan). - if(first != last && *first == UC('(')) { - for(UC const * ptr = first + 1; ptr != last; ++ptr) { + value = minusSign ? -std::numeric_limits::quiet_NaN() + : std::numeric_limits::quiet_NaN(); + // Check for possible nan(n-char-seq-opt), C++17 20.19.3.7, + // C11 7.20.1.3.3. At least MSVC produces nan(ind) and nan(snan). + if (first != last && *first == UC('(')) { + for (UC const *ptr = first + 1; ptr != last; ++ptr) { if (*ptr == UC(')')) { answer.ptr = ptr + 1; // valid nan(n-char-seq-opt) break; - } - else if(!((UC('a') <= *ptr && *ptr <= UC('z')) || (UC('A') <= *ptr && *ptr <= UC('Z')) || (UC('0') <= *ptr && *ptr <= UC('9')) || *ptr == UC('_'))) + } else if (!((UC('a') <= *ptr && *ptr <= UC('z')) || + (UC('A') <= *ptr && *ptr <= UC('Z')) || + (UC('0') <= *ptr && *ptr <= UC('9')) || *ptr == UC('_'))) break; // forbidden char, not nan(n-char-seq-opt) } } return answer; } if (fastfloat_strncasecmp(first, str_const_inf(), 3)) { - if ((last - first >= 8) && fastfloat_strncasecmp(first + 3, str_const_inf() + 3, 5)) { + if ((last - first >= 8) && + fastfloat_strncasecmp(first + 3, str_const_inf() + 3, 5)) { answer.ptr = first + 8; } else { answer.ptr = first + 3; } - value = minusSign ? -std::numeric_limits::infinity() : std::numeric_limits::infinity(); + value = minusSign ? -std::numeric_limits::infinity() + : std::numeric_limits::infinity(); return answer; } } @@ -89,98 +95,128 @@ fastfloat_really_inline bool rounds_to_nearest() noexcept { // // The volatile keywoard prevents the compiler from computing the function // at compile-time. - // There might be other ways to prevent compile-time optimizations (e.g., asm). - // The value does not need to be std::numeric_limits::min(), any small - // value so that 1 + x should round to 1 would do (after accounting for excess - // precision, as in 387 instructions). + // There might be other ways to prevent compile-time optimizations (e.g., + // asm). The value does not need to be std::numeric_limits::min(), any + // small value so that 1 + x should round to 1 would do (after accounting for + // excess precision, as in 387 instructions). static volatile float fmin = std::numeric_limits::min(); float fmini = fmin; // we copy it so that it gets loaded at most once. - // - // Explanation: - // Only when fegetround() == FE_TONEAREST do we have that - // fmin + 1.0f == 1.0f - fmin. 
- // - // FE_UPWARD: - // fmin + 1.0f > 1 - // 1.0f - fmin == 1 - // - // FE_DOWNWARD or FE_TOWARDZERO: - // fmin + 1.0f == 1 - // 1.0f - fmin < 1 - // - // Note: This may fail to be accurate if fast-math has been - // enabled, as rounding conventions may not apply. - #ifdef FASTFLOAT_VISUAL_STUDIO - # pragma warning(push) - // todo: is there a VS warning? - // see https://stackoverflow.com/questions/46079446/is-there-a-warning-for-floating-point-equality-checking-in-visual-studio-2013 - #elif defined(__clang__) - # pragma clang diagnostic push - # pragma clang diagnostic ignored "-Wfloat-equal" - #elif defined(__GNUC__) - # pragma GCC diagnostic push - # pragma GCC diagnostic ignored "-Wfloat-equal" - #endif +// +// Explanation: +// Only when fegetround() == FE_TONEAREST do we have that +// fmin + 1.0f == 1.0f - fmin. +// +// FE_UPWARD: +// fmin + 1.0f > 1 +// 1.0f - fmin == 1 +// +// FE_DOWNWARD or FE_TOWARDZERO: +// fmin + 1.0f == 1 +// 1.0f - fmin < 1 +// +// Note: This may fail to be accurate if fast-math has been +// enabled, as rounding conventions may not apply. +#ifdef FASTFLOAT_VISUAL_STUDIO +#pragma warning(push) +// todo: is there a VS warning? +// see +// https://stackoverflow.com/questions/46079446/is-there-a-warning-for-floating-point-equality-checking-in-visual-studio-2013 +#elif defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wfloat-equal" +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wfloat-equal" +#endif return (fmini + 1.0f == 1.0f - fmini); - #ifdef FASTFLOAT_VISUAL_STUDIO - # pragma warning(pop) - #elif defined(__clang__) - # pragma clang diagnostic pop - #elif defined(__GNUC__) - # pragma GCC diagnostic pop - #endif +#ifdef FASTFLOAT_VISUAL_STUDIO +#pragma warning(pop) +#elif defined(__clang__) +#pragma clang diagnostic pop +#elif defined(__GNUC__) +#pragma GCC diagnostic pop +#endif } } // namespace detail -template -FASTFLOAT_CONSTEXPR20 -from_chars_result_t from_chars(UC const * first, UC const * last, - T &value, chars_format fmt /*= chars_format::general*/) noexcept { - return from_chars_advanced(first, last, value, parse_options_t{fmt}); +template struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, T &value, + parse_options_t options) noexcept { + return from_chars_advanced(first, last, value, options); + } +}; + +#if __STDCPP_FLOAT32_T__ == 1 +template <> struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, std::float32_t &value, + parse_options_t options) noexcept { + // if std::float32_t is defined, and we are in C++23 mode; macro set for + // float32; set value to float due to equivalence between float and + // float32_t + float val; + auto ret = from_chars_advanced(first, last, val, options); + value = val; + return ret; + } +}; +#endif + +#if __STDCPP_FLOAT64_T__ == 1 +template <> struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, std::float64_t &value, + parse_options_t options) noexcept { + // if std::float64_t is defined, and we are in C++23 mode; macro set for + // float64; set value as double due to equivalence between double and + // float64_t + double val; + auto ret = from_chars_advanced(first, last, val, options); + value = val; + return ret; + } +}; +#endif + +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, 
UC const *last, T &value, + chars_format fmt /*= chars_format::general*/) noexcept { + return from_chars_caller::call(first, last, value, + parse_options_t(fmt)); } -template -FASTFLOAT_CONSTEXPR20 -from_chars_result_t from_chars_advanced(UC const * first, UC const * last, - T &value, parse_options_t options) noexcept { +/** + * This function overload takes parsed_number_string_t structure that is created + * and populated either by from_chars_advanced function taking chars range and + * parsing options or other parsing custom function implemented by user. + */ +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars_advanced(parsed_number_string_t &pns, T &value) noexcept { - static_assert (std::is_same::value || std::is_same::value, "only float and double are supported"); - static_assert (std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value , "only char, wchar_t, char16_t and char32_t are supported"); + static_assert(is_supported_float_type(), + "only some floating-point types are supported"); + static_assert(is_supported_char_type(), + "only char, wchar_t, char16_t and char32_t are supported"); from_chars_result_t answer; -#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default - while ((first != last) && fast_float::is_space(uint8_t(*first))) { - first++; - } -#endif - if (first == last) { - answer.ec = std::errc::invalid_argument; - answer.ptr = first; - return answer; - } - parsed_number_string_t pns = parse_number_string(first, last, options); - if (!pns.valid) { - if (options.format & chars_format::no_infnan) { - answer.ec = std::errc::invalid_argument; - answer.ptr = first; - return answer; - } else { - return detail::parse_infnan(first, last, value); - } - } answer.ec = std::errc(); // be optimistic answer.ptr = pns.lastmatch; // The implementation of the Clinger's fast path is convoluted because // we want round-to-nearest in all cases, irrespective of the rounding mode // selected on the thread. - // We proceed optimistically, assuming that detail::rounds_to_nearest() returns - // true. - if (binary_format::min_exponent_fast_path() <= pns.exponent && pns.exponent <= binary_format::max_exponent_fast_path() && !pns.too_many_digits) { + // We proceed optimistically, assuming that detail::rounds_to_nearest() + // returns true. + if (binary_format::min_exponent_fast_path() <= pns.exponent && + pns.exponent <= binary_format::max_exponent_fast_path() && + !pns.too_many_digits) { // Unfortunately, the conventional Clinger's fast path is only possible // when the system rounds to the nearest float. // @@ -188,50 +224,123 @@ from_chars_result_t from_chars_advanced(UC const * first, UC const * last, // We could check it first (before the previous branch), but // there might be performance advantages at having the check // be last. - if(!cpp20_and_in_constexpr() && detail::rounds_to_nearest()) { + if (!cpp20_and_in_constexpr() && detail::rounds_to_nearest()) { // We have that fegetround() == FE_TONEAREST. // Next is Clinger's fast path. 
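// Clinger's fast path: when the decimal mantissa fits the binary
// significand exactly and 10^|exponent| is exactly representable, a single
// correctly-rounded multiply or divide gives the correctly-rounded result.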
-      if (pns.mantissa <=binary_format<T>::max_mantissa_fast_path()) {
+      if (pns.mantissa <= binary_format<T>::max_mantissa_fast_path()) {
         value = T(pns.mantissa);
-        if (pns.exponent < 0) { value = value / binary_format<T>::exact_power_of_ten(-pns.exponent); }
-        else { value = value * binary_format<T>::exact_power_of_ten(pns.exponent); }
-        if (pns.negative) { value = -value; }
+        if (pns.exponent < 0) {
+          value = value / binary_format<T>::exact_power_of_ten(-pns.exponent);
+        } else {
+          value = value * binary_format<T>::exact_power_of_ten(pns.exponent);
+        }
+        if (pns.negative) {
+          value = -value;
+        }
         return answer;
       }
     } else {
       // We do not have that fegetround() == FE_TONEAREST.
-      // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's proposal
-      if (pns.exponent >= 0 && pns.mantissa <=binary_format<T>::max_mantissa_fast_path(pns.exponent)) {
-#if defined(__clang__)
+      // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's
+      // proposal
+      if (pns.exponent >= 0 &&
+          pns.mantissa <=
+              binary_format<T>::max_mantissa_fast_path(pns.exponent)) {
+#if defined(__clang__) || defined(FASTFLOAT_32BIT)
         // Clang may map 0 to -0.0 when fegetround() == FE_DOWNWARD
-        if(pns.mantissa == 0) {
-          value = pns.negative ? -0. : 0.;
+        if (pns.mantissa == 0) {
+          value = pns.negative ? T(-0.) : T(0.);
           return answer;
         }
 #endif
-        value = T(pns.mantissa) * binary_format<T>::exact_power_of_ten(pns.exponent);
-        if (pns.negative) { value = -value; }
+        value = T(pns.mantissa) *
+                binary_format<T>::exact_power_of_ten(pns.exponent);
+        if (pns.negative) {
+          value = -value;
+        }
         return answer;
       }
     }
   }
-  adjusted_mantissa am = compute_float<binary_format<T>>(pns.exponent, pns.mantissa);
-  if(pns.too_many_digits && am.power2 >= 0) {
-    if(am != compute_float<binary_format<T>>(pns.exponent, pns.mantissa + 1)) {
+  adjusted_mantissa am =
+      compute_float<binary_format<T>>(pns.exponent, pns.mantissa);
+  if (pns.too_many_digits && am.power2 >= 0) {
+    if (am != compute_float<binary_format<T>>(pns.exponent, pns.mantissa + 1)) {
       am = compute_error<binary_format<T>>(pns.exponent, pns.mantissa);
     }
   }
-  // If we called compute_float<binary_format<T>>(pns.exponent, pns.mantissa) and we have an invalid power (am.power2 < 0),
-  // then we need to go the long way around again. This is very uncommon.
-  if(am.power2 < 0) { am = digit_comp<T>(pns, am); }
+  // If we called compute_float<binary_format<T>>(pns.exponent, pns.mantissa)
+  // and we have an invalid power (am.power2 < 0), then we need to go the long
+  // way around again. This is very uncommon.
+  if (am.power2 < 0) {
+    am = digit_comp<T>(pns, am);
+  }
   to_float(pns.negative, am, value);
   // Test for over/underflow.
-  if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || am.power2 == binary_format<T>::infinite_power()) {
+  if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) ||
+      am.power2 == binary_format<T>::infinite_power()) {
     answer.ec = std::errc::result_out_of_range;
   }
   return answer;
 }

+template <typename T, typename UC>
+FASTFLOAT_CONSTEXPR20 from_chars_result_t<UC>
+from_chars_advanced(UC const *first, UC const *last, T &value,
+                    parse_options_t<UC> options) noexcept {
+
+  static_assert(is_supported_float_type<T>(),
+                "only some floating-point types are supported");
+  static_assert(is_supported_char_type<UC>(),
+                "only char, wchar_t, char16_t and char32_t are supported");
+
+  from_chars_result_t<UC> answer;
+#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default
+  while ((first != last) && fast_float::is_space(uint8_t(*first))) {
+    first++;
+  }
+#endif
+  if (first == last) {
+    answer.ec = std::errc::invalid_argument;
+    answer.ptr = first;
+    return answer;
+  }
+  parsed_number_string_t<UC> pns =
+      parse_number_string<UC>(first, last, options);
+  if (!pns.valid) {
+    if (options.format & chars_format::no_infnan) {
+      answer.ec = std::errc::invalid_argument;
+      answer.ptr = first;
+      return answer;
+    } else {
+      return detail::parse_infnan(first, last, value);
+    }
+  }
+
+  // call overload that takes parsed_number_string_t directly.
+  return from_chars_advanced(pns, value);
+}
+
+template <typename T, typename UC>
+FASTFLOAT_CONSTEXPR20 from_chars_result_t<UC>
+from_chars(UC const *first, UC const *last, T &value, int base) noexcept {
+  static_assert(is_supported_char_type<UC>(),
+                "only char, wchar_t, char16_t and char32_t are supported");
+
+  from_chars_result_t<UC> answer;
+#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default
+  while ((first != last) && fast_float::is_space(uint8_t(*first))) {
+    first++;
+  }
+#endif
+  if (first == last || base < 2 || base > 36) {
+    answer.ec = std::errc::invalid_argument;
+    answer.ptr = first;
+    return answer;
+  }
+  return parse_int_string(first, last, value, base);
+}
+
 } // namespace fast_float

 #endif
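
For reviewers: a minimal standalone sketch (not part of the patch) of the rounding-mode probe that the reformatted detail::rounds_to_nearest() relies on. FLT_MIN is small enough that 1.0f + FLT_MIN and 1.0f - FLT_MIN each differ from 1.0f only through the rounding direction, so the two expressions compare equal exactly when the mode is FE_TONEAREST.

// Sketch, not patch code: probe the rounding mode the same way the library
// does. The volatile qualifier mirrors the library's trick of defeating
// constant folding; as the comment in the diff notes, fast-math builds may
// still break this.
#include <cfenv>
#include <cfloat>
#include <cstdio>

static bool probe_rounds_to_nearest() {
  volatile float fmini = FLT_MIN;
  // FE_TONEAREST: both sides round to 1.0f, so they compare equal.
  // FE_UPWARD: the left side rounds up past 1.0f; unequal.
  // FE_DOWNWARD / FE_TOWARDZERO: the right side rounds below 1.0f; unequal.
  return fmini + 1.0f == 1.0f - fmini;
}

int main() {
  std::fesetround(FE_TONEAREST);
  std::printf("FE_TONEAREST -> %d\n", probe_rounds_to_nearest()); // 1
  std::fesetround(FE_DOWNWARD);
  std::printf("FE_DOWNWARD  -> %d\n", probe_rounds_to_nearest()); // 0
  std::fesetround(FE_TONEAREST); // restore the default mode
  return 0;
}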
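The from_chars_caller indirection added above lets the public fast_float::from_chars accept std::float32_t and std::float64_t in C++23 builds while still funnelling everything through the float/double parsing paths. A hedged usage sketch follows; the include path is upstream fast_float's, and GDAL's vendored copy may expose the header differently.

// Illustrative usage of the public API shown in this hunk; not patch code.
#include "fast_float/fast_float.h"

#include <cstdio>
#include <system_error>

int main() {
  const char buf[] = "3.1416 leftover";
  double value = 0.0;
  // Dispatches through from_chars_caller<double>, i.e. straight to
  // from_chars_advanced() with parse_options_t<char>(chars_format::general).
  auto res = fast_float::from_chars(buf, buf + sizeof(buf) - 1, value);
  if (res.ec == std::errc()) {
    std::printf("value=%.4f, consumed %td chars\n", value, res.ptr - buf);
  }
  return 0;
}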
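Why the Clinger fast path reformatted above is exact: when the decimal mantissa fits in the target type's binary mantissa (below 2^53 for double) and 10^|e| is itself exactly representable (|e| <= 22 for double, since 5^22 < 2^53), a single multiply or divide is one correctly rounded operation. A small worked sketch under those assumptions, with a local table standing in for binary_format<double>::exact_power_of_ten():

// Sketch of the arithmetic behind the fast path; not patch code.
#include <cstdint>
#include <cstdio>

int main() {
  // "3.14159" parses to mantissa 314159 and decimal exponent -5.
  const uint64_t mantissa = 314159; // fits in double's 53-bit mantissa
  const int exponent = -5;          // |exponent| <= 22, so 1e5 is exact
  static const double exact_powers_of_ten[] = {
      1e0,  1e1,  1e2,  1e3,  1e4,  1e5,  1e6,  1e7,
      1e8,  1e9,  1e10, 1e11, 1e12, 1e13, 1e14, 1e15,
      1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22};
  double value = double(mantissa); // exact: 314159 < 2^53
  value = (exponent < 0) ? value / exact_powers_of_ten[-exponent]
                         : value * exact_powers_of_ten[exponent];
  std::printf("%.17g\n", value); // one correctly rounded division: 3.14159
  return 0;
}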
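The final hunk adds an integer overload of from_chars taking an explicit base; empty input and bases outside [2, 36] are rejected up front with std::errc::invalid_argument before parse_int_string runs. An illustrative sketch, again assuming the upstream include path:

// Illustrative only; exercises the new integer overload from this hunk.
#include "fast_float/fast_float.h"

#include <cstdio>
#include <system_error>

int main() {
  const char hex[] = "ff";
  int value = 0;
  auto ok = fast_float::from_chars(hex, hex + 2, value, 16);
  if (ok.ec == std::errc()) {
    std::printf("0x%s -> %d\n", hex, value); // 0xff -> 255
  }
  // base 37 is out of range: rejected before any parsing happens.
  auto bad = fast_float::from_chars(hex, hex + 2, value, 37);
  std::printf("bad base rejected: %s\n",
              bad.ec == std::errc::invalid_argument ? "yes" : "no");
  return 0;
}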