diff --git a/gcore/CMakeLists.txt b/gcore/CMakeLists.txt index f8258bbdec4c..44c798dd8ee2 100644 --- a/gcore/CMakeLists.txt +++ b/gcore/CMakeLists.txt @@ -97,7 +97,10 @@ if (NOT GDAL_AUTOLOAD_PLUGINS) PROPERTY COMPILE_DEFINITIONS GDAL_NO_AUTOLOAD) endif () -if (HAVE_SSSE3_AT_COMPILE_TIME) +if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS) + target_compile_definitions(gcore PRIVATE -DHAVE_SSSE3_AT_COMPILE_TIME -DUSE_NEON_OPTIMIZATIONS) + target_sources(gcore PRIVATE rasterio_ssse3.cpp) +elseif (HAVE_SSSE3_AT_COMPILE_TIME) target_compile_definitions(gcore PRIVATE -DHAVE_SSSE3_AT_COMPILE_TIME) target_sources(gcore PRIVATE rasterio_ssse3.cpp) set_property( diff --git a/gcore/gdal_priv_templates.hpp b/gcore/gdal_priv_templates.hpp index 80a988dcdf3f..7e7b02d34174 100644 --- a/gcore/gdal_priv_templates.hpp +++ b/gcore/gdal_priv_templates.hpp @@ -513,9 +513,14 @@ inline void GDALCopy8Words(const Tin *pValueIn, Tout *const pValueOut) } // Needs SSE2 -#if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) +#if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) || \ + defined(USE_NEON_OPTIMIZATIONS) +#ifdef USE_NEON_OPTIMIZATIONS +#include "sse2neon.h" +#else #include +#endif static inline void GDALCopyXMMToInt32(const __m128i xmm, void *pDest) { diff --git a/gcore/rasterio.cpp b/gcore/rasterio.cpp index b3c7f22d29fb..82d732480092 100644 --- a/gcore/rasterio.cpp +++ b/gcore/rasterio.cpp @@ -57,6 +57,18 @@ #include "memdataset.h" #include "vrtdataset.h" +#if defined(__x86_64) || defined(_M_X64) +#include +#define HAVE_SSE2 +#elif defined(USE_NEON_OPTIMIZATIONS) +#include "sse2neon.h" +#define HAVE_SSE2 +#endif + +#ifdef HAVE_SSSE3_AT_COMPILE_TIME +#include "rasterio_ssse3.h" +#endif + static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData, int nSrcPixelStride, GByte *CPL_RESTRICT pDstData, int nDstPixelStride, GPtrDiff_t nWordCount); @@ -2192,9 +2204,7 @@ static void inline GDALCopyWordsT_8atatime( } } -#if defined(__x86_64) || defined(_M_X64) - -#include +#ifdef HAVE_SSE2 template void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData, @@ -2605,7 +2615,7 @@ void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData, nDstPixelStride, nWordCount); } -#endif // defined(__x86_64) || defined(_M_X64) +#endif // HAVE_SSE2 template <> void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData, @@ -3043,13 +3053,7 @@ static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest, GDALUnrolledCopyGeneric(pDest, pSrc, nIters); } -#if (defined(__x86_64) || defined(_M_X64)) - -#ifdef HAVE_SSSE3_AT_COMPILE_TIME - -#include "rasterio_ssse3.h" - -#endif +#ifdef HAVE_SSE2 template <> void GDALUnrolledCopy(GByte *CPL_RESTRICT pDest, @@ -3150,7 +3154,7 @@ void GDALUnrolledCopy(GByte *CPL_RESTRICT pDest, pSrc += 4; } } -#endif // defined(__x86_64) || defined(_M_X64) +#endif // HAVE_SSE2 /************************************************************************/ /* GDALFastCopy() */ @@ -5256,13 +5260,7 @@ bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue, return false; } -#if defined(__x86_64) || defined(_M_X64) - -#include - -#ifdef HAVE_SSSE3_AT_COMPILE_TIME -#include "rasterio_ssse3.h" -#endif +#ifdef HAVE_SSE2 /************************************************************************/ /* GDALDeinterleave3Byte() */ @@ -5276,6 +5274,12 @@ GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest0, GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, size_t nIters) +#ifdef USE_NEON_OPTIMIZATIONS +{ + return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2, + nIters); +} +#else { #ifdef HAVE_SSSE3_AT_COMPILE_TIME if (CPLHaveRuntimeSSSE3()) @@ -5323,6 +5327,7 @@ GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc, pabyDest2[i] = pabySrc[3 * i + 2]; } } +#endif /************************************************************************/ /* GDALDeinterleave4Byte() */ @@ -5378,6 +5383,12 @@ static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc, GByte *CPL_RESTRICT pabyDest1, GByte *CPL_RESTRICT pabyDest2, GByte *CPL_RESTRICT pabyDest3, size_t nIters) +#ifdef USE_NEON_OPTIMIZATIONS +{ + return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2, + pabyDest3, nIters); +} +#else { #ifdef HAVE_SSSE3_AT_COMPILE_TIME if (CPLHaveRuntimeSSSE3()) @@ -5426,6 +5437,7 @@ static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc, pabyDest3[i] = pabySrc[4 * i + 3]; } } +#endif #else // GCC autovectorizer does an excellent job __attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte( @@ -5553,8 +5565,7 @@ void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT, } #if ((defined(__GNUC__) && !defined(__clang__)) || \ defined(__INTEL_CLANG_COMPILER)) && \ - (defined(__x86_64) || defined(_M_X64)) && \ - defined(HAVE_SSSE3_AT_COMPILE_TIME) + defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME) else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) && CPLHaveRuntimeSSSE3()) { diff --git a/gcore/rasterio_ssse3.cpp b/gcore/rasterio_ssse3.cpp index ebe6f456753a..bbc1191805c6 100644 --- a/gcore/rasterio_ssse3.cpp +++ b/gcore/rasterio_ssse3.cpp @@ -28,12 +28,18 @@ #include "cpl_port.h" -#if defined(HAVE_SSSE3_AT_COMPILE_TIME) && \ - (defined(__x86_64) || defined(_M_X64)) +#if (defined(HAVE_SSSE3_AT_COMPILE_TIME) && \ + (defined(__x86_64) || defined(_M_X64))) || \ + defined(USE_NEON_OPTIMIZATIONS) #include "rasterio_ssse3.h" +#ifdef USE_NEON_OPTIMIZATIONS +#include "sse2neon.h" +#else #include +#endif + #include "gdal_priv_templates.hpp" void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest, diff --git a/gcore/rasterio_ssse3.h b/gcore/rasterio_ssse3.h index 72fa8adf519c..21208bb4fec1 100644 --- a/gcore/rasterio_ssse3.h +++ b/gcore/rasterio_ssse3.h @@ -32,7 +32,7 @@ #include "cpl_port.h" #if defined(HAVE_SSSE3_AT_COMPILE_TIME) && \ - (defined(__x86_64) || defined(_M_X64)) + (defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS)) void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest, const GByte *CPL_RESTRICT pSrc, diff --git a/port/cpl_cpu_features.h b/port/cpl_cpu_features.h index 4e83411147c9..4196d2622463 100644 --- a/port/cpl_cpu_features.h +++ b/port/cpl_cpu_features.h @@ -46,7 +46,12 @@ bool CPLHaveRuntimeSSE(); #endif #endif -#ifdef HAVE_SSSE3_AT_COMPILE_TIME +#ifdef USE_NEON_OPTIMIZATIONS +static bool inline CPLHaveRuntimeSSSE3() +{ + return true; +} +#elif defined(HAVE_SSSE3_AT_COMPILE_TIME) #if __SSSE3__ #define HAVE_INLINE_SSSE3 static bool inline CPLHaveRuntimeSSSE3()