Skip to content

Commit

Permalink
Use sse2neon.h in gcore/rasterio functionality
Browse files Browse the repository at this point in the history
  • Loading branch information
rouault committed Mar 16, 2024
1 parent 75626d6 commit 30f3844
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 27 deletions.
5 changes: 4 additions & 1 deletion gcore/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,10 @@ if (NOT GDAL_AUTOLOAD_PLUGINS)
PROPERTY COMPILE_DEFINITIONS GDAL_NO_AUTOLOAD)
endif ()

if (HAVE_SSSE3_AT_COMPILE_TIME)
if (GDAL_ENABLE_ARM_NEON_OPTIMIZATIONS)
target_compile_definitions(gcore PRIVATE -DHAVE_SSSE3_AT_COMPILE_TIME -DUSE_NEON_OPTIMIZATIONS)
target_sources(gcore PRIVATE rasterio_ssse3.cpp)
elseif (HAVE_SSSE3_AT_COMPILE_TIME)
target_compile_definitions(gcore PRIVATE -DHAVE_SSSE3_AT_COMPILE_TIME)
target_sources(gcore PRIVATE rasterio_ssse3.cpp)
set_property(
Expand Down
7 changes: 6 additions & 1 deletion gcore/gdal_priv_templates.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -513,9 +513,14 @@ inline void GDALCopy8Words(const Tin *pValueIn, Tout *const pValueOut)
}

// Needs SSE2, or ARM NEON through the sse2neon.h translation layer
#if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2)
#if defined(__x86_64) || defined(_M_X64) || defined(USE_SSE2) || \
defined(USE_NEON_OPTIMIZATIONS)

#ifdef USE_NEON_OPTIMIZATIONS
#include "sse2neon.h"
#else
#include <emmintrin.h>
#endif

static inline void GDALCopyXMMToInt32(const __m128i xmm, void *pDest)
{
Expand Down
53 changes: 32 additions & 21 deletions gcore/rasterio.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,18 @@
#include "memdataset.h"
#include "vrtdataset.h"

#if defined(__x86_64) || defined(_M_X64)
#include <emmintrin.h>
#define HAVE_SSE2
#elif defined(USE_NEON_OPTIMIZATIONS)
#include "sse2neon.h"
#define HAVE_SSE2
#endif

#ifdef HAVE_SSSE3_AT_COMPILE_TIME
#include "rasterio_ssse3.h"
#endif

static void GDALFastCopyByte(const GByte *CPL_RESTRICT pSrcData,
int nSrcPixelStride, GByte *CPL_RESTRICT pDstData,
int nDstPixelStride, GPtrDiff_t nWordCount);
Expand Down Expand Up @@ -2192,9 +2204,7 @@ static void inline GDALCopyWordsT_8atatime(
}
}

#if defined(__x86_64) || defined(_M_X64)

#include <emmintrin.h>
#ifdef HAVE_SSE2

template <class Tout>
void GDALCopyWordsByteTo16Bit(const GByte *const CPL_RESTRICT pSrcData,
Expand Down Expand Up @@ -2605,7 +2615,7 @@ void GDALCopyWordsT(const double *const CPL_RESTRICT pSrcData,
nDstPixelStride, nWordCount);
}

#endif // defined(__x86_64) || defined(_M_X64)
#endif // HAVE_SSE2

template <>
void GDALCopyWordsT(const float *const CPL_RESTRICT pSrcData,
Expand Down Expand Up @@ -3043,13 +3053,7 @@ static inline void GDALUnrolledCopy(T *CPL_RESTRICT pDest,
GDALUnrolledCopyGeneric<T, srcStride, dstStride>(pDest, pSrc, nIters);
}

#if (defined(__x86_64) || defined(_M_X64))

#ifdef HAVE_SSSE3_AT_COMPILE_TIME

#include "rasterio_ssse3.h"

#endif
#ifdef HAVE_SSE2

template <>
void GDALUnrolledCopy<GByte, 2, 1>(GByte *CPL_RESTRICT pDest,
Expand Down Expand Up @@ -3150,7 +3154,7 @@ void GDALUnrolledCopy<GByte, 4, 1>(GByte *CPL_RESTRICT pDest,
pSrc += 4;
}
}
#endif // defined(__x86_64) || defined(_M_X64)
#endif // HAVE_SSE2

/************************************************************************/
/* GDALFastCopy() */
Expand Down Expand Up @@ -5256,13 +5260,7 @@ bool GDALBufferHasOnlyNoData(const void *pBuffer, double dfNoDataValue,
return false;
}

#if defined(__x86_64) || defined(_M_X64)

#include <emmintrin.h>

#ifdef HAVE_SSSE3_AT_COMPILE_TIME
#include "rasterio_ssse3.h"
#endif
#ifdef HAVE_SSE2

/************************************************************************/
/* GDALDeinterleave3Byte() */
Expand All @@ -5276,6 +5274,12 @@ GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
GByte *CPL_RESTRICT pabyDest0,
GByte *CPL_RESTRICT pabyDest1,
GByte *CPL_RESTRICT pabyDest2, size_t nIters)
#ifdef USE_NEON_OPTIMIZATIONS
{
return GDALDeinterleave3Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
nIters);
}
#else
{
#ifdef HAVE_SSSE3_AT_COMPILE_TIME
if (CPLHaveRuntimeSSSE3())
Expand Down Expand Up @@ -5323,6 +5327,7 @@ GDALDeinterleave3Byte(const GByte *CPL_RESTRICT pabySrc,
pabyDest2[i] = pabySrc[3 * i + 2];
}
}
#endif

/************************************************************************/
/* GDALDeinterleave4Byte() */
Expand Down Expand Up @@ -5378,6 +5383,12 @@ static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
GByte *CPL_RESTRICT pabyDest1,
GByte *CPL_RESTRICT pabyDest2,
GByte *CPL_RESTRICT pabyDest3, size_t nIters)
#ifdef USE_NEON_OPTIMIZATIONS
{
return GDALDeinterleave4Byte_SSSE3(pabySrc, pabyDest0, pabyDest1, pabyDest2,
pabyDest3, nIters);
}
#else
{
#ifdef HAVE_SSSE3_AT_COMPILE_TIME
if (CPLHaveRuntimeSSSE3())
Expand Down Expand Up @@ -5426,6 +5437,7 @@ static void GDALDeinterleave4Byte(const GByte *CPL_RESTRICT pabySrc,
pabyDest3[i] = pabySrc[4 * i + 3];
}
}
#endif
#else
// GCC autovectorizer does an excellent job
__attribute__((optimize("tree-vectorize"))) static void GDALDeinterleave4Byte(
Expand Down Expand Up @@ -5553,8 +5565,7 @@ void GDALDeinterleave(const void *pSourceBuffer, GDALDataType eSourceDT,
}
#if ((defined(__GNUC__) && !defined(__clang__)) || \
defined(__INTEL_CLANG_COMPILER)) && \
(defined(__x86_64) || defined(_M_X64)) && \
defined(HAVE_SSSE3_AT_COMPILE_TIME)
defined(HAVE_SSE2) && defined(HAVE_SSSE3_AT_COMPILE_TIME)
else if ((eSourceDT == GDT_Int16 || eSourceDT == GDT_UInt16) &&
CPLHaveRuntimeSSSE3())
{
Expand Down
10 changes: 8 additions & 2 deletions gcore/rasterio_ssse3.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,18 @@

#include "cpl_port.h"

#if defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
(defined(__x86_64) || defined(_M_X64))
#if (defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
(defined(__x86_64) || defined(_M_X64))) || \
defined(USE_NEON_OPTIMIZATIONS)

#include "rasterio_ssse3.h"

#ifdef USE_NEON_OPTIMIZATIONS
#include "sse2neon.h"
#else
#include <tmmintrin.h>
#endif

#include "gdal_priv_templates.hpp"

void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest,
Expand Down
2 changes: 1 addition & 1 deletion gcore/rasterio_ssse3.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
#include "cpl_port.h"

#if defined(HAVE_SSSE3_AT_COMPILE_TIME) && \
(defined(__x86_64) || defined(_M_X64))
(defined(__x86_64) || defined(_M_X64) || defined(USE_NEON_OPTIMIZATIONS))

void GDALUnrolledCopy_GByte_3_1_SSSE3(GByte *CPL_RESTRICT pDest,
const GByte *CPL_RESTRICT pSrc,
Expand Down
7 changes: 6 additions & 1 deletion port/cpl_cpu_features.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,12 @@ bool CPLHaveRuntimeSSE();
#endif
#endif

#ifdef HAVE_SSSE3_AT_COMPILE_TIME
#ifdef USE_NEON_OPTIMIZATIONS
// USE_NEON_OPTIMIZATIONS variant: the SSSE3 code paths are compiled against
// sse2neon.h in this configuration, so this check always reports the
// capability as available.
static inline bool CPLHaveRuntimeSSSE3()
{
    return true;
}
#elif defined(HAVE_SSSE3_AT_COMPILE_TIME)
#if __SSSE3__
#define HAVE_INLINE_SSSE3
static bool inline CPLHaveRuntimeSSSE3()
Expand Down

0 comments on commit 30f3844

Please sign in to comment.