From 375c499de565896f7ccb112e658d3e8fb71d37a7 Mon Sep 17 00:00:00 2001 From: tonyskyzeng <292224750@qq.com> Date: Fri, 20 Sep 2024 11:54:14 +0800 Subject: [PATCH 1/2] Make simd a default behavior --- CMakeLists.txt | 29 ++ cmake/macros.cmake | 46 ++- include/CANDY/LSHAPGIndex/basis.h | 33 +- include/CANDY/LSHAPGIndex/fastL2_ip.h | 459 ++------------------------ include/CANDY/LSHAPGIndex/space_l2.h | 169 ---------- include/Utils/MicroDataSet.hpp | 2 +- include/simd_config.h.in | 9 + src/CANDY/DPGIndex.cpp | 18 +- thirdparty/faiss/CMakeLists.txt | 65 +++- thirdparty/spdk | 1 + 10 files changed, 185 insertions(+), 646 deletions(-) create mode 100644 include/simd_config.h.in create mode 160000 thirdparty/spdk diff --git a/CMakeLists.txt b/CMakeLists.txt index ed6c79651..3a4fe755b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,8 @@ include(cmake/default.cmake) #gcc 10 g++10 # First compile faiss before anything else #set(CMAKE_CXX_FLAGS "-fno-openmp") +#test avx2 + add_subdirectory(thirdparty/faiss) # Set specific options for Faiss compilation @@ -29,6 +31,33 @@ set(CMAKE_CXX_FLAGS "-std=c++20 -Wall -Werror=return-type") set(CMAKE_CXX_FLAGS_DEBUG "-g -O0 -DNO_RACE_CHECK -DCANDY_DEBUG_MODE=1") set(CMAKE_CXX_FLAGS_RELEASE "-Wno-ignored-qualifiers -Wno-sign-compare -O3") set(PROJECT_BINARY_DIR_RAW ${PROJECT_BINARY_DIR}) + +# Valid values are "generic", "avx2", "avx512". + +detect_avx512_support(AVX512_AVAILABLE) +# Use AVX-512 based on the result +if(AVX512_AVAILABLE) + message(STATUS "AVX-512 support detected.") + set(CANDY_AVX512 1) + set(CANDY_AVX2 1) +else() + message(STATUS "AVX-512 support NOT detected.") + detect_avx2_support(AVX2_AVAILABLE) + if(AVX2_AVAILABLE) + message(STATUS "AVX-2 support detected.") + set(CANDY_AVX2 1) + else () + message(STATUS "AVX-2 support not detected.") + set(CANDY_AVX2 0) + set(CANDY_AVX512 0) + endif () +endif() +configure_file( + "${PROJECT_SOURCE_DIR}/include/simd_config.h.in" + "${PROJECT_BINARY_DIR}/include/simd_config.h" +) + + #set(CMAKE_CUDA_STANDARD 11) #set(CMAKE_CUDA_FLAGS "-std=c++11") option(ENABLE_OPENCL diff --git a/cmake/macros.cmake b/cmake/macros.cmake index b450e0b52..1831bebc7 100644 --- a/cmake/macros.cmake +++ b/cmake/macros.cmake @@ -54,4 +54,48 @@ endmacro() macro(get_headers HEADER_FILES) file(GLOB_RECURSE ${HEADER_FILES} "include/*.h" "include/*.hpp") -endmacro() \ No newline at end of file +endmacro() + +# Define the function to detect AVX-512 support +function(detect_avx512_support result_var) + include(CheckCXXSourceCompiles) + set(CMAKE_REQUIRED_FLAGS "-mavx512f") + check_cxx_source_compiles(" + #include + int main() { + __m512i vec = _mm512_set1_epi32(1); // AVX-512 intrinsic + return 0; + } +" HAVE_AVX512) + + if(HAVE_AVX512) + #message(STATUS "AVX-512 support detected.") + set(${result_var} 1 PARENT_SCOPE) + else() + # message(STATUS "AVX-512 support NOT detected.") + set(${result_var} 0 PARENT_SCOPE) + endif() +endfunction() + +function(detect_avx2_support result_var) + include(CheckCXXSourceCompiles) + # Save the current compiler flags to restore them later + set(saved_flags "${CMAKE_CXX_FLAGS}") + # Test AVX2 intrinsic support by compiling a minimal test program + check_cxx_source_compiles(" + #include + int main() { + __m256i vec = _mm256_set1_epi32(1); // AVX2 intrinsic + return 0; + } + " HAVE_AVX2) + + # Restore the original compiler flags + set(CMAKE_CXX_FLAGS "${saved_flags}" PARENT_SCOPE) + # Return TRUE or FALSE based on the test result + if(HAVE_AVX2) + set(${result_var} 1 PARENT_SCOPE) + else() + set(${result_var} 0 PARENT_SCOPE) + endif() +endfunction() \ No newline at end of file diff --git a/include/CANDY/LSHAPGIndex/basis.h b/include/CANDY/LSHAPGIndex/basis.h index 0bd1f33b6..720aa74f8 100644 --- a/include/CANDY/LSHAPGIndex/basis.h +++ b/include/CANDY/LSHAPGIndex/basis.h @@ -169,11 +169,9 @@ struct Res//the result of knns inline float cal_inner_product(float* v1, float* v2, int dim) { -#if (defined __AVX2__ && defined __USE__AVX2__ZX__) - return faiss::fvec_inner_product_avx512(v1, v2, dim); -#else - return calIp_fast(v1, v2, dim); -#endif + + return calIp_fast(v1, v2, dim); + } inline float cal_lengthSquare(float* v1, int dim) @@ -187,34 +185,13 @@ inline float cal_lengthSquare(float* v1, int dim) extern int _g_dist_mes; inline float cal_dist(float* v1, float* v2, int dim) { - if(_g_dist_mes==1) { - return 1.0-cal_inner_product(v1,v2,dim); - } -#ifdef USE_SQRDIST - #if (defined __AVX2__ && defined __USE__AVX2__ZX__) - return faiss::fvec_L2sqr_avx512(v1, v2, dim); - #else - return calL2Sqr_fast(v1, v2, dim); - #endif -#else - #if (defined __AVX2__ && defined __USE__AVX2__ZX__) - return sqrt(faiss::fvec_L2sqr_avx512(v1, v2, dim)); - #else - return sqrt(calL2Sqr_fast(v1, v2, dim)); - #endif -#endif + return calL2Sqr_fast(v1, v2, dim); } inline float cal_distSqrt(float* v1, float* v2, int dim) { -#if (defined __AVX2__ && defined __USE__AVX2__ZX__) - return sqrt(faiss::fvec_L2sqr_avx512(v1, v2, dim)); -#else - return sqrt(calL2Sqr_fast(v1, v2, dim)); -#endif - //return sqrt(calL2Sqr_fast(v1, v2, dim)); - + return calL2Sqr_fast(v1, v2, dim); } template diff --git a/include/CANDY/LSHAPGIndex/fastL2_ip.h b/include/CANDY/LSHAPGIndex/fastL2_ip.h index 26266bbd5..8d294a26f 100644 --- a/include/CANDY/LSHAPGIndex/fastL2_ip.h +++ b/include/CANDY/LSHAPGIndex/fastL2_ip.h @@ -1,455 +1,36 @@ -#pragma once - -//#define USE_FAST - - - -#ifdef USE_FAST - -#define __SSE__ -#define __AVX__ -#ifdef __SSE__ -#define USE_SSE -#ifdef __AVX__ -#define USE_AVX -#endif -#endif -#endif - -#if defined(USE_AVX) || defined(USE_SSE) -#ifdef _MSC_VER -#include -#include -#else -#include -#endif - -#if defined(__GNUC__) -#define PORTABLE_ALIGN32 __attribute__((aligned(32))) -#else -#define PORTABLE_ALIGN32 __declspec(align(32)) -#endif -#endif +#ifndef _FASTL2_IP_H +#define _FASTL2_IP_H +#include namespace fastlib { - static float - L2Sqr(float* pVect1, float* pVect2, size_t qty) { - float res = 0; - for (size_t i = 0; i < qty; i++) { - float t = *pVect1 - *pVect2; - pVect1++; - pVect2++; - res += t * t; - } - return (res); - } - - static float - InnerProduct(float* pVect1, float* pVect2, size_t qty) { - float res = 0; - for (size_t i = 0; i < qty; i++) { - res += *pVect1 * (*pVect2); - pVect1++; - pVect2++; - } - return (res); - } - -#if defined(USE_AVX) - - // Favor using AVX if available. - static float - L2SqrSIMD16Ext(float* pVect1, float* pVect2, size_t qty) { - float PORTABLE_ALIGN32 TmpRes[8]; - size_t qty16 = qty >> 4; - - const float* pEnd1 = pVect1 + (qty16 << 4); - - __m256 diff, v1, v2; - __m256 sum = _mm256_set1_ps(0); - - while (pVect1 < pEnd1) { - v1 = _mm256_loadu_ps(pVect1); - pVect1 += 8; - v2 = _mm256_loadu_ps(pVect2); - pVect2 += 8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - - v1 = _mm256_loadu_ps(pVect1); - pVect1 += 8; - v2 = _mm256_loadu_ps(pVect2); - pVect2 += 8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - } - - _mm256_store_ps(TmpRes, sum); - return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; - } - - static float - IpSIMD16Ext(float* pVect1, float* pVect2, size_t qty) { - float PORTABLE_ALIGN32 TmpRes[8]; - size_t qty16 = qty >> 4; - - const float* pEnd1 = pVect1 + (qty16 << 4); - - __m256 v1, v2; - __m256 sum = _mm256_set1_ps(0); - - while (pVect1 < pEnd1) { - v1 = _mm256_loadu_ps(pVect1); - pVect1 += 8; - v2 = _mm256_loadu_ps(pVect2); - pVect2 += 8; - //diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(v1, v2)); - - v1 = _mm256_loadu_ps(pVect1); - pVect1 += 8; - v2 = _mm256_loadu_ps(pVect2); - pVect2 += 8; - //diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(v1, v2)); - } - - _mm256_store_ps(TmpRes, sum); - return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; - } -#endif - -#if defined(USE_SSE) || defined(USE_AVX) - static float - L2SqrSIMD16ExtResiduals(float* pVect1v, float* pVect2v, size_t qty) { - //size_t qty = *((size_t*)qty_ptr); - size_t qty16 = qty >> 4 << 4; - float res = L2SqrSIMD16Ext(pVect1v, pVect2v, qty16); - float* pVect1 = (float*)pVect1v + qty16; - float* pVect2 = (float*)pVect2v + qty16; - - size_t qty_left = qty - qty16; - float res_tail = L2Sqr(pVect1, pVect2, qty_left); - return (res + res_tail); - } - - static float - IpSIMD16ExtResiduals(float* pVect1v, float* pVect2v, size_t qty) { - //size_t qty = *((size_t*)qty_ptr); - size_t qty16 = qty >> 4 << 4; - float res = IpSIMD16Ext(pVect1v, pVect2v, qty16); - float* pVect1 = (float*)pVect1v + qty16; - float* pVect2 = (float*)pVect2v + qty16; - - size_t qty_left = qty - qty16; - float res_tail = InnerProduct(pVect1, pVect2, qty_left); - return (res + res_tail); - } -#endif - - -#ifdef USE_SSE - static float - L2SqrSIMD4Ext(float* pVect1, float* pVect2, size_t qty) { - float PORTABLE_ALIGN32 TmpRes[8]; - - size_t qty4 = qty >> 2; - - const float* pEnd1 = pVect1 + (qty4 << 2); - - __m128 diff, v1, v2; - __m128 sum = _mm_set1_ps(0); - - while (pVect1 < pEnd1) { - v1 = _mm_loadu_ps(pVect1); - pVect1 += 4; - v2 = _mm_loadu_ps(pVect2); - pVect2 += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - } - _mm_store_ps(TmpRes, sum); - return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; - } - - static float - IpSIMD4Ext(float* pVect1, float* pVect2, size_t qty) { - float PORTABLE_ALIGN32 TmpRes[8]; - //float* pVect1 = (float*)pVect1v; - //float* pVect2 = (float*)pVect2v; - //size_t qty = *((size_t*)qty_ptr); - - - size_t qty4 = qty >> 2; - - const float* pEnd1 = pVect1 + (qty4 << 2); - - __m128 v1, v2; - __m128 sum = _mm_set1_ps(0); - - while (pVect1 < pEnd1) { - v1 = _mm_loadu_ps(pVect1); - pVect1 += 4; - v2 = _mm_loadu_ps(pVect2); - pVect2 += 4; - //diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(v1, v2)); - } - _mm_store_ps(TmpRes, sum); - return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; - } - - static float - L2SqrSIMD4ExtResiduals(float* pVect1v, float* pVect2v, size_t qty) { - //size_t qty = *((size_t*)qty_ptr); - size_t qty4 = qty >> 2 << 2; - - float res = L2SqrSIMD4Ext(pVect1v, pVect2v, qty4); - size_t qty_left = qty - qty4; - - float* pVect1 = (float*)pVect1v + qty4; - float* pVect2 = (float*)pVect2v + qty4; - float res_tail = L2Sqr(pVect1, pVect2, qty_left); - - return (res + res_tail); - } - - static float - IpSIMD4ExtResiduals(float* pVect1v, float* pVect2v, size_t qty) { - //size_t qty = *((size_t*)qty_ptr); - size_t qty4 = qty >> 2 << 2; - - float res = IpSIMD4Ext(pVect1v, pVect2v, qty4); - size_t qty_left = qty - qty4; - - float* pVect1 = (float*)pVect1v + qty4; - float* pVect2 = (float*)pVect2v + qty4; - float res_tail = InnerProduct(pVect1, pVect2, qty_left); - - return (res + res_tail); - } -#endif - } -inline float calL2Sqr_fast(float* v1, float* v2, int dim) -{ - - float res = 0.0; - for (int i = 0; i < dim; ++i) { - res += (v1[i] - v2[i]) * (v1[i] - v2[i]); - } - return res; +inline float calL2Sqr_fast(float *v1, float *v2, int dim) { + // Create tensors from the input arrays + auto t1 = torch::from_blob(v1, {dim}, torch::kFloat); + auto t2 = torch::from_blob(v2, {dim}, torch::kFloat); -} - -inline float calIp_fast(float* v1, float* v2, int dim) -{ -#if defined(USE_FAST) - if (dim % 16 == 0) - return fastlib::IpSIMD16Ext(v1, v2, dim); - else if (dim % 4 == 0) - return fastlib::IpSIMD4Ext(v1, v2, dim); - else if (dim > 16) - return fastlib::IpSIMD16ExtResiduals(v1, v2, dim); - else if (dim > 4) - return fastlib::IpSIMD4ExtResiduals(v1, v2, dim); -#else - float res = 0.0; - for (int i = 0; i < dim; ++i) { - res += v1[i] * v2[i]; - } - return res; -#endif + // Calculate the squared L2 distance as ||v1 - v2||^2 + auto diff = t1 - t2; + auto l2_sqr = torch::sum(diff * diff); + // Convert the result back to float and return + return l2_sqr.item(); } -namespace fastlib1 { - static float - L2Sqr(const void* pVect1v, const void* pVect2v, const void* qty_ptr) { - float* pVect1 = (float*)pVect1v; - float* pVect2 = (float*)pVect2v; - size_t qty = *((size_t*)qty_ptr); - - float res = 0; - for (size_t i = 0; i < qty; i++) { - float t = *pVect1 - *pVect2; - pVect1++; - pVect2++; - res += t * t; - } - return (res); - } - -#if defined(USE_AVX) - - // Favor using AVX if available. - static float - L2SqrSIMD16Ext(const void* pVect1v, const void* pVect2v, const void* qty_ptr) { - float* pVect1 = (float*)pVect1v; - float* pVect2 = (float*)pVect2v; - size_t qty = *((size_t*)qty_ptr); - float PORTABLE_ALIGN32 TmpRes[8]; - size_t qty16 = qty >> 4; - - const float* pEnd1 = pVect1 + (qty16 << 4); - - __m256 diff, v1, v2; - __m256 sum = _mm256_set1_ps(0); - - while (pVect1 < pEnd1) { - v1 = _mm256_loadu_ps(pVect1); - pVect1 += 8; - v2 = _mm256_loadu_ps(pVect2); - pVect2 += 8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - - v1 = _mm256_loadu_ps(pVect1); - pVect1 += 8; - v2 = _mm256_loadu_ps(pVect2); - pVect2 += 8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - } - - _mm256_store_ps(TmpRes, sum); - return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; - } - -#elif defined(USE_SSE) - - static float - L2SqrSIMD16Ext(const void* pVect1v, const void* pVect2v, const void* qty_ptr) { - float* pVect1 = (float*)pVect1v; - float* pVect2 = (float*)pVect2v; - size_t qty = *((size_t*)qty_ptr); - float PORTABLE_ALIGN32 TmpRes[8]; - size_t qty16 = qty >> 4; +inline float calIp_fast(float *v1, float *v2, int dim) { + auto t1 = torch::from_blob(v1, {dim}, torch::kFloat); + auto t2 = torch::from_blob(v2, {dim}, torch::kFloat); - const float* pEnd1 = pVect1 + (qty16 << 4); + // Calculate the inner (dot) product + auto inner_product = torch::dot(t1, t2); - __m128 diff, v1, v2; - __m128 sum = _mm_set1_ps(0); + // Convert the result back to float and return + return inner_product.item(); - while (pVect1 < pEnd1) { - //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0); - v1 = _mm_loadu_ps(pVect1); - pVect1 += 4; - v2 = _mm_loadu_ps(pVect2); - pVect2 += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(pVect1); - pVect1 += 4; - v2 = _mm_loadu_ps(pVect2); - pVect2 += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(pVect1); - pVect1 += 4; - v2 = _mm_loadu_ps(pVect2); - pVect2 += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(pVect1); - pVect1 += 4; - v2 = _mm_loadu_ps(pVect2); - pVect2 += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - } - - _mm_store_ps(TmpRes, sum); - return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; - } -#endif - -#if defined(USE_SSE) || defined(USE_AVX) - static float - L2SqrSIMD16ExtResiduals(const void* pVect1v, const void* pVect2v, const void* qty_ptr) { - size_t qty = *((size_t*)qty_ptr); - size_t qty16 = qty >> 4 << 4; - float res = L2SqrSIMD16Ext(pVect1v, pVect2v, &qty16); - float* pVect1 = (float*)pVect1v + qty16; - float* pVect2 = (float*)pVect2v + qty16; - - size_t qty_left = qty - qty16; - float res_tail = L2Sqr(pVect1, pVect2, &qty_left); - return (res + res_tail); - } -#endif - - -#ifdef USE_SSE - static float - L2SqrSIMD4Ext(const void* pVect1v, const void* pVect2v, const void* qty_ptr) { - float PORTABLE_ALIGN32 TmpRes[8]; - float* pVect1 = (float*)pVect1v; - float* pVect2 = (float*)pVect2v; - size_t qty = *((size_t*)qty_ptr); - - - size_t qty4 = qty >> 2; - - const float* pEnd1 = pVect1 + (qty4 << 2); - - __m128 diff, v1, v2; - __m128 sum = _mm_set1_ps(0); - - while (pVect1 < pEnd1) { - v1 = _mm_loadu_ps(pVect1); - pVect1 += 4; - v2 = _mm_loadu_ps(pVect2); - pVect2 += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - } - _mm_store_ps(TmpRes, sum); - return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; - } - - static float - L2SqrSIMD4ExtResiduals(const void* pVect1v, const void* pVect2v, const void* qty_ptr) { - size_t qty = *((size_t*)qty_ptr); - size_t qty4 = qty >> 2 << 2; - - float res = L2SqrSIMD4Ext(pVect1v, pVect2v, &qty4); - size_t qty_left = qty - qty4; - - float* pVect1 = (float*)pVect1v + qty4; - float* pVect2 = (float*)pVect2v + qty4; - float res_tail = L2Sqr(pVect1, pVect2, &qty_left); - - return (res + res_tail); - } -#endif } -inline float calL2Sqr_fast1(float* v1, float* v2, size_t dim) -{ -#if defined(USE_FAST) - if (dim % 16 == 0) - return fastlib1::L2SqrSIMD16Ext(v1, v2, &dim); - else if (dim % 4 == 0) - return fastlib1::L2SqrSIMD4Ext(v1, v2, &dim); - else if (dim > 16) - return fastlib1::L2SqrSIMD16ExtResiduals(v1, v2, &dim); - else if (dim > 4) - return fastlib1::L2SqrSIMD4ExtResiduals(v1, v2, &dim); -#else - float res = 0.0; - for (int i = 0; i < dim; ++i) { - res += (v1[i] - v2[i]) * (v1[i] - v2[i]); - } - return res; -#endif - - -} \ No newline at end of file +#endif \ No newline at end of file diff --git a/include/CANDY/LSHAPGIndex/space_l2.h b/include/CANDY/LSHAPGIndex/space_l2.h index e9d92b273..8378a8635 100644 --- a/include/CANDY/LSHAPGIndex/space_l2.h +++ b/include/CANDY/LSHAPGIndex/space_l2.h @@ -5,172 +5,3 @@ extern int _G_COST; template using DISTFUNC = MTYPE(*)(const void*, const void*, const void*); - -static float - L2Sqr(const void* pVect1v, const void* pVect2v, const void* qty_ptr) { - ++_G_COST; - float* pVect1 = (float*)pVect1v; - float* pVect2 = (float*)pVect2v; - size_t qty = *((size_t*)qty_ptr); - - float res = 0; - for (size_t i = 0; i < qty; i++) { - float t = *pVect1 - *pVect2; - pVect1++; - pVect2++; - res += t * t; - } - return (res); -} - -#if defined(USE_AVX) - -// Favor using AVX if available. -static float - L2SqrSIMD16Ext(const void* pVect1v, const void* pVect2v, const void* qty_ptr) { - ++_G_COST; - float* pVect1 = (float*)pVect1v; - float* pVect2 = (float*)pVect2v; - size_t qty = *((size_t*)qty_ptr); - float PORTABLE_ALIGN32 TmpRes[8]; - size_t qty16 = qty >> 4; - - const float* pEnd1 = pVect1 + (qty16 << 4); - - __m256 diff, v1, v2; - __m256 sum = _mm256_set1_ps(0); - - while (pVect1 < pEnd1) { - v1 = _mm256_loadu_ps(pVect1); - pVect1 += 8; - v2 = _mm256_loadu_ps(pVect2); - pVect2 += 8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - - v1 = _mm256_loadu_ps(pVect1); - pVect1 += 8; - v2 = _mm256_loadu_ps(pVect2); - pVect2 += 8; - diff = _mm256_sub_ps(v1, v2); - sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff)); - } - - _mm256_store_ps(TmpRes, sum); - return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7]; -} - -#elif defined(USE_SSE) - -static float - L2SqrSIMD16Ext(const void* pVect1v, const void* pVect2v, const void* qty_ptr) { - ++_G_COST; - float* pVect1 = (float*)pVect1v; - float* pVect2 = (float*)pVect2v; - size_t qty = *((size_t*)qty_ptr); - float PORTABLE_ALIGN32 TmpRes[8]; - size_t qty16 = qty >> 4; - - const float* pEnd1 = pVect1 + (qty16 << 4); - - __m128 diff, v1, v2; - __m128 sum = _mm_set1_ps(0); - - while (pVect1 < pEnd1) { - //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0); - v1 = _mm_loadu_ps(pVect1); - pVect1 += 4; - v2 = _mm_loadu_ps(pVect2); - pVect2 += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(pVect1); - pVect1 += 4; - v2 = _mm_loadu_ps(pVect2); - pVect2 += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(pVect1); - pVect1 += 4; - v2 = _mm_loadu_ps(pVect2); - pVect2 += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - - v1 = _mm_loadu_ps(pVect1); - pVect1 += 4; - v2 = _mm_loadu_ps(pVect2); - pVect2 += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - } - - _mm_store_ps(TmpRes, sum); - return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; -} -#endif - -#if defined(USE_SSE) || defined(USE_AVX) -static float - L2SqrSIMD16ExtResiduals(const void* pVect1v, const void* pVect2v, const void* qty_ptr) { - ++_G_COST; - size_t qty = *((size_t*)qty_ptr); - size_t qty16 = qty >> 4 << 4; - float res = L2SqrSIMD16Ext(pVect1v, pVect2v, &qty16); - float* pVect1 = (float*)pVect1v + qty16; - float* pVect2 = (float*)pVect2v + qty16; - - size_t qty_left = qty - qty16; - float res_tail = L2Sqr(pVect1, pVect2, &qty_left); - return (res + res_tail); -} -#endif - - -#ifdef USE_SSE -static float - L2SqrSIMD4Ext(const void* pVect1v, const void* pVect2v, const void* qty_ptr) { - ++_G_COST; - float PORTABLE_ALIGN32 TmpRes[8]; - float* pVect1 = (float*)pVect1v; - float* pVect2 = (float*)pVect2v; - size_t qty = *((size_t*)qty_ptr); - - - size_t qty4 = qty >> 2; - - const float* pEnd1 = pVect1 + (qty4 << 2); - - __m128 diff, v1, v2; - __m128 sum = _mm_set1_ps(0); - - while (pVect1 < pEnd1) { - v1 = _mm_loadu_ps(pVect1); - pVect1 += 4; - v2 = _mm_loadu_ps(pVect2); - pVect2 += 4; - diff = _mm_sub_ps(v1, v2); - sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff)); - } - _mm_store_ps(TmpRes, sum); - return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; -} - -static float - L2SqrSIMD4ExtResiduals(const void* pVect1v, const void* pVect2v, const void* qty_ptr) { - ++_G_COST; - size_t qty = *((size_t*)qty_ptr); - size_t qty4 = qty >> 2 << 2; - - float res = L2SqrSIMD4Ext(pVect1v, pVect2v, &qty4); - size_t qty_left = qty - qty4; - - float* pVect1 = (float*)pVect1v + qty4; - float* pVect2 = (float*)pVect2v + qty4; - float res_tail = L2Sqr(pVect1, pVect2, &qty_left); - - return (res + res_tail); -} -#endif diff --git a/include/Utils/MicroDataSet.hpp b/include/Utils/MicroDataSet.hpp index 1af3ca8ac..afaedcfab 100644 --- a/include/Utils/MicroDataSet.hpp +++ b/include/Utils/MicroDataSet.hpp @@ -15,7 +15,7 @@ #include #include #include - +#include using namespace std; namespace INTELLI { /** diff --git a/include/simd_config.h.in b/include/simd_config.h.in new file mode 100644 index 000000000..581061061 --- /dev/null +++ b/include/simd_config.h.in @@ -0,0 +1,9 @@ +// +// Created by tony on 04/06/22. +// + +#ifndef CANDY_SIMD_CONFIG_H_IN_H_ +#define CANDY_SIMD_CONFIG_H_IN_H_ +#define CANDY_AVX2 @CANDY_AVX2@ +#define CANDY_AVX512 @CANDY_AVX512@ +#endif diff --git a/src/CANDY/DPGIndex.cpp b/src/CANDY/DPGIndex.cpp index 67e3a48ad..83f1c59d3 100644 --- a/src/CANDY/DPGIndex.cpp +++ b/src/CANDY/DPGIndex.cpp @@ -153,16 +153,20 @@ void DPGIndex::removeLayer1Neighbor(size_t i, size_t j) { } double DPGIndex::calcDist(const torch::Tensor &ta, const torch::Tensor &tb) { - auto taPtr = ta.contiguous().data_ptr(), - tbPtr = tb.contiguous().data_ptr(); + double ans = 0; if (faissMetric == faiss::METRIC_L2) { - for (size_t i = 0; i < vecDim; ++i) { - auto diff = taPtr[i] - tbPtr[i]; - ans += diff * diff; - } + // Calculate the squared L2 distance as ||v1 - v2||^2 + auto diff = ta-tb; + auto l2_sqr = torch::sum(diff * diff); + // Convert the result back to float and return + ans = l2_sqr.item(); } else { - for (size_t i = 0; i < vecDim; ++i) ans -= taPtr[i] * tbPtr[i]; + //for (size_t i = 0; i < vecDim; ++i) ans -= taPtr[i] * tbPtr[i]; + // Calculate the inner (dot) product + auto inner_product = torch::dot(ta, tb); + ans = inner_product.item(); + // Convert the result back to float and return } return ans; } diff --git a/thirdparty/faiss/CMakeLists.txt b/thirdparty/faiss/CMakeLists.txt index 40c6fb2e3..dc0c09bdd 100644 --- a/thirdparty/faiss/CMakeLists.txt +++ b/thirdparty/faiss/CMakeLists.txt @@ -51,7 +51,70 @@ set(CMAKE_CXX_STANDARD 17) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") # Valid values are "generic", "avx2", "avx512". -option(FAISS_OPT_LEVEL "" "generic") + +# Define the function to detect AVX-512 support +function(detect_avx512_support result_var) + include(CheckCXXSourceCompiles) + set(CMAKE_REQUIRED_FLAGS "-mavx512f") + check_cxx_source_compiles(" + #include + int main() { + __m512i vec = _mm512_set1_epi32(1); // AVX-512 intrinsic + return 0; + } +" HAVE_AVX512) + + if(HAVE_AVX512) + #message(STATUS "AVX-512 support detected.") + set(${result_var} 1 PARENT_SCOPE) + else() + # message(STATUS "AVX-512 support NOT detected.") + set(${result_var} 0 PARENT_SCOPE) + endif() +endfunction() + +function(detect_avx2_support result_var) + include(CheckCXXSourceCompiles) + # Save the current compiler flags to restore them later + set(saved_flags "${CMAKE_CXX_FLAGS}") + # Test AVX2 intrinsic support by compiling a minimal test program + check_cxx_source_compiles(" + #include + int main() { + __m256i vec = _mm256_set1_epi32(1); // AVX2 intrinsic + return 0; + } + " HAVE_AVX2) + + # Restore the original compiler flags + set(CMAKE_CXX_FLAGS "${saved_flags}" PARENT_SCOPE) + # Return TRUE or FALSE based on the test result + if(HAVE_AVX2) + set(${result_var} 1 PARENT_SCOPE) + else() + set(${result_var} 0 PARENT_SCOPE) + endif() +endfunction() + +detect_avx512_support(AVX512_AVAILABLE) +# Use AVX-512 based on the result +if(AVX512_AVAILABLE) + message(STATUS "AVX-512 support detected.") + option(FAISS_OPT_LEVEL "" "avx512") +else() + message(STATUS "AVX-512 support NOT detected.") + detect_avx2_support(AVX2_AVAILABLE) + if(AVX2_AVAILABLE) + message(STATUS "AVX-2 support detected.") + option(FAISS_OPT_LEVEL "" "avx2") + else () + message(STATUS "AVX-2 support not detected.") + option(FAISS_OPT_LEVEL "" "generic") + endif () +endif() + + + option(FAISS_ENABLE_GPU "Enable support for GPU indexes." OFF) option(FAISS_ENABLE_RAFT "Enable RAFT for GPU indexes." OFF) option(FAISS_ENABLE_PYTHON "Build Python extension." OFF) diff --git a/thirdparty/spdk b/thirdparty/spdk new file mode 160000 index 000000000..94a53a53b --- /dev/null +++ b/thirdparty/spdk @@ -0,0 +1 @@ +Subproject commit 94a53a53bc28084c32aa697203735d6959a042ec From faae46804459e3fd53f61a88ef40201907af1a27 Mon Sep 17 00:00:00 2001 From: tonyskyzeng <292224750@qq.com> Date: Fri, 20 Sep 2024 12:35:08 +0800 Subject: [PATCH 2/2] Make simd a default behavior --- src/CANDY/DPGIndex.cpp | 2 +- src/CANDY/NNDescentIndex.cpp | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/CANDY/DPGIndex.cpp b/src/CANDY/DPGIndex.cpp index 83f1c59d3..f8ae22f2d 100644 --- a/src/CANDY/DPGIndex.cpp +++ b/src/CANDY/DPGIndex.cpp @@ -165,7 +165,7 @@ double DPGIndex::calcDist(const torch::Tensor &ta, const torch::Tensor &tb) { //for (size_t i = 0; i < vecDim; ++i) ans -= taPtr[i] * tbPtr[i]; // Calculate the inner (dot) product auto inner_product = torch::dot(ta, tb); - ans = inner_product.item(); + ans = -inner_product.item(); // Convert the result back to float and return } return ans; diff --git a/src/CANDY/NNDescentIndex.cpp b/src/CANDY/NNDescentIndex.cpp index 3e14da937..a89eed526 100644 --- a/src/CANDY/NNDescentIndex.cpp +++ b/src/CANDY/NNDescentIndex.cpp @@ -128,16 +128,19 @@ bool NNDescentIndex::updateNN(size_t i, size_t j, double dist) { double NNDescentIndex::calcDist(const torch::Tensor &ta, const torch::Tensor &tb) { - auto taPtr = ta.contiguous().data_ptr(), - tbPtr = tb.contiguous().data_ptr(); double ans = 0; if (faissMetric == faiss::METRIC_L2) { - for (size_t i = 0; i < vecDim; ++i) { - auto diff = taPtr[i] - tbPtr[i]; - ans += diff * diff; - } + // Calculate the squared L2 distance as ||v1 - v2||^2 + auto diff = ta-tb; + auto l2_sqr = torch::sum(diff * diff); + // Convert the result back to float and return + ans = l2_sqr.item(); } else { - for (size_t i = 0; i < vecDim; ++i) ans -= taPtr[i] * tbPtr[i]; + //for (size_t i = 0; i < vecDim; ++i) ans -= taPtr[i] * tbPtr[i]; + // Calculate the inner (dot) product + auto inner_product = torch::dot(ta, tb); + ans = -inner_product.item(); + // Convert the result back to float and return } return ans; }