From 375c499de565896f7ccb112e658d3e8fb71d37a7 Mon Sep 17 00:00:00 2001
From: tonyskyzeng <292224750@qq.com>
Date: Fri, 20 Sep 2024 11:54:14 +0800
Subject: [PATCH 1/2] Make simd a default behavior

---
 CMakeLists.txt                        |  29 ++
 cmake/macros.cmake                    |  46 ++-
 include/CANDY/LSHAPGIndex/basis.h     |  33 +-
 include/CANDY/LSHAPGIndex/fastL2_ip.h | 459 ++------------------------
 include/CANDY/LSHAPGIndex/space_l2.h  | 169 ----------
 include/Utils/MicroDataSet.hpp        |   2 +-
 include/simd_config.h.in              |   9 +
 src/CANDY/DPGIndex.cpp                |  18 +-
 thirdparty/faiss/CMakeLists.txt       |  65 +++-
 thirdparty/spdk                       |   1 +
 10 files changed, 185 insertions(+), 646 deletions(-)
 create mode 100644 include/simd_config.h.in
 create mode 160000 thirdparty/spdk

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ed6c79651..3a4fe755b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,6 +15,8 @@ include(cmake/default.cmake)
 #gcc 10 g++10
 # First compile faiss before anything else
 #set(CMAKE_CXX_FLAGS "-fno-openmp")
+#test avx2
+
 add_subdirectory(thirdparty/faiss)
 
 # Set specific options for Faiss compilation
@@ -29,6 +31,33 @@ set(CMAKE_CXX_FLAGS "-std=c++20 -Wall -Werror=return-type")
 set(CMAKE_CXX_FLAGS_DEBUG "-g -O0 -DNO_RACE_CHECK -DCANDY_DEBUG_MODE=1")
 set(CMAKE_CXX_FLAGS_RELEASE "-Wno-ignored-qualifiers -Wno-sign-compare -O3")
 set(PROJECT_BINARY_DIR_RAW ${PROJECT_BINARY_DIR})
+
+# Valid values are "generic", "avx2", "avx512".
+
+detect_avx512_support(AVX512_AVAILABLE)
+# Use AVX-512 based on the result
+if(AVX512_AVAILABLE)
+    message(STATUS "AVX-512 support detected.")
+    set(CANDY_AVX512 1)
+    set(CANDY_AVX2 1)
+else()
+    message(STATUS "AVX-512 support NOT detected.")
+    detect_avx2_support(AVX2_AVAILABLE)
+    if(AVX2_AVAILABLE)
+        message(STATUS "AVX-2 support detected.")
+        set(CANDY_AVX2 1)
+    else ()
+        message(STATUS "AVX-2 support not detected.")
+        set(CANDY_AVX2 0)
+        set(CANDY_AVX512 0)
+    endif ()
+endif()
+configure_file(
+        "${PROJECT_SOURCE_DIR}/include/simd_config.h.in"
+        "${PROJECT_BINARY_DIR}/include/simd_config.h"
+)
+
+
 #set(CMAKE_CUDA_STANDARD 11)
 #set(CMAKE_CUDA_FLAGS "-std=c++11")
 option(ENABLE_OPENCL
diff --git a/cmake/macros.cmake b/cmake/macros.cmake
index b450e0b52..1831bebc7 100644
--- a/cmake/macros.cmake
+++ b/cmake/macros.cmake
@@ -54,4 +54,48 @@ endmacro()
 
 macro(get_headers HEADER_FILES)
     file(GLOB_RECURSE ${HEADER_FILES} "include/*.h" "include/*.hpp")
-endmacro()
\ No newline at end of file
+endmacro()
+
+# Define the function to detect AVX-512 support
+function(detect_avx512_support result_var)
+    include(CheckCXXSourceCompiles)
+    set(CMAKE_REQUIRED_FLAGS "-mavx512f")
+    check_cxx_source_compiles("
+    #include <immintrin.h>
+    int main() {
+        __m512i vec = _mm512_set1_epi32(1);  // AVX-512 intrinsic
+        return 0;
+    }
+" HAVE_AVX512)
+
+    if(HAVE_AVX512)
+        #message(STATUS "AVX-512 support detected.")
+        set(${result_var} 1 PARENT_SCOPE)
+    else()
+        # message(STATUS "AVX-512 support NOT detected.")
+        set(${result_var} 0 PARENT_SCOPE)
+    endif()
+endfunction()
+
+function(detect_avx2_support result_var)
+    include(CheckCXXSourceCompiles)
+    # Save the current compiler flags to restore them later
+    set(saved_flags "${CMAKE_CXX_FLAGS}")
+    # Test AVX2 intrinsic support by compiling a minimal test program
+    check_cxx_source_compiles("
+        #include <immintrin.h>
+        int main() {
+            __m256i vec = _mm256_set1_epi32(1);  // AVX2 intrinsic
+            return 0;
+        }
+    " HAVE_AVX2)
+
+    # Restore the original compiler flags
+    set(CMAKE_CXX_FLAGS "${saved_flags}" PARENT_SCOPE)
+    # Return TRUE or FALSE based on the test result
+    if(HAVE_AVX2)
+        set(${result_var} 1 PARENT_SCOPE)
+    else()
+        set(${result_var} 0 PARENT_SCOPE)
+    endif()
+endfunction()
\ No newline at end of file
diff --git a/include/CANDY/LSHAPGIndex/basis.h b/include/CANDY/LSHAPGIndex/basis.h
index 0bd1f33b6..720aa74f8 100644
--- a/include/CANDY/LSHAPGIndex/basis.h
+++ b/include/CANDY/LSHAPGIndex/basis.h
@@ -169,11 +169,9 @@ struct Res//the result of knns
 
 inline float cal_inner_product(float* v1, float* v2, int dim)
 {
-#if (defined __AVX2__ && defined __USE__AVX2__ZX__)
-	return faiss::fvec_inner_product_avx512(v1, v2, dim);
-#else
-	return calIp_fast(v1, v2, dim);
-#endif
+
+  return calIp_fast(v1, v2, dim);
+
 }
 
 inline float cal_lengthSquare(float* v1, int dim)
@@ -187,34 +185,13 @@ inline float cal_lengthSquare(float* v1, int dim)
 extern int _g_dist_mes;
 inline float cal_dist(float* v1, float* v2, int dim)
 {
-  if(_g_dist_mes==1) {
-    return 1.0-cal_inner_product(v1,v2,dim);
-  }
-#ifdef USE_SQRDIST
-	#if (defined __AVX2__ && defined __USE__AVX2__ZX__)
-		return faiss::fvec_L2sqr_avx512(v1, v2, dim);
-	#else
-		return calL2Sqr_fast(v1, v2, dim);
-	#endif
-#else
-	#if (defined __AVX2__ && defined __USE__AVX2__ZX__)
-		return sqrt(faiss::fvec_L2sqr_avx512(v1, v2, dim));
-	#else
-		return sqrt(calL2Sqr_fast(v1, v2, dim));
-	#endif
-#endif
+  return calL2Sqr_fast(v1, v2, dim);
 	
 }
 
 inline float cal_distSqrt(float* v1, float* v2, int dim)
 {
-#if (defined __AVX2__ && defined __USE__AVX2__ZX__)
-	return sqrt(faiss::fvec_L2sqr_avx512(v1, v2, dim));
-#else
-	return sqrt(calL2Sqr_fast(v1, v2, dim));
-#endif
-	//return sqrt(calL2Sqr_fast(v1, v2, dim));
-	
+  return calL2Sqr_fast(v1, v2, dim);
 }
 
 template <class T>
diff --git a/include/CANDY/LSHAPGIndex/fastL2_ip.h b/include/CANDY/LSHAPGIndex/fastL2_ip.h
index 26266bbd5..8d294a26f 100644
--- a/include/CANDY/LSHAPGIndex/fastL2_ip.h
+++ b/include/CANDY/LSHAPGIndex/fastL2_ip.h
@@ -1,455 +1,36 @@
-#pragma once
-
-//#define USE_FAST
-
-
-
-#ifdef USE_FAST
-
-#define __SSE__
-#define __AVX__
-#ifdef __SSE__
-#define USE_SSE
-#ifdef __AVX__
-#define USE_AVX
-#endif
-#endif
-#endif
-
-#if defined(USE_AVX) || defined(USE_SSE)
-#ifdef _MSC_VER
-#include <intrin.h>
-#include <stdexcept>
-#else
-#include <x86intrin.h>
-#endif
-
-#if defined(__GNUC__)
-#define PORTABLE_ALIGN32 __attribute__((aligned(32)))
-#else
-#define PORTABLE_ALIGN32 __declspec(align(32))
-#endif
-#endif
+#ifndef _FASTL2_IP_H
+#define _FASTL2_IP_H
+#include <torch/torch.h>
 
 namespace fastlib {
 
-    static float
-        L2Sqr(float* pVect1, float* pVect2, size_t qty) {
-        float res = 0;
-        for (size_t i = 0; i < qty; i++) {
-            float t = *pVect1 - *pVect2;
-            pVect1++;
-            pVect2++;
-            res += t * t;
-        }
-        return (res);
-    }
-
-    static float
-        InnerProduct(float* pVect1, float* pVect2, size_t qty) {
-        float res = 0;
-        for (size_t i = 0; i < qty; i++) {
-            res += *pVect1 * (*pVect2);
-            pVect1++;
-            pVect2++;
-        }
-        return (res);
-    }
-
-#if defined(USE_AVX)
-
-    // Favor using AVX if available.
-    static float
-        L2SqrSIMD16Ext(float* pVect1, float* pVect2, size_t qty) {
-        float PORTABLE_ALIGN32 TmpRes[8];
-        size_t qty16 = qty >> 4;
-
-        const float* pEnd1 = pVect1 + (qty16 << 4);
-
-        __m256 diff, v1, v2;
-        __m256 sum = _mm256_set1_ps(0);
-
-        while (pVect1 < pEnd1) {
-            v1 = _mm256_loadu_ps(pVect1);
-            pVect1 += 8;
-            v2 = _mm256_loadu_ps(pVect2);
-            pVect2 += 8;
-            diff = _mm256_sub_ps(v1, v2);
-            sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
-
-            v1 = _mm256_loadu_ps(pVect1);
-            pVect1 += 8;
-            v2 = _mm256_loadu_ps(pVect2);
-            pVect2 += 8;
-            diff = _mm256_sub_ps(v1, v2);
-            sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
-        }
-
-        _mm256_store_ps(TmpRes, sum);
-        return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
-    }
-
-    static float
-        IpSIMD16Ext(float* pVect1, float* pVect2, size_t qty) {
-        float PORTABLE_ALIGN32 TmpRes[8];
-        size_t qty16 = qty >> 4;
-
-        const float* pEnd1 = pVect1 + (qty16 << 4);
-
-        __m256 v1, v2;
-        __m256 sum = _mm256_set1_ps(0);
-
-        while (pVect1 < pEnd1) {
-            v1 = _mm256_loadu_ps(pVect1);
-            pVect1 += 8;
-            v2 = _mm256_loadu_ps(pVect2);
-            pVect2 += 8;
-            //diff = _mm256_sub_ps(v1, v2);
-            sum = _mm256_add_ps(sum, _mm256_mul_ps(v1, v2));
-
-            v1 = _mm256_loadu_ps(pVect1);
-            pVect1 += 8;
-            v2 = _mm256_loadu_ps(pVect2);
-            pVect2 += 8;
-            //diff = _mm256_sub_ps(v1, v2);
-            sum = _mm256_add_ps(sum, _mm256_mul_ps(v1, v2));
-        }
-
-        _mm256_store_ps(TmpRes, sum);
-        return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
-    }
-#endif
-
-#if defined(USE_SSE) || defined(USE_AVX)
-    static float
-        L2SqrSIMD16ExtResiduals(float* pVect1v, float* pVect2v, size_t qty) {
-        //size_t qty = *((size_t*)qty_ptr);
-        size_t qty16 = qty >> 4 << 4;
-        float res = L2SqrSIMD16Ext(pVect1v, pVect2v, qty16);
-        float* pVect1 = (float*)pVect1v + qty16;
-        float* pVect2 = (float*)pVect2v + qty16;
-
-        size_t qty_left = qty - qty16;
-        float res_tail = L2Sqr(pVect1, pVect2, qty_left);
-        return (res + res_tail);
-    }
-
-    static float
-        IpSIMD16ExtResiduals(float* pVect1v, float* pVect2v, size_t qty) {
-        //size_t qty = *((size_t*)qty_ptr);
-        size_t qty16 = qty >> 4 << 4;
-        float res = IpSIMD16Ext(pVect1v, pVect2v, qty16);
-        float* pVect1 = (float*)pVect1v + qty16;
-        float* pVect2 = (float*)pVect2v + qty16;
-
-        size_t qty_left = qty - qty16;
-        float res_tail = InnerProduct(pVect1, pVect2, qty_left);
-        return (res + res_tail);
-    }
-#endif
-
-
-#ifdef USE_SSE
-    static float
-        L2SqrSIMD4Ext(float* pVect1, float* pVect2, size_t qty) {
-        float PORTABLE_ALIGN32 TmpRes[8];
-
-        size_t qty4 = qty >> 2;
-
-        const float* pEnd1 = pVect1 + (qty4 << 2);
-
-        __m128 diff, v1, v2;
-        __m128 sum = _mm_set1_ps(0);
-
-        while (pVect1 < pEnd1) {
-            v1 = _mm_loadu_ps(pVect1);
-            pVect1 += 4;
-            v2 = _mm_loadu_ps(pVect2);
-            pVect2 += 4;
-            diff = _mm_sub_ps(v1, v2);
-            sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
-        }
-        _mm_store_ps(TmpRes, sum);
-        return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
-    }
-
-    static float
-        IpSIMD4Ext(float* pVect1, float* pVect2, size_t qty) {
-        float PORTABLE_ALIGN32 TmpRes[8];
-        //float* pVect1 = (float*)pVect1v;
-        //float* pVect2 = (float*)pVect2v;
-        //size_t qty = *((size_t*)qty_ptr);
-
-
-        size_t qty4 = qty >> 2;
-
-        const float* pEnd1 = pVect1 + (qty4 << 2);
-
-        __m128 v1, v2;
-        __m128 sum = _mm_set1_ps(0);
-
-        while (pVect1 < pEnd1) {
-            v1 = _mm_loadu_ps(pVect1);
-            pVect1 += 4;
-            v2 = _mm_loadu_ps(pVect2);
-            pVect2 += 4;
-            //diff = _mm_sub_ps(v1, v2);
-            sum = _mm_add_ps(sum, _mm_mul_ps(v1, v2));
-        }
-        _mm_store_ps(TmpRes, sum);
-        return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
-    }
-
-    static float
-        L2SqrSIMD4ExtResiduals(float* pVect1v, float* pVect2v, size_t qty) {
-        //size_t qty = *((size_t*)qty_ptr);
-        size_t qty4 = qty >> 2 << 2;
-
-        float res = L2SqrSIMD4Ext(pVect1v, pVect2v, qty4);
-        size_t qty_left = qty - qty4;
-
-        float* pVect1 = (float*)pVect1v + qty4;
-        float* pVect2 = (float*)pVect2v + qty4;
-        float res_tail = L2Sqr(pVect1, pVect2, qty_left);
-
-        return (res + res_tail);
-    }
-
-    static float
-        IpSIMD4ExtResiduals(float* pVect1v, float* pVect2v, size_t qty) {
-        //size_t qty = *((size_t*)qty_ptr);
-        size_t qty4 = qty >> 2 << 2;
-
-        float res = IpSIMD4Ext(pVect1v, pVect2v, qty4);
-        size_t qty_left = qty - qty4;
-
-        float* pVect1 = (float*)pVect1v + qty4;
-        float* pVect2 = (float*)pVect2v + qty4;
-        float res_tail = InnerProduct(pVect1, pVect2, qty_left);
-
-        return (res + res_tail);
-    }
-#endif
-
 }
 
-inline float calL2Sqr_fast(float* v1, float* v2, int dim)
-{
-
-    float res = 0.0;
-    for (int i = 0; i < dim; ++i) {
-        res += (v1[i] - v2[i]) * (v1[i] - v2[i]);
-    }
-    return res;
+inline float calL2Sqr_fast(float *v1, float *v2, int dim) {
 
+  // Create tensors from the input arrays
+  auto t1 = torch::from_blob(v1, {dim}, torch::kFloat);
+  auto t2 = torch::from_blob(v2, {dim}, torch::kFloat);
 
-}
-
-inline float calIp_fast(float* v1, float* v2, int dim)
-{
-#if defined(USE_FAST)
-    if (dim % 16 == 0)
-        return fastlib::IpSIMD16Ext(v1, v2, dim);
-    else if (dim % 4 == 0)
-        return fastlib::IpSIMD4Ext(v1, v2, dim);
-    else if (dim > 16)
-        return fastlib::IpSIMD16ExtResiduals(v1, v2, dim);
-    else if (dim > 4)
-        return fastlib::IpSIMD4ExtResiduals(v1, v2, dim);
-#else
-    float res = 0.0;
-    for (int i = 0; i < dim; ++i) {
-        res += v1[i] * v2[i];
-    }
-    return res;
-#endif
+  // Calculate the squared L2 distance as ||v1 - v2||^2
+  auto diff = t1 - t2;
+  auto l2_sqr = torch::sum(diff * diff);
 
+  // Convert the result back to float and return
+  return l2_sqr.item<float>();
 
 }
 
-namespace fastlib1 {
-    static float
-        L2Sqr(const void* pVect1v, const void* pVect2v, const void* qty_ptr) {
-        float* pVect1 = (float*)pVect1v;
-        float* pVect2 = (float*)pVect2v;
-        size_t qty = *((size_t*)qty_ptr);
-
-        float res = 0;
-        for (size_t i = 0; i < qty; i++) {
-            float t = *pVect1 - *pVect2;
-            pVect1++;
-            pVect2++;
-            res += t * t;
-        }
-        return (res);
-    }
-
-#if defined(USE_AVX)
-
-    // Favor using AVX if available.
-    static float
-        L2SqrSIMD16Ext(const void* pVect1v, const void* pVect2v, const void* qty_ptr) {
-        float* pVect1 = (float*)pVect1v;
-        float* pVect2 = (float*)pVect2v;
-        size_t qty = *((size_t*)qty_ptr);
-        float PORTABLE_ALIGN32 TmpRes[8];
-        size_t qty16 = qty >> 4;
-
-        const float* pEnd1 = pVect1 + (qty16 << 4);
-
-        __m256 diff, v1, v2;
-        __m256 sum = _mm256_set1_ps(0);
-
-        while (pVect1 < pEnd1) {
-            v1 = _mm256_loadu_ps(pVect1);
-            pVect1 += 8;
-            v2 = _mm256_loadu_ps(pVect2);
-            pVect2 += 8;
-            diff = _mm256_sub_ps(v1, v2);
-            sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
-
-            v1 = _mm256_loadu_ps(pVect1);
-            pVect1 += 8;
-            v2 = _mm256_loadu_ps(pVect2);
-            pVect2 += 8;
-            diff = _mm256_sub_ps(v1, v2);
-            sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
-        }
-
-        _mm256_store_ps(TmpRes, sum);
-        return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
-    }
-
-#elif defined(USE_SSE)
-
-    static float
-        L2SqrSIMD16Ext(const void* pVect1v, const void* pVect2v, const void* qty_ptr) {
-        float* pVect1 = (float*)pVect1v;
-        float* pVect2 = (float*)pVect2v;
-        size_t qty = *((size_t*)qty_ptr);
-        float PORTABLE_ALIGN32 TmpRes[8];
-        size_t qty16 = qty >> 4;
+inline float calIp_fast(float *v1, float *v2, int dim) {
+  auto t1 = torch::from_blob(v1, {dim}, torch::kFloat);
+  auto t2 = torch::from_blob(v2, {dim}, torch::kFloat);
 
-        const float* pEnd1 = pVect1 + (qty16 << 4);
+  // Calculate the inner (dot) product
+  auto inner_product = torch::dot(t1, t2);
 
-        __m128 diff, v1, v2;
-        __m128 sum = _mm_set1_ps(0);
+  // Convert the result back to float and return
+  return inner_product.item<float>();
 
-        while (pVect1 < pEnd1) {
-            //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
-            v1 = _mm_loadu_ps(pVect1);
-            pVect1 += 4;
-            v2 = _mm_loadu_ps(pVect2);
-            pVect2 += 4;
-            diff = _mm_sub_ps(v1, v2);
-            sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
-
-            v1 = _mm_loadu_ps(pVect1);
-            pVect1 += 4;
-            v2 = _mm_loadu_ps(pVect2);
-            pVect2 += 4;
-            diff = _mm_sub_ps(v1, v2);
-            sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
-
-            v1 = _mm_loadu_ps(pVect1);
-            pVect1 += 4;
-            v2 = _mm_loadu_ps(pVect2);
-            pVect2 += 4;
-            diff = _mm_sub_ps(v1, v2);
-            sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
-
-            v1 = _mm_loadu_ps(pVect1);
-            pVect1 += 4;
-            v2 = _mm_loadu_ps(pVect2);
-            pVect2 += 4;
-            diff = _mm_sub_ps(v1, v2);
-            sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
-        }
-
-        _mm_store_ps(TmpRes, sum);
-        return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
-    }
-#endif
-
-#if defined(USE_SSE) || defined(USE_AVX)
-    static float
-        L2SqrSIMD16ExtResiduals(const void* pVect1v, const void* pVect2v, const void* qty_ptr) {
-        size_t qty = *((size_t*)qty_ptr);
-        size_t qty16 = qty >> 4 << 4;
-        float res = L2SqrSIMD16Ext(pVect1v, pVect2v, &qty16);
-        float* pVect1 = (float*)pVect1v + qty16;
-        float* pVect2 = (float*)pVect2v + qty16;
-
-        size_t qty_left = qty - qty16;
-        float res_tail = L2Sqr(pVect1, pVect2, &qty_left);
-        return (res + res_tail);
-    }
-#endif
-
-
-#ifdef USE_SSE
-    static float
-        L2SqrSIMD4Ext(const void* pVect1v, const void* pVect2v, const void* qty_ptr) {
-        float PORTABLE_ALIGN32 TmpRes[8];
-        float* pVect1 = (float*)pVect1v;
-        float* pVect2 = (float*)pVect2v;
-        size_t qty = *((size_t*)qty_ptr);
-
-
-        size_t qty4 = qty >> 2;
-
-        const float* pEnd1 = pVect1 + (qty4 << 2);
-
-        __m128 diff, v1, v2;
-        __m128 sum = _mm_set1_ps(0);
-
-        while (pVect1 < pEnd1) {
-            v1 = _mm_loadu_ps(pVect1);
-            pVect1 += 4;
-            v2 = _mm_loadu_ps(pVect2);
-            pVect2 += 4;
-            diff = _mm_sub_ps(v1, v2);
-            sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
-        }
-        _mm_store_ps(TmpRes, sum);
-        return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
-    }
-
-    static float
-        L2SqrSIMD4ExtResiduals(const void* pVect1v, const void* pVect2v, const void* qty_ptr) {
-        size_t qty = *((size_t*)qty_ptr);
-        size_t qty4 = qty >> 2 << 2;
-
-        float res = L2SqrSIMD4Ext(pVect1v, pVect2v, &qty4);
-        size_t qty_left = qty - qty4;
-
-        float* pVect1 = (float*)pVect1v + qty4;
-        float* pVect2 = (float*)pVect2v + qty4;
-        float res_tail = L2Sqr(pVect1, pVect2, &qty_left);
-
-        return (res + res_tail);
-    }
-#endif
 }
 
-inline float calL2Sqr_fast1(float* v1, float* v2, size_t dim)
-{
-#if defined(USE_FAST)
-    if (dim % 16 == 0)
-        return fastlib1::L2SqrSIMD16Ext(v1, v2, &dim);
-    else if (dim % 4 == 0)
-        return fastlib1::L2SqrSIMD4Ext(v1, v2, &dim);
-    else if (dim > 16)
-        return fastlib1::L2SqrSIMD16ExtResiduals(v1, v2, &dim);
-    else if (dim > 4)
-        return fastlib1::L2SqrSIMD4ExtResiduals(v1, v2, &dim);
-#else
-    float res = 0.0;
-    for (int i = 0; i < dim; ++i) {
-        res += (v1[i] - v2[i]) * (v1[i] - v2[i]);
-    }
-    return res;
-#endif
-
-
-}
\ No newline at end of file
+#endif
\ No newline at end of file
diff --git a/include/CANDY/LSHAPGIndex/space_l2.h b/include/CANDY/LSHAPGIndex/space_l2.h
index e9d92b273..8378a8635 100644
--- a/include/CANDY/LSHAPGIndex/space_l2.h
+++ b/include/CANDY/LSHAPGIndex/space_l2.h
@@ -5,172 +5,3 @@ extern int _G_COST;
 template<typename MTYPE>
 using DISTFUNC = MTYPE(*)(const void*, const void*, const void*);
 
-
-static float
-    L2Sqr(const void* pVect1v, const void* pVect2v, const void* qty_ptr) {
-    ++_G_COST;
-    float* pVect1 = (float*)pVect1v;
-    float* pVect2 = (float*)pVect2v;
-    size_t qty = *((size_t*)qty_ptr);
-
-    float res = 0;
-    for (size_t i = 0; i < qty; i++) {
-        float t = *pVect1 - *pVect2;
-        pVect1++;
-        pVect2++;
-        res += t * t;
-    }
-    return (res);
-}
-
-#if defined(USE_AVX)
-
-// Favor using AVX if available.
-static float
-    L2SqrSIMD16Ext(const void* pVect1v, const void* pVect2v, const void* qty_ptr) {
-    ++_G_COST;
-    float* pVect1 = (float*)pVect1v;
-    float* pVect2 = (float*)pVect2v;
-    size_t qty = *((size_t*)qty_ptr);
-    float PORTABLE_ALIGN32 TmpRes[8];
-    size_t qty16 = qty >> 4;
-
-    const float* pEnd1 = pVect1 + (qty16 << 4);
-
-    __m256 diff, v1, v2;
-    __m256 sum = _mm256_set1_ps(0);
-
-    while (pVect1 < pEnd1) {
-        v1 = _mm256_loadu_ps(pVect1);
-        pVect1 += 8;
-        v2 = _mm256_loadu_ps(pVect2);
-        pVect2 += 8;
-        diff = _mm256_sub_ps(v1, v2);
-        sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
-
-        v1 = _mm256_loadu_ps(pVect1);
-        pVect1 += 8;
-        v2 = _mm256_loadu_ps(pVect2);
-        pVect2 += 8;
-        diff = _mm256_sub_ps(v1, v2);
-        sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
-    }
-
-    _mm256_store_ps(TmpRes, sum);
-    return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7];
-}
-
-#elif defined(USE_SSE)
-
-static float
-    L2SqrSIMD16Ext(const void* pVect1v, const void* pVect2v, const void* qty_ptr) {
-    ++_G_COST;
-    float* pVect1 = (float*)pVect1v;
-    float* pVect2 = (float*)pVect2v;
-    size_t qty = *((size_t*)qty_ptr);
-    float PORTABLE_ALIGN32 TmpRes[8];
-    size_t qty16 = qty >> 4;
-
-    const float* pEnd1 = pVect1 + (qty16 << 4);
-
-    __m128 diff, v1, v2;
-    __m128 sum = _mm_set1_ps(0);
-
-    while (pVect1 < pEnd1) {
-        //_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);
-        v1 = _mm_loadu_ps(pVect1);
-        pVect1 += 4;
-        v2 = _mm_loadu_ps(pVect2);
-        pVect2 += 4;
-        diff = _mm_sub_ps(v1, v2);
-        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
-
-        v1 = _mm_loadu_ps(pVect1);
-        pVect1 += 4;
-        v2 = _mm_loadu_ps(pVect2);
-        pVect2 += 4;
-        diff = _mm_sub_ps(v1, v2);
-        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
-
-        v1 = _mm_loadu_ps(pVect1);
-        pVect1 += 4;
-        v2 = _mm_loadu_ps(pVect2);
-        pVect2 += 4;
-        diff = _mm_sub_ps(v1, v2);
-        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
-
-        v1 = _mm_loadu_ps(pVect1);
-        pVect1 += 4;
-        v2 = _mm_loadu_ps(pVect2);
-        pVect2 += 4;
-        diff = _mm_sub_ps(v1, v2);
-        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
-    }
-
-    _mm_store_ps(TmpRes, sum);
-    return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
-}
-#endif
-
-#if defined(USE_SSE) || defined(USE_AVX)
-static float
-    L2SqrSIMD16ExtResiduals(const void* pVect1v, const void* pVect2v, const void* qty_ptr) {
-    ++_G_COST;
-    size_t qty = *((size_t*)qty_ptr);
-    size_t qty16 = qty >> 4 << 4;
-    float res = L2SqrSIMD16Ext(pVect1v, pVect2v, &qty16);
-    float* pVect1 = (float*)pVect1v + qty16;
-    float* pVect2 = (float*)pVect2v + qty16;
-
-    size_t qty_left = qty - qty16;
-    float res_tail = L2Sqr(pVect1, pVect2, &qty_left);
-    return (res + res_tail);
-}
-#endif
-
-
-#ifdef USE_SSE
-static float
-    L2SqrSIMD4Ext(const void* pVect1v, const void* pVect2v, const void* qty_ptr) {
-    ++_G_COST;
-    float PORTABLE_ALIGN32 TmpRes[8];
-    float* pVect1 = (float*)pVect1v;
-    float* pVect2 = (float*)pVect2v;
-    size_t qty = *((size_t*)qty_ptr);
-
-
-    size_t qty4 = qty >> 2;
-
-    const float* pEnd1 = pVect1 + (qty4 << 2);
-
-    __m128 diff, v1, v2;
-    __m128 sum = _mm_set1_ps(0);
-
-    while (pVect1 < pEnd1) {
-        v1 = _mm_loadu_ps(pVect1);
-        pVect1 += 4;
-        v2 = _mm_loadu_ps(pVect2);
-        pVect2 += 4;
-        diff = _mm_sub_ps(v1, v2);
-        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
-    }
-    _mm_store_ps(TmpRes, sum);
-    return TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3];
-}
-
-static float
-    L2SqrSIMD4ExtResiduals(const void* pVect1v, const void* pVect2v, const void* qty_ptr) {
-    ++_G_COST;
-    size_t qty = *((size_t*)qty_ptr);
-    size_t qty4 = qty >> 2 << 2;
-
-    float res = L2SqrSIMD4Ext(pVect1v, pVect2v, &qty4);
-    size_t qty_left = qty - qty4;
-
-    float* pVect1 = (float*)pVect1v + qty4;
-    float* pVect2 = (float*)pVect2v + qty4;
-    float res_tail = L2Sqr(pVect1, pVect2, &qty_left);
-
-    return (res + res_tail);
-}
-#endif
diff --git a/include/Utils/MicroDataSet.hpp b/include/Utils/MicroDataSet.hpp
index 1af3ca8ac..afaedcfab 100644
--- a/include/Utils/MicroDataSet.hpp
+++ b/include/Utils/MicroDataSet.hpp
@@ -15,7 +15,7 @@
 #include <random>
 #include <cmath>
 #include <iostream>
-
+#include <algorithm>
 using namespace std;
 namespace INTELLI {
 /**
diff --git a/include/simd_config.h.in b/include/simd_config.h.in
new file mode 100644
index 000000000..581061061
--- /dev/null
+++ b/include/simd_config.h.in
@@ -0,0 +1,9 @@
+//
+// Created by tony on 04/06/22.
+//
+
+#ifndef CANDY_SIMD_CONFIG_H_IN_H_
+#define CANDY_SIMD_CONFIG_H_IN_H_
+#define CANDY_AVX2 @CANDY_AVX2@
+#define CANDY_AVX512 @CANDY_AVX512@
+#endif
diff --git a/src/CANDY/DPGIndex.cpp b/src/CANDY/DPGIndex.cpp
index 67e3a48ad..83f1c59d3 100644
--- a/src/CANDY/DPGIndex.cpp
+++ b/src/CANDY/DPGIndex.cpp
@@ -153,16 +153,20 @@ void DPGIndex::removeLayer1Neighbor(size_t i, size_t j) {
 }
 
 double DPGIndex::calcDist(const torch::Tensor &ta, const torch::Tensor &tb) {
-  auto taPtr = ta.contiguous().data_ptr<float>(),
-       tbPtr = tb.contiguous().data_ptr<float>();
+
   double ans = 0;
   if (faissMetric == faiss::METRIC_L2) {
-    for (size_t i = 0; i < vecDim; ++i) {
-      auto diff = taPtr[i] - tbPtr[i];
-      ans += diff * diff;
-    }
+    // Calculate the squared L2 distance as ||v1 - v2||^2
+    auto diff = ta-tb;
+    auto l2_sqr = torch::sum(diff * diff);
+    // Convert the result back to float and return
+    ans = l2_sqr.item<float>();
   } else {
-    for (size_t i = 0; i < vecDim; ++i) ans -= taPtr[i] * tbPtr[i];
+    //for (size_t i = 0; i < vecDim; ++i) ans -= taPtr[i] * tbPtr[i];
+    // Calculate the inner (dot) product
+    auto inner_product = torch::dot(ta, tb);
+    ans = inner_product.item<float>();
+    // Convert the result back to float and return
   }
   return ans;
 }
diff --git a/thirdparty/faiss/CMakeLists.txt b/thirdparty/faiss/CMakeLists.txt
index 40c6fb2e3..dc0c09bdd 100644
--- a/thirdparty/faiss/CMakeLists.txt
+++ b/thirdparty/faiss/CMakeLists.txt
@@ -51,7 +51,70 @@ set(CMAKE_CXX_STANDARD 17)
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
 
 # Valid values are "generic", "avx2", "avx512".
-option(FAISS_OPT_LEVEL "" "generic")
+
+# Define the function to detect AVX-512 support
+function(detect_avx512_support result_var)
+    include(CheckCXXSourceCompiles)
+    set(CMAKE_REQUIRED_FLAGS "-mavx512f")
+    check_cxx_source_compiles("
+    #include <immintrin.h>
+    int main() {
+        __m512i vec = _mm512_set1_epi32(1);  // AVX-512 intrinsic
+        return 0;
+    }
+" HAVE_AVX512)
+
+    if(HAVE_AVX512)
+        #message(STATUS "AVX-512 support detected.")
+        set(${result_var} 1 PARENT_SCOPE)
+    else()
+       # message(STATUS "AVX-512 support NOT detected.")
+        set(${result_var} 0 PARENT_SCOPE)
+    endif()
+endfunction()
+
+function(detect_avx2_support result_var)
+    include(CheckCXXSourceCompiles)
+    # Save the current compiler flags to restore them later
+    set(saved_flags "${CMAKE_CXX_FLAGS}")
+    # Test AVX2 intrinsic support by compiling a minimal test program
+    check_cxx_source_compiles("
+        #include <immintrin.h>
+        int main() {
+            __m256i vec = _mm256_set1_epi32(1);  // AVX2 intrinsic
+            return 0;
+        }
+    " HAVE_AVX2)
+
+    # Restore the original compiler flags
+    set(CMAKE_CXX_FLAGS "${saved_flags}" PARENT_SCOPE)
+    # Return TRUE or FALSE based on the test result
+    if(HAVE_AVX2)
+        set(${result_var} 1 PARENT_SCOPE)
+    else()
+        set(${result_var} 0 PARENT_SCOPE)
+    endif()
+endfunction()
+
+detect_avx512_support(AVX512_AVAILABLE)
+# Use AVX-512 based on the result
+if(AVX512_AVAILABLE)
+    message(STATUS "AVX-512 support detected.")
+    option(FAISS_OPT_LEVEL "" "avx512")
+else()
+    message(STATUS "AVX-512 support NOT detected.")
+    detect_avx2_support(AVX2_AVAILABLE)
+    if(AVX2_AVAILABLE)
+        message(STATUS "AVX-2 support detected.")
+        option(FAISS_OPT_LEVEL "" "avx2")
+    else ()
+        message(STATUS "AVX-2 support not detected.")
+        option(FAISS_OPT_LEVEL "" "generic")
+    endif ()
+endif()
+
+
+
 option(FAISS_ENABLE_GPU "Enable support for GPU indexes." OFF)
 option(FAISS_ENABLE_RAFT "Enable RAFT for GPU indexes." OFF)
 option(FAISS_ENABLE_PYTHON "Build Python extension." OFF)
diff --git a/thirdparty/spdk b/thirdparty/spdk
new file mode 160000
index 000000000..94a53a53b
--- /dev/null
+++ b/thirdparty/spdk
@@ -0,0 +1 @@
+Subproject commit 94a53a53bc28084c32aa697203735d6959a042ec

From faae46804459e3fd53f61a88ef40201907af1a27 Mon Sep 17 00:00:00 2001
From: tonyskyzeng <292224750@qq.com>
Date: Fri, 20 Sep 2024 12:35:08 +0800
Subject: [PATCH 2/2] Make simd a default behavior

---
 src/CANDY/DPGIndex.cpp       |  2 +-
 src/CANDY/NNDescentIndex.cpp | 17 ++++++++++-------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/CANDY/DPGIndex.cpp b/src/CANDY/DPGIndex.cpp
index 83f1c59d3..f8ae22f2d 100644
--- a/src/CANDY/DPGIndex.cpp
+++ b/src/CANDY/DPGIndex.cpp
@@ -165,7 +165,7 @@ double DPGIndex::calcDist(const torch::Tensor &ta, const torch::Tensor &tb) {
     //for (size_t i = 0; i < vecDim; ++i) ans -= taPtr[i] * tbPtr[i];
     // Calculate the inner (dot) product
     auto inner_product = torch::dot(ta, tb);
-    ans = inner_product.item<float>();
+    ans = -inner_product.item<float>();
     // Convert the result back to float and return
   }
   return ans;
diff --git a/src/CANDY/NNDescentIndex.cpp b/src/CANDY/NNDescentIndex.cpp
index 3e14da937..a89eed526 100644
--- a/src/CANDY/NNDescentIndex.cpp
+++ b/src/CANDY/NNDescentIndex.cpp
@@ -128,16 +128,19 @@ bool NNDescentIndex::updateNN(size_t i, size_t j, double dist) {
 
 double NNDescentIndex::calcDist(const torch::Tensor &ta,
                                 const torch::Tensor &tb) {
-  auto taPtr = ta.contiguous().data_ptr<float>(),
-       tbPtr = tb.contiguous().data_ptr<float>();
   double ans = 0;
   if (faissMetric == faiss::METRIC_L2) {
-    for (size_t i = 0; i < vecDim; ++i) {
-      auto diff = taPtr[i] - tbPtr[i];
-      ans += diff * diff;
-    }
+    // Calculate the squared L2 distance as ||v1 - v2||^2
+    auto diff = ta-tb;
+    auto l2_sqr = torch::sum(diff * diff);
+    // Convert the result back to float and return
+    ans = l2_sqr.item<float>();
   } else {
-    for (size_t i = 0; i < vecDim; ++i) ans -= taPtr[i] * tbPtr[i];
+    //for (size_t i = 0; i < vecDim; ++i) ans -= taPtr[i] * tbPtr[i];
+    // Calculate the inner (dot) product
+    auto inner_product = torch::dot(ta, tb);
+    ans = -inner_product.item<float>();
+    // Convert the result back to float and return
   }
   return ans;
 }