From 3a1ad461ff9e75349a3ac3d8eafb44883db4d824 Mon Sep 17 00:00:00 2001 From: Artem Labazov Date: Fri, 20 Dec 2024 15:30:33 -0800 Subject: [PATCH 1/8] Conditionally compile extras like benchmarks and demos (#4094) Summary: While embedding faiss as a subproject, these targets are quite useless. This PR adss a new `FAISS_ENABLE_EXTRAS` CMake option to exclude them, default is ON to preserve current behaviour. Pull Request resolved: https://github.com/facebookresearch/faiss/pull/4094 Reviewed By: satymish Differential Revision: D67345666 Pulled By: gtwang01 fbshipit-source-id: c8d9ef14b21816e45f681412bda257192f5365d0 --- CMakeLists.txt | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6bd1ad3eaf..50843cb716 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,6 +64,7 @@ option(FAISS_ENABLE_CUVS "Enable cuVS for GPU indexes." OFF) option(FAISS_ENABLE_ROCM "Enable ROCm for GPU indexes." OFF) option(FAISS_ENABLE_PYTHON "Build Python extension." ON) option(FAISS_ENABLE_C_API "Build C API." OFF) +option(FAISS_ENABLE_EXTRAS "Build extras like benchmarks and demos" ON) option(FAISS_USE_LTO "Enable Link-Time optimization" OFF) if(FAISS_ENABLE_GPU) @@ -103,10 +104,11 @@ if(FAISS_ENABLE_C_API) add_subdirectory(c_api) endif() -add_subdirectory(demos) -add_subdirectory(benchs) -add_subdirectory(tutorial/cpp) - +if(FAISS_ENABLE_EXTRAS) + add_subdirectory(demos) + add_subdirectory(benchs) + add_subdirectory(tutorial/cpp) +endif() # CTest must be included in the top level to enable `make test` target. include(CTest) From 3beb07b198a83450d6ed65c7bf9f4e36ef2a97c9 Mon Sep 17 00:00:00 2001 From: Mulugeta Mammo Date: Mon, 23 Dec 2024 08:56:26 -0800 Subject: [PATCH 2/8] Add a new architecture mode: 'avx512_spr'. 
(#4025) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: This PR adds a new architecture mode to support the new extensions to AVX512, namely [AVX512-FP16](https://networkbuilders.intel.com/solutionslibrary/intel-avx-512-fp16-instruction-set-for-intel-xeon-processor-based-products-technology-guide), which have been available since IntelĀ® Sapphire Rapids. This PR is a prerequisite for [PR#4020](https://github.com/facebookresearch/faiss/pull/4020) that speeds up hamming distance evaluations. Pull Request resolved: https://github.com/facebookresearch/faiss/pull/4025 Reviewed By: pankajsingh88 Differential Revision: D67524575 Pulled By: mengdilin fbshipit-source-id: f3a09943b062d720b241f95aef2f390923ffd779 --- .github/workflows/build-pull-request.yml | 11 ++++++++ .gitignore | 1 + CMakeLists.txt | 2 +- INSTALL.md | 8 +++++- cmake/link_to_faiss_lib.cmake | 13 ++++++++- faiss/CMakeLists.txt | 35 ++++++++++++++++++++++-- faiss/gpu/CMakeLists.txt | 2 ++ faiss/python/CMakeLists.txt | 33 ++++++++++++++++++++++ faiss/python/loader.py | 15 ++++++++++ faiss/python/setup.py | 11 ++++++-- tests/test_contrib.py | 3 +- 11 files changed, 125 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build-pull-request.yml b/.github/workflows/build-pull-request.yml index f94077513d..b312c71d37 100644 --- a/.github/workflows/build-pull-request.yml +++ b/.github/workflows/build-pull-request.yml @@ -60,6 +60,17 @@ jobs: uses: ./.github/actions/build_cmake with: opt_level: avx512 + linux-x86_64-AVX512_SPR-cmake: + name: Linux x86_64 AVX512_SPR (cmake) + needs: linux-x86_64-cmake + runs-on: faiss-aws-m7i.large + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Build and Test (cmake) + uses: ./.github/actions/build_cmake + with: + opt_level: avx512_spr linux-x86_64-GPU-cmake: name: Linux x86_64 GPU (cmake) needs: linux-x86_64-cmake diff --git a/.gitignore b/.gitignore index 01b98f0a9c..52a99e8fc0 100644 --- a/.gitignore 
+++ b/.gitignore @@ -19,4 +19,5 @@ /tests/gtest/ faiss/python/swigfaiss_avx2.swig faiss/python/swigfaiss_avx512.swig +faiss/python/swigfaiss_avx512_spr.swig faiss/python/swigfaiss_sve.swig diff --git a/CMakeLists.txt b/CMakeLists.txt index 50843cb716..fc6c0d55f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,7 +57,7 @@ set(CMAKE_CXX_STANDARD 17) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") -# Valid values are "generic", "avx2", "avx512", "sve". +# Valid values are "generic", "avx2", "avx512", "avx512_spr", "sve". option(FAISS_OPT_LEVEL "" "generic") option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON) option(FAISS_ENABLE_CUVS "Enable cuVS for GPU indexes." OFF) diff --git a/INSTALL.md b/INSTALL.md index e16de484fe..c5f76d47d5 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -131,7 +131,7 @@ Several options can be passed to CMake, among which: optimization options (enables `-O3` on gcc for instance), - `-DFAISS_OPT_LEVEL=avx2` in order to enable the required compiler flags to generate code using optimized SIMD/Vector instructions. Possible values are below: - - On x86-64, `generic`, `avx2` and `avx512`, by increasing order of optimization, + - On x86-64, `generic`, `avx2`, 'avx512', and `avx512_spr` (for avx512 features available since Intel(R) Sapphire Rapids), by increasing order of optimization, - On aarch64, `generic` and `sve`, by increasing order of optimization, - `-DFAISS_USE_LTO=ON` in order to enable [Link-Time Optimization](https://en.wikipedia.org/wiki/Link-time_optimization) (default is `OFF`, possible values are `ON` and `OFF`). - BLAS-related options: @@ -180,6 +180,12 @@ For AVX512: $ make -C build -j faiss_avx512 ``` +For AVX512 features available since Intel(R) Sapphire Rapids. + +``` shell +$ make -C build -j faiss_avx512_spr +``` + This will ensure the creation of neccesary files when building and installing the python package. 
## Step 3: Building the python bindings (optional) diff --git a/cmake/link_to_faiss_lib.cmake b/cmake/link_to_faiss_lib.cmake index 939ed61fc9..e04453ddc3 100644 --- a/cmake/link_to_faiss_lib.cmake +++ b/cmake/link_to_faiss_lib.cmake @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. function(link_to_faiss_lib target) - if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512" AND NOT FAISS_OPT_LEVEL STREQUAL "sve") + if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512_spr" AND NOT FAISS_OPT_LEVEL STREQUAL "sve") target_link_libraries(${target} PRIVATE faiss) endif() @@ -27,6 +27,17 @@ function(link_to_faiss_lib target) target_link_libraries(${target} PRIVATE faiss_avx512) endif() + if(FAISS_OPT_LEVEL STREQUAL "avx512_spr") + if(NOT WIN32) + # Architecture mode to support AVX512 extensions available since Intel (R) Sapphire Rapids. + # Ref: https://networkbuilders.intel.com/solutionslibrary/intel-avx-512-fp16-instruction-set-for-intel-xeon-processor-based-products-technology-guide + target_compile_options(${target} PRIVATE $<$:-march=sapphirerapids -mtune=sapphirerapids>) + else() + target_compile_options(${target} PRIVATE $<$:/arch:AVX512>) + endif() + target_link_libraries(${target} PRIVATE faiss_avx512_spr) + endif() + if(FAISS_OPT_LEVEL STREQUAL "sve") if(NOT WIN32) if("${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG} " MATCHES "(^| )-march=native") diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index 5e91038eb7..535710b829 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -243,7 +243,7 @@ set(FAISS_HEADERS ${FAISS_HEADERS} PARENT_SCOPE) add_library(faiss ${FAISS_SRC}) add_library(faiss_avx2 ${FAISS_SRC}) -if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512") +if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512_spr") 
set_target_properties(faiss_avx2 PROPERTIES EXCLUDE_FROM_ALL TRUE) endif() if(NOT WIN32) @@ -272,6 +272,20 @@ else() add_compile_options(/bigobj) endif() +add_library(faiss_avx512_spr ${FAISS_SRC}) +if(NOT FAISS_OPT_LEVEL STREQUAL "avx512_spr") + set_target_properties(faiss_avx512_spr PROPERTIES EXCLUDE_FROM_ALL TRUE) +endif() +if(NOT WIN32) + # Architecture mode to support AVX512 extensions available since Intel(R) Sapphire Rapids. + # Ref: https://networkbuilders.intel.com/solutionslibrary/intel-avx-512-fp16-instruction-set-for-intel-xeon-processor-based-products-technology-guide + target_compile_options(faiss_avx512_spr PRIVATE $<$:-march=sapphirerapids -mtune=sapphirerapids>) +else() + target_compile_options(faiss_avx512_spr PRIVATE $<$:/arch:AVX512>) + # we need bigobj for the swig wrapper + add_compile_options(/bigobj) +endif() + add_library(faiss_sve ${FAISS_SRC}) if(NOT FAISS_OPT_LEVEL STREQUAL "sve") set_target_properties(faiss_sve PROPERTIES EXCLUDE_FROM_ALL TRUE) @@ -307,10 +321,13 @@ target_include_directories(faiss_avx2 PUBLIC target_include_directories(faiss_avx512 PUBLIC $) # Handle `#include `. +target_include_directories(faiss_avx512_spr PUBLIC + $) +# Handle `#include `. 
target_include_directories(faiss_sve PUBLIC $) -set_target_properties(faiss faiss_avx2 faiss_avx512 faiss_sve PROPERTIES +set_target_properties(faiss faiss_avx2 faiss_avx512 faiss_avx512_spr faiss_sve PROPERTIES POSITION_INDEPENDENT_CODE ON WINDOWS_EXPORT_ALL_SYMBOLS ON ) @@ -319,6 +336,7 @@ if(WIN32) target_compile_definitions(faiss PRIVATE FAISS_MAIN_LIB) target_compile_definitions(faiss_avx2 PRIVATE FAISS_MAIN_LIB) target_compile_definitions(faiss_avx512 PRIVATE FAISS_MAIN_LIB) + target_compile_definitions(faiss_avx512_spr PRIVATE FAISS_MAIN_LIB) target_compile_definitions(faiss_sve PRIVATE FAISS_MAIN_LIB) endif() @@ -328,6 +346,7 @@ if (${finteger_idx} EQUAL -1) endif() target_compile_definitions(faiss_avx2 PRIVATE FINTEGER=int) target_compile_definitions(faiss_avx512 PRIVATE FINTEGER=int) +target_compile_definitions(faiss_avx512_spr PRIVATE FINTEGER=int) target_compile_definitions(faiss_sve PRIVATE FINTEGER=int) if(FAISS_USE_LTO) @@ -339,6 +358,7 @@ if(FAISS_USE_LTO) set_property(TARGET faiss PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) set_property(TARGET faiss_avx2 PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) set_property(TARGET faiss_avx512 PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) + set_property(TARGET faiss_avx512_spr PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) else() message(STATUS "LTO not supported: <${ipo_error}>") endif() @@ -348,6 +368,7 @@ find_package(OpenMP REQUIRED) target_link_libraries(faiss PRIVATE OpenMP::OpenMP_CXX) target_link_libraries(faiss_avx2 PRIVATE OpenMP::OpenMP_CXX) target_link_libraries(faiss_avx512 PRIVATE OpenMP::OpenMP_CXX) +target_link_libraries(faiss_avx512_spr PRIVATE OpenMP::OpenMP_CXX) target_link_libraries(faiss_sve PRIVATE OpenMP::OpenMP_CXX) find_package(MKL) @@ -355,17 +376,20 @@ if(MKL_FOUND) target_link_libraries(faiss PRIVATE ${MKL_LIBRARIES}) target_link_libraries(faiss_avx2 PRIVATE ${MKL_LIBRARIES}) target_link_libraries(faiss_avx512 PRIVATE ${MKL_LIBRARIES}) + target_link_libraries(faiss_avx512_spr PRIVATE 
${MKL_LIBRARIES}) else() find_package(BLAS REQUIRED) target_link_libraries(faiss PRIVATE ${BLAS_LIBRARIES}) target_link_libraries(faiss_avx2 PRIVATE ${BLAS_LIBRARIES}) target_link_libraries(faiss_avx512 PRIVATE ${BLAS_LIBRARIES}) + target_link_libraries(faiss_avx512_spr PRIVATE ${BLAS_LIBRARIES}) target_link_libraries(faiss_sve PRIVATE ${BLAS_LIBRARIES}) find_package(LAPACK REQUIRED) target_link_libraries(faiss PRIVATE ${LAPACK_LIBRARIES}) target_link_libraries(faiss_avx2 PRIVATE ${LAPACK_LIBRARIES}) target_link_libraries(faiss_avx512 PRIVATE ${LAPACK_LIBRARIES}) + target_link_libraries(faiss_avx512_spr PRIVATE ${LAPACK_LIBRARIES}) target_link_libraries(faiss_sve PRIVATE ${LAPACK_LIBRARIES}) endif() @@ -390,6 +414,13 @@ if(FAISS_OPT_LEVEL STREQUAL "avx512") LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) endif() +if(FAISS_OPT_LEVEL STREQUAL "avx512_spr") + install(TARGETS faiss_avx2 faiss_avx512 faiss_avx512_spr + EXPORT faiss-targets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) +endif() if(FAISS_OPT_LEVEL STREQUAL "sve") install(TARGETS faiss_sve EXPORT faiss-targets diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 84cb222145..16574aab61 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -270,6 +270,7 @@ if(FAISS_ENABLE_CUVS) target_compile_definitions(faiss PUBLIC USE_NVIDIA_CUVS=1) target_compile_definitions(faiss_avx2 PUBLIC USE_NVIDIA_CUVS=1) target_compile_definitions(faiss_avx512 PUBLIC USE_NVIDIA_CUVS=1) + target_compile_definitions(faiss_avx512_spr PUBLIC USE_NVIDIA_CUVS=1) # Mark all functions as hidden so that we don't generate # global 'public' functions that also exist in libraft.so @@ -305,6 +306,7 @@ set(FAISS_GPU_HEADERS ${FAISS_GPU_HEADERS} PARENT_SCOPE) target_link_libraries(faiss PRIVATE "$") target_link_libraries(faiss_avx2 PRIVATE "$") target_link_libraries(faiss_avx512 PRIVATE "$") +target_link_libraries(faiss_avx512_spr PRIVATE "$") 
target_link_libraries(faiss_sve PRIVATE "$") foreach(header ${FAISS_GPU_HEADERS}) diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt index 41e08eb348..3fc46f4c56 100644 --- a/faiss/python/CMakeLists.txt +++ b/faiss/python/CMakeLists.txt @@ -54,11 +54,13 @@ endmacro() # we duplicate the source in order to override the module name. configure_file(swigfaiss.swig ${CMAKE_CURRENT_SOURCE_DIR}/swigfaiss_avx2.swig COPYONLY) configure_file(swigfaiss.swig ${CMAKE_CURRENT_SOURCE_DIR}/swigfaiss_avx512.swig COPYONLY) +configure_file(swigfaiss.swig ${CMAKE_CURRENT_SOURCE_DIR}/swigfaiss_avx512_spr.swig COPYONLY) configure_file(swigfaiss.swig ${CMAKE_CURRENT_SOURCE_DIR}/swigfaiss_sve.swig COPYONLY) configure_swigfaiss(swigfaiss.swig) configure_swigfaiss(swigfaiss_avx2.swig) configure_swigfaiss(swigfaiss_avx512.swig) +configure_swigfaiss(swigfaiss_avx512_spr.swig) configure_swigfaiss(swigfaiss_sve.swig) configure_swigfaiss(faiss_example_external_module.swig) @@ -72,6 +74,8 @@ if(TARGET faiss) "${faiss_SOURCE_DIR}/faiss/${h}") list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx512_spr_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/${h}") list(APPEND SWIG_MODULE_swigfaiss_sve_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/${h}") list(APPEND SWIG_MODULE_faiss_example_external_module_EXTRA_DEPS @@ -85,6 +89,8 @@ if(TARGET faiss) "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx512_spr_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") list(APPEND SWIG_MODULE_faiss_example_external_module_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") endforeach() @@ -96,6 +102,8 @@ if(TARGET faiss) "${faiss_SOURCE_DIR}/faiss/gpu/${h}") list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx512_spr_EXTRA_DEPS + 
"${faiss_SOURCE_DIR}/faiss/gpu/${h}") list(APPEND SWIG_MODULE_swigfaiss_sve_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu/${h}") list(APPEND SWIG_MODULE_faiss_example_external_module_EXTRA_DEPS @@ -146,6 +154,18 @@ if(NOT FAISS_OPT_LEVEL STREQUAL "avx512") set_target_properties(swigfaiss_avx512 PROPERTIES EXCLUDE_FROM_ALL TRUE) endif() +set_property(SOURCE swigfaiss_avx512_spr.swig + PROPERTY SWIG_MODULE_NAME swigfaiss_avx512_spr) +swig_add_library(swigfaiss_avx512_spr + TYPE SHARED + LANGUAGE python + SOURCES swigfaiss_avx512_spr.swig +) +set_property(TARGET swigfaiss_avx512_spr PROPERTY SWIG_COMPILE_OPTIONS -doxygen) +if(NOT FAISS_OPT_LEVEL STREQUAL "avx512_spr") + set_target_properties(swigfaiss_avx512_spr PROPERTIES EXCLUDE_FROM_ALL TRUE) +endif() + set_property(SOURCE swigfaiss_sve.swig PROPERTY SWIG_MODULE_NAME swigfaiss_sve) swig_add_library(swigfaiss_sve @@ -172,6 +192,7 @@ if(NOT WIN32) set_target_properties(swigfaiss PROPERTIES SUFFIX .so) set_target_properties(swigfaiss_avx2 PROPERTIES SUFFIX .so) set_target_properties(swigfaiss_avx512 PROPERTIES SUFFIX .so) + set_target_properties(swigfaiss_avx512_spr PROPERTIES SUFFIX .so) set_target_properties(swigfaiss_sve PROPERTIES SUFFIX .so) set_target_properties(faiss_example_external_module PROPERTIES SUFFIX .so) else() @@ -179,6 +200,7 @@ else() target_compile_options(swigfaiss PRIVATE /bigobj) target_compile_options(swigfaiss_avx2 PRIVATE /bigobj) target_compile_options(swigfaiss_avx512 PRIVATE /bigobj) + target_compile_options(swigfaiss_avx512_spr PRIVATE /bigobj) target_compile_options(swigfaiss_sve PRIVATE /bigobj) target_compile_options(faiss_example_external_module PRIVATE /bigobj) endif() @@ -188,6 +210,7 @@ if(FAISS_ENABLE_GPU) target_link_libraries(swigfaiss PRIVATE hip::host) target_link_libraries(swigfaiss_avx2 PRIVATE hip::host) target_link_libraries(swigfaiss_avx512 PRIVATE hip::host) + target_link_libraries(swigfaiss_avx512_spr PRIVATE hip::host) target_link_libraries(faiss_example_external_module 
PRIVATE hip::host) else() find_package(CUDAToolkit REQUIRED) @@ -197,6 +220,7 @@ if(FAISS_ENABLE_GPU) target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$:cuvs::cuvs>) target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$:cuvs::cuvs>) target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$:cuvs::cuvs>) + target_link_libraries(swigfaiss_avx512_spr PRIVATE CUDA::cudart $<$:cuvs::cuvs>) target_link_libraries(swigfaiss_sve PRIVATE CUDA::cudart $<$:cuvs::cuvs>) endif() endif() @@ -224,6 +248,13 @@ target_link_libraries(swigfaiss_avx512 PRIVATE OpenMP::OpenMP_CXX ) +target_link_libraries(swigfaiss_avx512_spr PRIVATE + faiss_avx512_spr + Python::Module + Python::NumPy + OpenMP::OpenMP_CXX +) + target_link_libraries(swigfaiss_sve PRIVATE faiss_sve Python::Module @@ -244,6 +275,7 @@ target_link_libraries(faiss_example_external_module PRIVATE target_include_directories(swigfaiss PRIVATE ${PROJECT_SOURCE_DIR}/../..) target_include_directories(swigfaiss_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/../..) target_include_directories(swigfaiss_avx512 PRIVATE ${PROJECT_SOURCE_DIR}/../..) +target_include_directories(swigfaiss_avx512_spr PRIVATE ${PROJECT_SOURCE_DIR}/../..) target_include_directories(swigfaiss_sve PRIVATE ${PROJECT_SOURCE_DIR}/../..) target_include_directories(faiss_example_external_module PRIVATE ${PROJECT_SOURCE_DIR}/../..) 
@@ -270,6 +302,7 @@ target_include_directories(faiss_python_callbacks PRIVATE ${Python_INCLUDE_DIRS} target_link_libraries(swigfaiss PRIVATE faiss_python_callbacks) target_link_libraries(swigfaiss_avx2 PRIVATE faiss_python_callbacks) target_link_libraries(swigfaiss_avx512 PRIVATE faiss_python_callbacks) +target_link_libraries(swigfaiss_avx512_spr PRIVATE faiss_python_callbacks) target_link_libraries(swigfaiss_sve PRIVATE faiss_python_callbacks) target_link_libraries(faiss_example_external_module PRIVATE faiss_python_callbacks) diff --git a/faiss/python/loader.py b/faiss/python/loader.py index 9f5be7d2ed..caef9e5512 100644 --- a/faiss/python/loader.py +++ b/faiss/python/loader.py @@ -67,6 +67,9 @@ def is_sve_supported(): result.add("AVX2") if "avx512" in numpy.distutils.cpuinfo.cpu.info[0].get('flags', ""): result.add("AVX512") + if "avx512_fp16" in numpy.distutils.cpuinfo.cpu.info[0].get('flags', ""): + # avx512_fp16 is supported starting SPR + result.add("AVX512_SPR") if is_sve_supported(): result.add("SVE") for f in os.getenv("FAISS_DISABLE_CPU_FEATURES", "").split(", \t\n\r"): @@ -92,6 +95,18 @@ def is_sve_supported(): instruction_sets.add(opt_level) loaded = False +has_AVX512_SPR = any("AVX512_SPR" in x.upper() for x in instruction_sets) +if has_AVX512_SPR: + try: + logger.info("Loading faiss with AVX512-SPR support.") + from .swigfaiss_avx512_spr import * + logger.info("Successfully loaded faiss with AVX512-SPR support.") + loaded = True + except ImportError as e: + logger.info(f"Could not load library with AVX512-SPR support due to:\n{e!r}") + # reset so that we load without AVX512 below + loaded = False + has_AVX512 = any("AVX512" in x.upper() for x in instruction_sets) if has_AVX512: try: diff --git a/faiss/python/setup.py b/faiss/python/setup.py index 89c7671f7f..90c6846a58 100644 --- a/faiss/python/setup.py +++ b/faiss/python/setup.py @@ -28,6 +28,7 @@ swigfaiss_generic_lib = f"{prefix}_swigfaiss{ext}" swigfaiss_avx2_lib = f"{prefix}_swigfaiss_avx2{ext}" 
swigfaiss_avx512_lib = f"{prefix}_swigfaiss_avx512{ext}" +swigfaiss_avx512_spr_lib = f"{prefix}_swigfaiss_avx512_spr{ext}" callbacks_lib = f"{prefix}libfaiss_python_callbacks{ext}" swigfaiss_sve_lib = f"{prefix}_swigfaiss_sve{ext}" faiss_example_external_module_lib = f"_faiss_example_external_module{ext}" @@ -35,6 +36,7 @@ found_swigfaiss_generic = os.path.exists(swigfaiss_generic_lib) found_swigfaiss_avx2 = os.path.exists(swigfaiss_avx2_lib) found_swigfaiss_avx512 = os.path.exists(swigfaiss_avx512_lib) +found_swigfaiss_avx512_spr = os.path.exists(swigfaiss_avx512_spr_lib) found_callbacks = os.path.exists(callbacks_lib) found_swigfaiss_sve = os.path.exists(swigfaiss_sve_lib) found_faiss_example_external_module_lib = os.path.exists( @@ -42,10 +44,10 @@ ) assert ( - found_swigfaiss_generic or found_swigfaiss_avx2 or found_swigfaiss_avx512 or found_swigfaiss_sve or found_faiss_example_external_module_lib + found_swigfaiss_generic or found_swigfaiss_avx2 or found_swigfaiss_avx512 or found_swigfaiss_avx512_spr or found_swigfaiss_sve or found_faiss_example_external_module_lib ), ( f"Could not find {swigfaiss_generic_lib} or " - f"{swigfaiss_avx2_lib} or {swigfaiss_avx512_lib} or {swigfaiss_sve_lib} or {faiss_example_external_module_lib}. " + f"{swigfaiss_avx2_lib} or {swigfaiss_avx512_lib} or {swigfaiss_avx512_spr_lib} or {swigfaiss_sve_lib} or {faiss_example_external_module_lib}. " f"Faiss may not be compiled yet." 
) @@ -64,6 +66,11 @@ shutil.copyfile("swigfaiss_avx512.py", "faiss/swigfaiss_avx512.py") shutil.copyfile(swigfaiss_avx512_lib, f"faiss/_swigfaiss_avx512{ext}") +if found_swigfaiss_avx512_spr: + print(f"Copying {swigfaiss_avx512_spr_lib}") + shutil.copyfile("swigfaiss_avx512_spr.py", "faiss/swigfaiss_avx512_spr.py") + shutil.copyfile(swigfaiss_avx512_spr_lib, f"faiss/_swigfaiss_avx512_spr{ext}") + if found_callbacks: print(f"Copying {callbacks_lib}") shutil.copyfile(callbacks_lib, f"faiss/{callbacks_lib}") diff --git a/tests/test_contrib.py b/tests/test_contrib.py index a7b0b09155..ba185f92b2 100644 --- a/tests/test_contrib.py +++ b/tests/test_contrib.py @@ -573,8 +573,7 @@ def test_ivf_train_2level(self): # normally 47 / 200 differences ndiff = (Iref != Inew).sum() - self.assertLess(ndiff, 51) - + self.assertLess(ndiff, 53) class TestBigBatchSearch(unittest.TestCase): From ab8cb9cc20415fa3cf2a72a55f2b68e9ea9fea9a Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 23 Dec 2024 10:40:54 -0800 Subject: [PATCH 3/8] Link cuVS Docs (#4084) Summary: Small updates to the ReadMe files. More detailed description in a follow up PR for the wiki. 
Remove the cuvs conda CI checks Pull Request resolved: https://github.com/facebookresearch/faiss/pull/4084 Reviewed By: mengdilin Differential Revision: D67602013 Pulled By: mnorris11 fbshipit-source-id: f7c40440d278f00195bcad2dbdd2187325f40662 --- .github/workflows/build-pull-request.yml | 30 ------------------ INSTALL.md | 39 ++++++++++++++++++------ README.md | 2 +- faiss/gpu/impl/CuvsIVFFlat.cu | 2 ++ faiss/gpu/impl/CuvsIVFPQ.cu | 2 ++ 5 files changed, 34 insertions(+), 41 deletions(-) diff --git a/.github/workflows/build-pull-request.yml b/.github/workflows/build-pull-request.yml index b312c71d37..bc0d2d625a 100644 --- a/.github/workflows/build-pull-request.yml +++ b/.github/workflows/build-pull-request.yml @@ -143,36 +143,6 @@ jobs: fetch-tags: true - name: Build and Package (conda) uses: ./.github/actions/build_conda - linux-x86_64-GPU-CUVS-CUDA11-8-0-conda: - name: Linux x86_64 GPU w/ cuVS conda (CUDA 11.8.0) - runs-on: 4-core-ubuntu-gpu-t4 - env: - CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - uses: ./.github/actions/build_conda - with: - cuvs: "ON" - cuda: "11.8.0" - linux-x86_64-GPU-CUVS-CUDA12-4-0-conda: - name: Linux x86_64 GPU w/ cuVS conda (CUDA 12.4.0) - runs-on: 4-core-ubuntu-gpu-t4 - env: - CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - uses: ./.github/actions/build_conda - with: - cuvs: "ON" - cuda: "12.4.0" windows-x86_64-conda: name: Windows x86_64 (conda) needs: linux-x86_64-cmake diff --git a/INSTALL.md b/INSTALL.md index c5f76d47d5..26b51a80b1 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -6,7 +6,7 @@ pre-release nightly builds. 
- The CPU-only faiss-cpu conda package is currently available on Linux (x86-64 and aarch64), OSX (arm64 only), and Windows (x86-64) - faiss-gpu, containing both CPU and GPU indices, is available on Linux (x86-64 only) for CUDA 11.4 and 12.1 -- faiss-gpu-raft containing both CPU and GPU indices provided by NVIDIA RAFT, is available on Linux (x86-64 only) for CUDA 11.8 and 12.1. +- faiss-gpu-raft [^1] package containing GPU indices provided by [NVIDIA RAFT](https://github.com/rapidsai/raft/) version 24.06, is available on Linux (x86-64 only) for CUDA 11.8 and 12.4. To install the latest stable release: @@ -23,10 +23,9 @@ $ conda install -c pytorch -c nvidia -c rapidsai -c conda-forge faiss-gpu-raft=1 # GPU(+CPU) version using AMD ROCm not yet available ``` -For faiss-gpu, the nvidia channel is required for CUDA, which is not -published in the main anaconda channel. +For faiss-gpu, the nvidia channel is required for CUDA, which is not published in the main anaconda channel. -For faiss-gpu-raft, the nvidia, rapidsai and conda-forge channels are required. +For faiss-gpu-raft, the rapidsai, conda-forge and nvidia channels are required. 
Nightly pre-release packages can be installed as follows: @@ -37,8 +36,11 @@ $ conda install -c pytorch/label/nightly faiss-cpu # GPU(+CPU) version $ conda install -c pytorch/label/nightly -c nvidia faiss-gpu=1.9.0 -# GPU(+CPU) version with NVIDIA RAFT -conda install -c pytorch -c nvidia -c rapidsai -c conda-forge faiss-gpu-raft=1.9.0 pytorch pytorch-cuda numpy +# GPU(+CPU) version with NVIDIA cuVS (package built with CUDA 12.4) +conda install -c pytorch -c rapidsai -c conda-forge -c nvidia pytorch/label/nightly::faiss-gpu-cuvs 'cuda-version>=12.0,<=12.5' + +# GPU(+CPU) version with NVIDIA cuVS (package built with CUDA 11.8) +conda install -c pytorch -c rapidsai -c conda-forge -c nvidia pytorch/label/nightly::faiss-gpu-cuvs 'cuda-version>=11.4,<=11.8' # GPU(+CPU) version using AMD ROCm not yet available ``` @@ -68,7 +70,7 @@ $ conda install -c conda-forge faiss-cpu # GPU version $ conda install -c conda-forge faiss-gpu -# AMD ROCm version not yet available +# NVIDIA cuVS and AMD ROCm version not yet available ``` You can tell which channel your conda packages come from by using `conda list`. @@ -95,6 +97,8 @@ The optional requirements are: - the CUDA toolkit, - for AMD GPUs: - AMD ROCm, +- for using NVIDIA cuVS implementations: + - libcuvs=24.12 - for the python bindings: - python 3, - numpy, @@ -103,6 +107,19 @@ The optional requirements are: Indications for specific configurations are available in the [troubleshooting section of the wiki](https://github.com/facebookresearch/faiss/wiki/Troubleshooting). +### Building with NVIDIA cuVS + +The libcuvs dependency should be installed via conda: +1. With CUDA 12.0 - 12.5: +``` +conda install -c rapidsai -c conda-forge -c nvidia libcuvs=24.12 'cuda-version>=12.0,<=12.5' +``` +2. With CUDA 11.4 - 11.8 +``` +conda install -c rapidsai -c conda-forge -c nvidia libcuvs=24.12 'cuda-version>=11.4,<=11.8' +``` +For more ways to install cuVS 24.12, refer to the [RAPIDS Installation Guide](https://docs.rapids.ai/install). 
+ ## Step 1: invoking CMake ``` shell @@ -118,9 +135,9 @@ Several options can be passed to CMake, among which: values are `ON` and `OFF`), - `-DFAISS_ENABLE_PYTHON=OFF` in order to disable building python bindings (possible values are `ON` and `OFF`), - - `-DFAISS_ENABLE_CUVS=ON` in order to enable building the cuVS implementations - of the IVF-Flat and IVF-PQ GPU-accelerated indices (default is `OFF`, possible - values are `ON` and `OFF`) + - `-DFAISS_ENABLE_CUVS=ON` in order to use the NVIDIA cuVS implementations + of the IVF-Flat, IVF-PQ and [CAGRA](https://arxiv.org/pdf/2308.15136) GPU-accelerated indices (default is `OFF`, possible, values are `ON` and `OFF`). + Note: `-DFAISS_ENABLE_GPU` must be set to `ON` when enabling this option. - `-DBUILD_TESTING=OFF` in order to disable building C++ tests, - `-DBUILD_SHARED_LIBS=ON` in order to build a shared library (possible values are `ON` and `OFF`), @@ -302,3 +319,5 @@ and you can run $ python demos/demo_auto_tune.py ``` to test the GPU code. + +[^1]: The vector search and clustering algorithms in NVIDIA RAFT have been formally migrated to [NVIDIA cuVS](https://github.com/rapidsai/cuvs). This package is being renamed to `faiss-gpu-cuvs` in the next stable release, which will use these GPU implementations from the pre-compiled `libcuvs=24.12` binary. diff --git a/README.md b/README.md index f00f4d7a3c..468ba59ab6 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ The GPU implementation can accept input from either CPU or GPU memory. On a serv ## Installing -Faiss comes with precompiled libraries for Anaconda in Python, see [faiss-cpu](https://anaconda.org/pytorch/faiss-cpu) and [faiss-gpu](https://anaconda.org/pytorch/faiss-gpu). The library is mostly implemented in C++, the only dependency is a [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) implementation. Optional GPU support is provided via CUDA or AMD ROCm, and the Python interface is also optional. It compiles with cmake. 
See [INSTALL.md](INSTALL.md) for details. +Faiss comes with precompiled libraries for Anaconda in Python, see [faiss-cpu](https://anaconda.org/pytorch/faiss-cpu), [faiss-gpu](https://anaconda.org/pytorch/faiss-gpu) and [faiss-gpu-cuvs](https://anaconda.org/pytorch/faiss-gpu-cuvs). The library is mostly implemented in C++, the only dependency is a [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) implementation. Optional GPU support is provided via CUDA or AMD ROCm, and the Python interface is also optional. The backend GPU implementations of NVIDIA [cuVS](https://github.com/rapidsai/cuvs) can also be enabled optionally. It compiles with cmake. See [INSTALL.md](INSTALL.md) for details. ## How Faiss works diff --git a/faiss/gpu/impl/CuvsIVFFlat.cu b/faiss/gpu/impl/CuvsIVFFlat.cu index 0de7100c72..2cccee8605 100644 --- a/faiss/gpu/impl/CuvsIVFFlat.cu +++ b/faiss/gpu/impl/CuvsIVFFlat.cu @@ -291,6 +291,8 @@ void CuvsIVFFlat::searchPreassigned( Tensor& outIndices, bool storePairs) { // TODO: Fill this in! + // Reference issue: https://github.com/facebookresearch/faiss/issues/3243 + FAISS_THROW_MSG("searchPreassigned is not implemented for cuVS index"); } void CuvsIVFFlat::updateQuantizer(Index* quantizer) { diff --git a/faiss/gpu/impl/CuvsIVFPQ.cu b/faiss/gpu/impl/CuvsIVFPQ.cu index 2fc94de0f0..1e2fef225d 100644 --- a/faiss/gpu/impl/CuvsIVFPQ.cu +++ b/faiss/gpu/impl/CuvsIVFPQ.cu @@ -229,6 +229,8 @@ void CuvsIVFPQ::searchPreassigned( Tensor& outIndices, bool storePairs) { // TODO: Fill this in! 
+ // Reference issue: https://github.com/facebookresearch/faiss/issues/3243 + FAISS_THROW_MSG("searchPreassigned is not implemented for cuVS index"); } size_t CuvsIVFPQ::getGpuListEncodingSize_(idx_t listId) { From 3c8dc4194907e9b911551d5a009468106f8b9c7f Mon Sep 17 00:00:00 2001 From: Satyendra Mishra Date: Mon, 23 Dec 2024 11:27:51 -0800 Subject: [PATCH 4/8] Set KnnDescriptor.desc_name in the Benchmarking core framework in FAISS like other descriptors (#4109) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/4109 Set KnnDescriptor.desc_name in the Benchmarking core framework in FAISS like other descriptors Reviewed By: mnorris11 Differential Revision: D67539874 fbshipit-source-id: 09ffb76296f466ae2d3b0eb551917f429bc7300f --- benchs/bench_fw/descriptors.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchs/bench_fw/descriptors.py b/benchs/bench_fw/descriptors.py index f6164f54e2..8b1d65a505 100644 --- a/benchs/bench_fw/descriptors.py +++ b/benchs/bench_fw/descriptors.py @@ -341,6 +341,8 @@ def __hash__(self): return hash(str(self)) def get_name(self): + if self.desc_name is not None: + return self.desc_name name = self.index_desc.get_name() name += IndexBaseDescriptor.param_dict_to_name(self.search_params) name += self.query_dataset.get_filename(KnnDescriptor.FILENAME_PREFIX) @@ -350,6 +352,7 @@ def get_name(self): name += "rec." else: name += "knn." 
+ self.desc_name = name return name def flat_name(self): From ab479a1ca75142606ed94683d538dc358e28e69d Mon Sep 17 00:00:00 2001 From: Junjie Qi Date: Thu, 26 Dec 2024 21:33:45 -0800 Subject: [PATCH 5/8] enable quiet mode for conda install (#4112) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/4112 Reviewed By: gtwang01 Differential Revision: D67662530 Pulled By: junjieqi fbshipit-source-id: 576198d79e947009025641bfa94b3ad9ea2fe1b4 --- .github/actions/build_conda/action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/build_conda/action.yml b/.github/actions/build_conda/action.yml index 95c2d60c1f..ff860007b2 100644 --- a/.github/actions/build_conda/action.yml +++ b/.github/actions/build_conda/action.yml @@ -34,8 +34,8 @@ runs: - name: Install conda build tools shell: ${{ steps.choose_shell.outputs.shell }} run: | - conda install -y "conda!=24.11.0" - conda install -y "conda-build!=24.11.0" + conda install -y -q "conda!=24.11.0" + conda install -y -q "conda-build!=24.11.0" - name: Fix CI failure shell: ${{ steps.choose_shell.outputs.shell }} if: runner.os != 'Windows' From 0cbc2a885cde923d80c4bf9c9d6f4d81665f3f64 Mon Sep 17 00:00:00 2001 From: Mulugeta Mammo Date: Fri, 27 Dec 2024 09:44:28 -0800 Subject: [PATCH 6/8] Use _mm512_popcnt_epi64 to speedup hamming distance evaluation. (#4020) Summary: The `_mm512_popcnt_epi64` intrinsic is used to accelerate Hamming distance calculations in `HammingComputerDefault` and `HammingComputer64`. Benchmarking with [bench_hamming_computer](https://github.com/facebookresearch/faiss/blob/main/benchs/bench_hamming_computer.cpp) on AWS [r7i](https://aws.amazon.com/ec2/instance-types/r7i/) instance shows a performance improvement of up to 30% compared to AVX-2. 
This PR depends on [PR#4025](https://github.com/facebookresearch/faiss/pull/4025) Pull Request resolved: https://github.com/facebookresearch/faiss/pull/4020 Reviewed By: junjieqi Differential Revision: D67650183 Pulled By: mengdilin fbshipit-source-id: 17e5b68570dced1fea0b885dd4e67c17dfc7bece --- faiss/CMakeLists.txt | 1 + faiss/utils/hamming_distance/avx512-inl.h | 490 ++++++++++++++++++++++ faiss/utils/hamming_distance/hamdis-inl.h | 3 + 3 files changed, 494 insertions(+) create mode 100644 faiss/utils/hamming_distance/avx512-inl.h diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index 535710b829..6e9eb883a7 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -230,6 +230,7 @@ set(FAISS_HEADERS utils/hamming_distance/hamdis-inl.h utils/hamming_distance/neon-inl.h utils/hamming_distance/avx2-inl.h + utils/hamming_distance/avx512-inl.h ) if(NOT WIN32) diff --git a/faiss/utils/hamming_distance/avx512-inl.h b/faiss/utils/hamming_distance/avx512-inl.h new file mode 100644 index 0000000000..2b2302e8d5 --- /dev/null +++ b/faiss/utils/hamming_distance/avx512-inl.h @@ -0,0 +1,490 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef HAMMING_AVX512_INL_H +#define HAMMING_AVX512_INL_H + +// AVX512 version +// The _mm512_popcnt_epi64 intrinsic is used to accelerate Hamming distance +// calculations in HammingComputerDefault and HammingComputer64. This intrinsic +// is not available in the default FAISS avx512 build mode but is only +// available in the avx512_spr build mode, which targets Intel(R) Sapphire +// Rapids. 
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include <faiss/impl/platform_macros.h>
+
+#include <immintrin.h>
+
+namespace faiss {
+
+/* Elementary Hamming distance computation: unoptimized */
+template <size_t nbits, typename T>
+inline T hamming(const uint8_t* bs1, const uint8_t* bs2) {
+    const size_t nbytes = nbits / 8;
+    size_t i;
+    T h = 0;
+    for (i = 0; i < nbytes; i++) {
+        h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]];
+    }
+    return h;
+}
+
+/* Hamming distances for multiples of 64 bits */
+template <size_t nbits>
+inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) {
+    const size_t nwords = nbits / 64;
+    size_t i;
+    hamdis_t h = 0;
+    for (i = 0; i < nwords; i++) {
+        h += popcount64(bs1[i] ^ bs2[i]);
+    }
+    return h;
+}
+
+/* specialized (optimized) functions */
+template <>
+inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) {
+    return popcount64(pa[0] ^ pb[0]);
+}
+
+template <>
+inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) {
+    return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]);
+}
+
+template <>
+inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) {
+    return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) +
+            popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]);
+}
+
+/* Hamming distances for multiple of 64 bits */
+inline hamdis_t hamming(
+        const uint64_t* bs1,
+        const uint64_t* bs2,
+        size_t nwords) {
+    hamdis_t h = 0;
+    for (size_t i = 0; i < nwords; i++) {
+        h += popcount64(bs1[i] ^ bs2[i]);
+    }
+    return h;
+}
+
+/******************************************************************
+ * The HammingComputer series of classes compares a single code of
+ * size 4 to 32 to incoming codes. They are intended for use as a
+ * template class where it would be inefficient to switch on the code
+ * size in the inner loop. Hopefully the compiler will inline the
+ * hamming() functions and put the a0, a1, ... in registers.
+ ******************************************************************/ + +struct HammingComputer4 { + uint32_t a0; + + HammingComputer4() {} + + HammingComputer4(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, int code_size) { + assert(code_size == 4); + a0 = *(uint32_t*)a; + } + + inline int hamming(const uint8_t* b) const { + return popcount64(*(uint32_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 4; + } +}; + +struct HammingComputer8 { + uint64_t a0; + + HammingComputer8() {} + + HammingComputer8(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, int code_size) { + assert(code_size == 8); + a0 = *(uint64_t*)a; + } + + inline int hamming(const uint8_t* b) const { + return popcount64(*(uint64_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +struct HammingComputer16 { + uint64_t a0, a1; + + HammingComputer16() {} + + HammingComputer16(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 16); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +// when applied to an array, 1/2 of the 64-bit accesses are unaligned. +// This incurs a penalty of ~10% wrt. fully aligned accesses. 
+struct HammingComputer20 { + uint64_t a0, a1; + uint32_t a2; + + HammingComputer20() {} + + HammingComputer20(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 20); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(*(uint32_t*)(b + 2) ^ a2); + } + + inline static constexpr int get_code_size() { + return 20; + } +}; + +struct HammingComputer32 { + uint64_t a0, a1, a2, a3; + + HammingComputer32() {} + + HammingComputer32(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 32); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +struct HammingComputer64 { + uint64_t a0, a1, a2, a3, a4, a5, a6, a7; + const uint64_t* a; + + HammingComputer64() {} + + HammingComputer64(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 64); + a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a4 = a[4]; + a5 = a[5]; + a6 = a[6]; + a7 = a[7]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; +#ifdef __AVX512VPOPCNTDQ__ + __m512i vxor = + _mm512_xor_si512(_mm512_loadu_si512(a), _mm512_loadu_si512(b)); + __m512i vpcnt = _mm512_popcnt_epi64(vxor); + // reduce performs better than adding the lower and higher parts + return _mm512_reduce_add_epi32(vpcnt); +#else + return popcount64(b[0] ^ a0) + 
popcount64(b[1] ^ a1) +
+                popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) +
+                popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) +
+                popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7);
+#endif
+    }
+
+    inline static constexpr int get_code_size() {
+        return 64;
+    }
+};
+
+struct HammingComputerDefault {
+    const uint8_t* a8;
+    int quotient8;
+    int remainder8;
+
+    HammingComputerDefault() {}
+
+    HammingComputerDefault(const uint8_t* a8, int code_size) {
+        set(a8, code_size);
+    }
+
+    void set(const uint8_t* a8_2, int code_size) {
+        this->a8 = a8_2;
+        quotient8 = code_size / 8;
+        remainder8 = code_size % 8;
+    }
+
+    int hamming(const uint8_t* b8) const {
+        int accu = 0;
+
+        const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a8);
+        const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b8);
+
+        int i = 0;
+#ifdef __AVX512VPOPCNTDQ__
+        int quotient64 = quotient8 / 8;
+        for (; i < quotient64; ++i) {
+            __m512i vxor = _mm512_xor_si512(
+                    _mm512_loadu_si512(&a64[i * 8]),
+                    _mm512_loadu_si512(&b64[i * 8]));
+            __m512i vpcnt = _mm512_popcnt_epi64(vxor);
+            // reduce performs better than adding the lower and higher parts
+            accu += _mm512_reduce_add_epi32(vpcnt);
+        }
+        i *= 8;
+#endif
+        int len = quotient8 - i;
+        switch (len & 7) {
+            default:
+                while (len > 7) {
+                    len -= 8;
+                    accu += popcount64(a64[i] ^ b64[i]);
+                    i++;
+                    [[fallthrough]];
+                    case 7:
+                        accu += popcount64(a64[i] ^ b64[i]);
+                        i++;
+                        [[fallthrough]];
+                    case 6:
+                        accu += popcount64(a64[i] ^ b64[i]);
+                        i++;
+                        [[fallthrough]];
+                    case 5:
+                        accu += popcount64(a64[i] ^ b64[i]);
+                        i++;
+                        [[fallthrough]];
+                    case 4:
+                        accu += popcount64(a64[i] ^ b64[i]);
+                        i++;
+                        [[fallthrough]];
+                    case 3:
+                        accu += popcount64(a64[i] ^ b64[i]);
+                        i++;
+                        [[fallthrough]];
+                    case 2:
+                        accu += popcount64(a64[i] ^ b64[i]);
+                        i++;
+                        [[fallthrough]];
+                    case 1:
+                        accu += popcount64(a64[i] ^ b64[i]);
+                        i++;
+                }
+        }
+        if (remainder8) {
+            const uint8_t* a = a8 + 8 * quotient8;
+            const uint8_t* b = b8 + 8 * quotient8;
+            switch (remainder8) {
+                case 7:
+                    accu += hamdis_tab_ham_bytes[a[6] ^ b[6]];
+
[[fallthrough]]; + case 6: + accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; + [[fallthrough]]; + case 5: + accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; + [[fallthrough]]; + case 4: + accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; + [[fallthrough]]; + case 3: + accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; + [[fallthrough]]; + case 2: + accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; + [[fallthrough]]; + case 1: + accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; + [[fallthrough]]; + default: + break; + } + } + + return accu; + } + + inline int get_code_size() const { + return quotient8 * 8 + remainder8; + } +}; + +/*************************************************************************** + * generalized Hamming = number of bytes that are different between + * two codes. + ***************************************************************************/ + +inline int generalized_hamming_64(uint64_t a) { + a |= a >> 1; + a |= a >> 2; + a |= a >> 4; + a &= 0x0101010101010101UL; + return popcount64(a); +} + +struct GenHammingComputer8 { + uint64_t a0; + + GenHammingComputer8(const uint8_t* a, int code_size) { + assert(code_size == 8); + a0 = *(uint64_t*)a; + } + + inline int hamming(const uint8_t* b) const { + return generalized_hamming_64(*(uint64_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +// I'm not sure whether this version is faster of slower, tbh +// todo: test on different CPUs +struct GenHammingComputer16 { + __m128i a; + + GenHammingComputer16(const uint8_t* a8, int code_size) { + assert(code_size == 16); + a = _mm_loadu_si128((const __m128i_u*)a8); + } + + inline int hamming(const uint8_t* b8) const { + const __m128i b = _mm_loadu_si128((const __m128i_u*)b8); + const __m128i cmp = _mm_cmpeq_epi8(a, b); + const auto movemask = _mm_movemask_epi8(cmp); + return 16 - popcount32(movemask); + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +struct GenHammingComputer32 { + __m256i a; + + GenHammingComputer32(const uint8_t* 
a8, int code_size) { + assert(code_size == 32); + a = _mm256_loadu_si256((const __m256i_u*)a8); + } + + inline int hamming(const uint8_t* b8) const { + const __m256i b = _mm256_loadu_si256((const __m256i_u*)b8); + const __m256i cmp = _mm256_cmpeq_epi8(a, b); + const uint32_t movemask = _mm256_movemask_epi8(cmp); + return 32 - popcount32(movemask); + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +// A specialized version might be needed for the very long +// GenHamming code_size. In such a case, one may accumulate +// counts using _mm256_sub_epi8 and then compute a horizontal +// sum (using _mm256_sad_epu8, maybe, in blocks of no larger +// than 256 * 32 bytes). + +struct GenHammingComputerM8 { + const uint64_t* a; + int n; + + GenHammingComputerM8(const uint8_t* a8, int code_size) { + assert(code_size % 8 == 0); + a = (uint64_t*)a8; + n = code_size / 8; + } + + int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + int accu = 0; + + int i = 0; + int n4 = (n / 4) * 4; + for (; i < n4; i += 4) { + const __m256i av = _mm256_loadu_si256((const __m256i_u*)(a + i)); + const __m256i bv = _mm256_loadu_si256((const __m256i_u*)(b + i)); + const __m256i cmp = _mm256_cmpeq_epi8(av, bv); + const uint32_t movemask = _mm256_movemask_epi8(cmp); + accu += 32 - popcount32(movemask); + } + + for (; i < n; i++) + accu += generalized_hamming_64(a[i] ^ b[i]); + return accu; + } + + inline int get_code_size() const { + return n * 8; + } +}; + +} // namespace faiss + +#endif diff --git a/faiss/utils/hamming_distance/hamdis-inl.h b/faiss/utils/hamming_distance/hamdis-inl.h index eac7054317..0c9ac1ba63 100644 --- a/faiss/utils/hamming_distance/hamdis-inl.h +++ b/faiss/utils/hamming_distance/hamdis-inl.h @@ -16,6 +16,9 @@ #ifdef __aarch64__ // ARM compilers may produce inoptimal code for Hamming distance somewhy. 
#include +#elif __AVX512F__ +// offers better performance where __AVX512VPOPCNTDQ__ is supported +#include #elif __AVX2__ // better versions for GenHammingComputer #include From 9590ad27460f65fe3dae3ba61d8b5e3e8d03265f Mon Sep 17 00:00:00 2001 From: Maria Lomeli Date: Mon, 6 Jan 2025 09:48:32 -0800 Subject: [PATCH 7/8] PQ with pytorch (#4116) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/4116 This diff implements Product Quantization using Pytorch only. Reviewed By: mdouze Differential Revision: D67766798 fbshipit-source-id: fe2d44a674fc2056f7e2082e9765052c98fdc8f8 --- contrib/torch/clustering.py | 1 + contrib/torch/quantization.py | 61 +++++++++++++++++++++++++++++------ tests/torch_test_contrib.py | 33 ++++++++++++++++--- 3 files changed, 82 insertions(+), 13 deletions(-) diff --git a/contrib/torch/clustering.py b/contrib/torch/clustering.py index 7438534605..8ff5cb24c3 100644 --- a/contrib/torch/clustering.py +++ b/contrib/torch/clustering.py @@ -13,6 +13,7 @@ # the kmeans can produce both torch and numpy centroids from faiss.contrib.clustering import kmeans + class DatasetAssign: """Wrapper for a tensor that offers a function to assign the vectors to centroids. All other implementations offer the same interface""" diff --git a/contrib/torch/quantization.py b/contrib/torch/quantization.py index a1d8f7dd8a..2ae6599a0c 100644 --- a/contrib/torch/quantization.py +++ b/contrib/torch/quantization.py @@ -7,33 +7,47 @@ This contrib module contains Pytorch code for quantization. 
""" -import numpy as np import torch import faiss - -from faiss.contrib import torch_utils +import math +from faiss.contrib.torch import clustering +# the kmeans can produce both torch and numpy centroids class Quantizer: def __init__(self, d, code_size): + """ + d: dimension of vectors + code_size: nb of bytes of the code (per vector) + """ self.d = d self.code_size = code_size def train(self, x): + """ + takes a n-by-d array and peforms training + """ pass def encode(self, x): + """ + takes a n-by-d float array, encodes to an n-by-code_size uint8 array + """ pass - def decode(self, x): + def decode(self, codes): + """ + takes a n-by-code_size uint8 array, returns a n-by-d array + """ pass class VectorQuantizer(Quantizer): def __init__(self, d, k): - code_size = int(torch.ceil(torch.log2(k) / 8)) + + code_size = int(math.ceil(torch.log2(k) / 8)) Quantizer.__init__(d, code_size) self.k = k @@ -42,12 +56,41 @@ def train(self, x): class ProductQuantizer(Quantizer): - def __init__(self, d, M, nbits): - code_size = int(torch.ceil(M * nbits / 8)) - Quantizer.__init__(d, code_size) + """ M: number of subvectors, d%M == 0 + nbits: number of bits that each vector is encoded into + """ + assert d % M == 0 + assert nbits == 8 # todo: implement other nbits values + code_size = int(math.ceil(M * nbits / 8)) + Quantizer.__init__(self, d, code_size) self.M = M self.nbits = nbits + self.code_size = code_size def train(self, x): - pass + nc = 2 ** self.nbits + sd = self.d // self.M + dev = x.device + dtype = x.dtype + self.codebook = torch.zeros((self.M, nc, sd), device=dev, dtype=dtype) + for m in range(self.M): + xsub = x[:, m * self.d // self.M: (m + 1) * self.d // self.M] + data = clustering.DatasetAssign(xsub.contiguous()) + self.codebook[m] = clustering.kmeans(2 ** self.nbits, data) + + def encode(self, x): + codes = torch.zeros((x.shape[0], self.code_size), dtype=torch.uint8) + for m in range(self.M): + xsub = x[:, m * self.d // self.M:(m + 1) * self.d // self.M] + _, I = 
faiss.knn(xsub.contiguous(), self.codebook[m], 1) + codes[:, m] = I.ravel() + return codes + + def decode(self, codes): + idxs = [codes[:, m].long() for m in range(self.M)] + vectors = [self.codebook[m, idxs[m], :] for m in range(self.M)] + stacked_vectors = torch.stack(vectors, dim=1) + cbd = self.codebook.shape[-1] + x_rec = stacked_vectors.reshape(-1, cbd * self.M) + return x_rec diff --git a/tests/torch_test_contrib.py b/tests/torch_test_contrib.py index 26c381b3cc..3eb6c6c43c 100644 --- a/tests/torch_test_contrib.py +++ b/tests/torch_test_contrib.py @@ -4,13 +4,14 @@ # LICENSE file in the root directory of this source tree. import torch # usort: skip -import unittest # usort: skip -import numpy as np # usort: skip +import unittest # usort: skip +import numpy as np # usort: skip -import faiss # usort: skip +import faiss # usort: skip import faiss.contrib.torch_utils # usort: skip from faiss.contrib import datasets -from faiss.contrib.torch import clustering +from faiss.contrib.torch import clustering, quantization + @@ -400,3 +401,27 @@ def test_python_kmeans(self): # 33498.332 33380.477 # print(err, err2) 1/0 self.assertLess(err2, err * 1.1) + + +class TestQuantization(unittest.TestCase): + def test_python_product_quantization(self): + """ Test the python implementation of product quantization """ + d = 64 + n = 10000 + cs = 4 + nbits = 8 + M = 4 + x = np.random.random(size=(n, d)).astype('float32') + pq = faiss.ProductQuantizer(d, cs, nbits) + pq.train(x) + codes = pq.compute_codes(x) + x2 = pq.decode(codes) + diff = ((x - x2)**2).sum() + # vs pure pytorch impl + xt = torch.from_numpy(x) + my_pq = quantization.ProductQuantizer(d, M, nbits) + my_pq.train(xt) + my_codes = my_pq.encode(xt) + xt2 = my_pq.decode(my_codes) + my_diff = ((xt - xt2)**2).sum() + self.assertLess(abs(diff - my_diff), 100) From 162e6ce1cd3e4848e6a934c93653edf39dcbe47a Mon Sep 17 00:00:00 2001 From: Alexandr Guzhva Date: Mon, 6 Jan 2025 13:23:22 -0800 Subject: [PATCH 8/8] add 
range_search() to IndexRefine (#4022)

Summary:
This is very convenient to have `range_search()` in `IndexRefine`.

Unlike the plain `search()` method, `range_search()` just reevaluates the computed distances from the baseline index. The labels are not re-sorted according to new distances, because this is not listed as a requirement in a method description
https://github.com/facebookresearch/faiss/blob/adb188411a98c3af5b7295c7016e5f46fee9eb07/faiss/Index.h#L150-L161
https://github.com/facebookresearch/faiss/blob/adb188411a98c3af5b7295c7016e5f46fee9eb07/faiss/impl/AuxIndexStructures.h#L35

Pull Request resolved: https://github.com/facebookresearch/faiss/pull/4022

Reviewed By: mnorris11

Differential Revision: D66116082

Pulled By: gtwang01

fbshipit-source-id: 915aca2570d5863c876c9497d4c885e270b9b220
---
 faiss/IndexRefine.cpp | 39 ++++++++++++++++++++++++++++++++
 faiss/IndexRefine.h   |  7 ++++++
 tests/test_refine.py  | 52 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/faiss/IndexRefine.cpp b/faiss/IndexRefine.cpp
index 8bc429a5e9..6f1f588e2e 100644
--- a/faiss/IndexRefine.cpp
+++ b/faiss/IndexRefine.cpp
@@ -166,6 +166,45 @@ void IndexRefine::search(
     }
 }
 
+void IndexRefine::range_search(
+        idx_t n,
+        const float* x,
+        float radius,
+        RangeSearchResult* result,
+        const SearchParameters* params_in) const {
+    const IndexRefineSearchParameters* params = nullptr;
+    if (params_in) {
+        params = dynamic_cast<const IndexRefineSearchParameters*>(params_in);
+        FAISS_THROW_IF_NOT_MSG(
+                params, "IndexRefine params have incorrect type");
+    }
+
+    SearchParameters* base_index_params =
+            (params != nullptr) ?
params->base_index_params : nullptr; + + base_index->range_search(n, x, radius, result, base_index_params); + +#pragma omp parallel if (n > 1) + { + std::unique_ptr dc( + refine_index->get_distance_computer()); + +#pragma omp for + for (idx_t i = 0; i < n; i++) { + dc->set_query(x + i * d); + + // reevaluate distances + const size_t idx_start = result->lims[i]; + const size_t idx_end = result->lims[i + 1]; + + for (size_t j = idx_start; j < idx_end; j++) { + const auto label = result->labels[j]; + result->distances[j] = (*dc)(label); + } + } + } +} + void IndexRefine::reconstruct(idx_t key, float* recons) const { refine_index->reconstruct(key, recons); } diff --git a/faiss/IndexRefine.h b/faiss/IndexRefine.h index 9ad4e4be29..255271695f 100644 --- a/faiss/IndexRefine.h +++ b/faiss/IndexRefine.h @@ -54,6 +54,13 @@ struct IndexRefine : Index { idx_t* labels, const SearchParameters* params = nullptr) const override; + void range_search( + idx_t n, + const float* x, + float radius, + RangeSearchResult* result, + const SearchParameters* params = nullptr) const override; + // reconstruct is routed to the refine_index void reconstruct(idx_t key, float* recons) const override; diff --git a/tests/test_refine.py b/tests/test_refine.py index f272584245..9b9ce73d0d 100644 --- a/tests/test_refine.py +++ b/tests/test_refine.py @@ -8,7 +8,7 @@ import unittest import faiss -from faiss.contrib import datasets +from faiss.contrib import datasets, evaluation class TestDistanceComputer(unittest.TestCase): @@ -119,3 +119,53 @@ def test_rflat(self): def test_refine_sq8(self): # this case uses the IndexRefine class self.do_test("IVF8,PQ2x4np,Refine(SQ8)") + + +class TestIndexRefineRangeSearch(unittest.TestCase): + + def do_test(self, factory_string): + d = 32 + radius = 8 + + ds = datasets.SyntheticDataset(d, 1024, 512, 256) + + index = faiss.index_factory(d, factory_string) + index.train(ds.get_train()) + index.add(ds.get_database()) + xq = ds.get_queries() + xb = ds.get_database() + + 
# perform a range_search + lims_1, D1, I1 = index.range_search(xq, radius) + + # create a baseline (FlatL2) + index_flat = faiss.IndexFlatL2(d) + index_flat.train(ds.get_train()) + index_flat.add(ds.get_database()) + + lims_ref, Dref, Iref = index_flat.range_search(xq, radius) + + # add a refine index on top of the index + index_r = faiss.IndexRefine(index, index_flat) + lims_2, D2, I2 = index_r.range_search(xq, radius) + + # validate: refined range_search() keeps indices untouched + precision_1, recall_1 = evaluation.range_PR(lims_ref, Iref, lims_1, I1) + + precision_2, recall_2 = evaluation.range_PR(lims_ref, Iref, lims_2, I2) + + self.assertAlmostEqual(recall_1, recall_2) + + # validate: refined range_search() updates distances, and new distances are correct L2 distances + for iq in range(0, ds.nq): + start_lim = lims_2[iq] + end_lim = lims_2[iq + 1] + for i_lim in range(start_lim, end_lim): + idx = I2[i_lim] + l2_dis = np.sum(np.square(xq[iq : iq + 1,] - xb[idx : idx + 1,])) + + self.assertAlmostEqual(l2_dis, D2[i_lim], places=4) + + + def test_refine_1(self): + self.do_test("SQ4")