From 3a1ad461ff9e75349a3ac3d8eafb44883db4d824 Mon Sep 17 00:00:00 2001 From: Artem Labazov Date: Fri, 20 Dec 2024 15:30:33 -0800 Subject: [PATCH 1/8] Conditionally compile extras like benchmarks and demos (#4094) Summary: While embedding faiss as a subproject, these targets are quite useless. This PR adss a new `FAISS_ENABLE_EXTRAS` CMake option to exclude them, default is ON to preserve current behaviour. Pull Request resolved: https://github.com/facebookresearch/faiss/pull/4094 Reviewed By: satymish Differential Revision: D67345666 Pulled By: gtwang01 fbshipit-source-id: c8d9ef14b21816e45f681412bda257192f5365d0 --- CMakeLists.txt | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6bd1ad3eaf..50843cb716 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,6 +64,7 @@ option(FAISS_ENABLE_CUVS "Enable cuVS for GPU indexes." OFF) option(FAISS_ENABLE_ROCM "Enable ROCm for GPU indexes." OFF) option(FAISS_ENABLE_PYTHON "Build Python extension." ON) option(FAISS_ENABLE_C_API "Build C API." OFF) +option(FAISS_ENABLE_EXTRAS "Build extras like benchmarks and demos" ON) option(FAISS_USE_LTO "Enable Link-Time optimization" OFF) if(FAISS_ENABLE_GPU) @@ -103,10 +104,11 @@ if(FAISS_ENABLE_C_API) add_subdirectory(c_api) endif() -add_subdirectory(demos) -add_subdirectory(benchs) -add_subdirectory(tutorial/cpp) - +if(FAISS_ENABLE_EXTRAS) + add_subdirectory(demos) + add_subdirectory(benchs) + add_subdirectory(tutorial/cpp) +endif() # CTest must be included in the top level to enable `make test` target. include(CTest) From 3beb07b198a83450d6ed65c7bf9f4e36ef2a97c9 Mon Sep 17 00:00:00 2001 From: Mulugeta Mammo Date: Mon, 23 Dec 2024 08:56:26 -0800 Subject: [PATCH 2/8] Add a new architecture mode: 'avx512_spr'. 
(#4025) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: This PR adds a new architecture mode to support the new extensions to AVX512, namely [AVX512-FP16](https://networkbuilders.intel.com/solutionslibrary/intel-avx-512-fp16-instruction-set-for-intel-xeon-processor-based-products-technology-guide), which have been available since IntelĀ® Sapphire Rapids. This PR is a prerequisite for [PR#4020](https://github.com/facebookresearch/faiss/pull/4020) that speeds up hamming distance evaluations. Pull Request resolved: https://github.com/facebookresearch/faiss/pull/4025 Reviewed By: pankajsingh88 Differential Revision: D67524575 Pulled By: mengdilin fbshipit-source-id: f3a09943b062d720b241f95aef2f390923ffd779 --- .github/workflows/build-pull-request.yml | 11 ++++++++ .gitignore | 1 + CMakeLists.txt | 2 +- INSTALL.md | 8 +++++- cmake/link_to_faiss_lib.cmake | 13 ++++++++- faiss/CMakeLists.txt | 35 ++++++++++++++++++++++-- faiss/gpu/CMakeLists.txt | 2 ++ faiss/python/CMakeLists.txt | 33 ++++++++++++++++++++++ faiss/python/loader.py | 15 ++++++++++ faiss/python/setup.py | 11 ++++++-- tests/test_contrib.py | 3 +- 11 files changed, 125 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build-pull-request.yml b/.github/workflows/build-pull-request.yml index f94077513d..b312c71d37 100644 --- a/.github/workflows/build-pull-request.yml +++ b/.github/workflows/build-pull-request.yml @@ -60,6 +60,17 @@ jobs: uses: ./.github/actions/build_cmake with: opt_level: avx512 + linux-x86_64-AVX512_SPR-cmake: + name: Linux x86_64 AVX512_SPR (cmake) + needs: linux-x86_64-cmake + runs-on: faiss-aws-m7i.large + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Build and Test (cmake) + uses: ./.github/actions/build_cmake + with: + opt_level: avx512_spr linux-x86_64-GPU-cmake: name: Linux x86_64 GPU (cmake) needs: linux-x86_64-cmake diff --git a/.gitignore b/.gitignore index 01b98f0a9c..52a99e8fc0 100644 --- a/.gitignore 
+++ b/.gitignore @@ -19,4 +19,5 @@ /tests/gtest/ faiss/python/swigfaiss_avx2.swig faiss/python/swigfaiss_avx512.swig +faiss/python/swigfaiss_avx512_spr.swig faiss/python/swigfaiss_sve.swig diff --git a/CMakeLists.txt b/CMakeLists.txt index 50843cb716..fc6c0d55f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,7 +57,7 @@ set(CMAKE_CXX_STANDARD 17) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") -# Valid values are "generic", "avx2", "avx512", "sve". +# Valid values are "generic", "avx2", "avx512", "avx512_spr", "sve". option(FAISS_OPT_LEVEL "" "generic") option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON) option(FAISS_ENABLE_CUVS "Enable cuVS for GPU indexes." OFF) diff --git a/INSTALL.md b/INSTALL.md index e16de484fe..c5f76d47d5 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -131,7 +131,7 @@ Several options can be passed to CMake, among which: optimization options (enables `-O3` on gcc for instance), - `-DFAISS_OPT_LEVEL=avx2` in order to enable the required compiler flags to generate code using optimized SIMD/Vector instructions. Possible values are below: - - On x86-64, `generic`, `avx2` and `avx512`, by increasing order of optimization, + - On x86-64, `generic`, `avx2`, 'avx512', and `avx512_spr` (for avx512 features available since Intel(R) Sapphire Rapids), by increasing order of optimization, - On aarch64, `generic` and `sve`, by increasing order of optimization, - `-DFAISS_USE_LTO=ON` in order to enable [Link-Time Optimization](https://en.wikipedia.org/wiki/Link-time_optimization) (default is `OFF`, possible values are `ON` and `OFF`). - BLAS-related options: @@ -180,6 +180,12 @@ For AVX512: $ make -C build -j faiss_avx512 ``` +For AVX512 features available since Intel(R) Sapphire Rapids. + +``` shell +$ make -C build -j faiss_avx512_spr +``` + This will ensure the creation of neccesary files when building and installing the python package. 
## Step 3: Building the python bindings (optional) diff --git a/cmake/link_to_faiss_lib.cmake b/cmake/link_to_faiss_lib.cmake index 939ed61fc9..e04453ddc3 100644 --- a/cmake/link_to_faiss_lib.cmake +++ b/cmake/link_to_faiss_lib.cmake @@ -5,7 +5,7 @@ # LICENSE file in the root directory of this source tree. function(link_to_faiss_lib target) - if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512" AND NOT FAISS_OPT_LEVEL STREQUAL "sve") + if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512_spr" AND NOT FAISS_OPT_LEVEL STREQUAL "sve") target_link_libraries(${target} PRIVATE faiss) endif() @@ -27,6 +27,17 @@ function(link_to_faiss_lib target) target_link_libraries(${target} PRIVATE faiss_avx512) endif() + if(FAISS_OPT_LEVEL STREQUAL "avx512_spr") + if(NOT WIN32) + # Architecture mode to support AVX512 extensions available since Intel (R) Sapphire Rapids. + # Ref: https://networkbuilders.intel.com/solutionslibrary/intel-avx-512-fp16-instruction-set-for-intel-xeon-processor-based-products-technology-guide + target_compile_options(${target} PRIVATE $<$:-march=sapphirerapids -mtune=sapphirerapids>) + else() + target_compile_options(${target} PRIVATE $<$:/arch:AVX512>) + endif() + target_link_libraries(${target} PRIVATE faiss_avx512_spr) + endif() + if(FAISS_OPT_LEVEL STREQUAL "sve") if(NOT WIN32) if("${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG} " MATCHES "(^| )-march=native") diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index 5e91038eb7..535710b829 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -243,7 +243,7 @@ set(FAISS_HEADERS ${FAISS_HEADERS} PARENT_SCOPE) add_library(faiss ${FAISS_SRC}) add_library(faiss_avx2 ${FAISS_SRC}) -if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512") +if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512_spr") 
set_target_properties(faiss_avx2 PROPERTIES EXCLUDE_FROM_ALL TRUE) endif() if(NOT WIN32) @@ -272,6 +272,20 @@ else() add_compile_options(/bigobj) endif() +add_library(faiss_avx512_spr ${FAISS_SRC}) +if(NOT FAISS_OPT_LEVEL STREQUAL "avx512_spr") + set_target_properties(faiss_avx512_spr PROPERTIES EXCLUDE_FROM_ALL TRUE) +endif() +if(NOT WIN32) + # Architecture mode to support AVX512 extensions available since Intel(R) Sapphire Rapids. + # Ref: https://networkbuilders.intel.com/solutionslibrary/intel-avx-512-fp16-instruction-set-for-intel-xeon-processor-based-products-technology-guide + target_compile_options(faiss_avx512_spr PRIVATE $<$:-march=sapphirerapids -mtune=sapphirerapids>) +else() + target_compile_options(faiss_avx512_spr PRIVATE $<$:/arch:AVX512>) + # we need bigobj for the swig wrapper + add_compile_options(/bigobj) +endif() + add_library(faiss_sve ${FAISS_SRC}) if(NOT FAISS_OPT_LEVEL STREQUAL "sve") set_target_properties(faiss_sve PROPERTIES EXCLUDE_FROM_ALL TRUE) @@ -307,10 +321,13 @@ target_include_directories(faiss_avx2 PUBLIC target_include_directories(faiss_avx512 PUBLIC $) # Handle `#include `. +target_include_directories(faiss_avx512_spr PUBLIC + $) +# Handle `#include `. 
target_include_directories(faiss_sve PUBLIC $) -set_target_properties(faiss faiss_avx2 faiss_avx512 faiss_sve PROPERTIES +set_target_properties(faiss faiss_avx2 faiss_avx512 faiss_avx512_spr faiss_sve PROPERTIES POSITION_INDEPENDENT_CODE ON WINDOWS_EXPORT_ALL_SYMBOLS ON ) @@ -319,6 +336,7 @@ if(WIN32) target_compile_definitions(faiss PRIVATE FAISS_MAIN_LIB) target_compile_definitions(faiss_avx2 PRIVATE FAISS_MAIN_LIB) target_compile_definitions(faiss_avx512 PRIVATE FAISS_MAIN_LIB) + target_compile_definitions(faiss_avx512_spr PRIVATE FAISS_MAIN_LIB) target_compile_definitions(faiss_sve PRIVATE FAISS_MAIN_LIB) endif() @@ -328,6 +346,7 @@ if (${finteger_idx} EQUAL -1) endif() target_compile_definitions(faiss_avx2 PRIVATE FINTEGER=int) target_compile_definitions(faiss_avx512 PRIVATE FINTEGER=int) +target_compile_definitions(faiss_avx512_spr PRIVATE FINTEGER=int) target_compile_definitions(faiss_sve PRIVATE FINTEGER=int) if(FAISS_USE_LTO) @@ -339,6 +358,7 @@ if(FAISS_USE_LTO) set_property(TARGET faiss PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) set_property(TARGET faiss_avx2 PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) set_property(TARGET faiss_avx512 PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) + set_property(TARGET faiss_avx512_spr PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) else() message(STATUS "LTO not supported: <${ipo_error}>") endif() @@ -348,6 +368,7 @@ find_package(OpenMP REQUIRED) target_link_libraries(faiss PRIVATE OpenMP::OpenMP_CXX) target_link_libraries(faiss_avx2 PRIVATE OpenMP::OpenMP_CXX) target_link_libraries(faiss_avx512 PRIVATE OpenMP::OpenMP_CXX) +target_link_libraries(faiss_avx512_spr PRIVATE OpenMP::OpenMP_CXX) target_link_libraries(faiss_sve PRIVATE OpenMP::OpenMP_CXX) find_package(MKL) @@ -355,17 +376,20 @@ if(MKL_FOUND) target_link_libraries(faiss PRIVATE ${MKL_LIBRARIES}) target_link_libraries(faiss_avx2 PRIVATE ${MKL_LIBRARIES}) target_link_libraries(faiss_avx512 PRIVATE ${MKL_LIBRARIES}) + target_link_libraries(faiss_avx512_spr PRIVATE 
${MKL_LIBRARIES}) else() find_package(BLAS REQUIRED) target_link_libraries(faiss PRIVATE ${BLAS_LIBRARIES}) target_link_libraries(faiss_avx2 PRIVATE ${BLAS_LIBRARIES}) target_link_libraries(faiss_avx512 PRIVATE ${BLAS_LIBRARIES}) + target_link_libraries(faiss_avx512_spr PRIVATE ${BLAS_LIBRARIES}) target_link_libraries(faiss_sve PRIVATE ${BLAS_LIBRARIES}) find_package(LAPACK REQUIRED) target_link_libraries(faiss PRIVATE ${LAPACK_LIBRARIES}) target_link_libraries(faiss_avx2 PRIVATE ${LAPACK_LIBRARIES}) target_link_libraries(faiss_avx512 PRIVATE ${LAPACK_LIBRARIES}) + target_link_libraries(faiss_avx512_spr PRIVATE ${LAPACK_LIBRARIES}) target_link_libraries(faiss_sve PRIVATE ${LAPACK_LIBRARIES}) endif() @@ -390,6 +414,13 @@ if(FAISS_OPT_LEVEL STREQUAL "avx512") LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) endif() +if(FAISS_OPT_LEVEL STREQUAL "avx512_spr") + install(TARGETS faiss_avx2 faiss_avx512 faiss_avx512_spr + EXPORT faiss-targets + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) +endif() if(FAISS_OPT_LEVEL STREQUAL "sve") install(TARGETS faiss_sve EXPORT faiss-targets diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 84cb222145..16574aab61 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -270,6 +270,7 @@ if(FAISS_ENABLE_CUVS) target_compile_definitions(faiss PUBLIC USE_NVIDIA_CUVS=1) target_compile_definitions(faiss_avx2 PUBLIC USE_NVIDIA_CUVS=1) target_compile_definitions(faiss_avx512 PUBLIC USE_NVIDIA_CUVS=1) + target_compile_definitions(faiss_avx512_spr PUBLIC USE_NVIDIA_CUVS=1) # Mark all functions as hidden so that we don't generate # global 'public' functions that also exist in libraft.so @@ -305,6 +306,7 @@ set(FAISS_GPU_HEADERS ${FAISS_GPU_HEADERS} PARENT_SCOPE) target_link_libraries(faiss PRIVATE "$") target_link_libraries(faiss_avx2 PRIVATE "$") target_link_libraries(faiss_avx512 PRIVATE "$") +target_link_libraries(faiss_avx512_spr PRIVATE "$") 
target_link_libraries(faiss_sve PRIVATE "$") foreach(header ${FAISS_GPU_HEADERS}) diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt index 41e08eb348..3fc46f4c56 100644 --- a/faiss/python/CMakeLists.txt +++ b/faiss/python/CMakeLists.txt @@ -54,11 +54,13 @@ endmacro() # we duplicate the source in order to override the module name. configure_file(swigfaiss.swig ${CMAKE_CURRENT_SOURCE_DIR}/swigfaiss_avx2.swig COPYONLY) configure_file(swigfaiss.swig ${CMAKE_CURRENT_SOURCE_DIR}/swigfaiss_avx512.swig COPYONLY) +configure_file(swigfaiss.swig ${CMAKE_CURRENT_SOURCE_DIR}/swigfaiss_avx512_spr.swig COPYONLY) configure_file(swigfaiss.swig ${CMAKE_CURRENT_SOURCE_DIR}/swigfaiss_sve.swig COPYONLY) configure_swigfaiss(swigfaiss.swig) configure_swigfaiss(swigfaiss_avx2.swig) configure_swigfaiss(swigfaiss_avx512.swig) +configure_swigfaiss(swigfaiss_avx512_spr.swig) configure_swigfaiss(swigfaiss_sve.swig) configure_swigfaiss(faiss_example_external_module.swig) @@ -72,6 +74,8 @@ if(TARGET faiss) "${faiss_SOURCE_DIR}/faiss/${h}") list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx512_spr_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/${h}") list(APPEND SWIG_MODULE_swigfaiss_sve_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/${h}") list(APPEND SWIG_MODULE_faiss_example_external_module_EXTRA_DEPS @@ -85,6 +89,8 @@ if(TARGET faiss) "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx512_spr_EXTRA_DEPS + "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") list(APPEND SWIG_MODULE_faiss_example_external_module_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu-rocm/${h}") endforeach() @@ -96,6 +102,8 @@ if(TARGET faiss) "${faiss_SOURCE_DIR}/faiss/gpu/${h}") list(APPEND SWIG_MODULE_swigfaiss_avx512_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu/${h}") + list(APPEND SWIG_MODULE_swigfaiss_avx512_spr_EXTRA_DEPS + 
"${faiss_SOURCE_DIR}/faiss/gpu/${h}") list(APPEND SWIG_MODULE_swigfaiss_sve_EXTRA_DEPS "${faiss_SOURCE_DIR}/faiss/gpu/${h}") list(APPEND SWIG_MODULE_faiss_example_external_module_EXTRA_DEPS @@ -146,6 +154,18 @@ if(NOT FAISS_OPT_LEVEL STREQUAL "avx512") set_target_properties(swigfaiss_avx512 PROPERTIES EXCLUDE_FROM_ALL TRUE) endif() +set_property(SOURCE swigfaiss_avx512_spr.swig + PROPERTY SWIG_MODULE_NAME swigfaiss_avx512_spr) +swig_add_library(swigfaiss_avx512_spr + TYPE SHARED + LANGUAGE python + SOURCES swigfaiss_avx512_spr.swig +) +set_property(TARGET swigfaiss_avx512_spr PROPERTY SWIG_COMPILE_OPTIONS -doxygen) +if(NOT FAISS_OPT_LEVEL STREQUAL "avx512_spr") + set_target_properties(swigfaiss_avx512_spr PROPERTIES EXCLUDE_FROM_ALL TRUE) +endif() + set_property(SOURCE swigfaiss_sve.swig PROPERTY SWIG_MODULE_NAME swigfaiss_sve) swig_add_library(swigfaiss_sve @@ -172,6 +192,7 @@ if(NOT WIN32) set_target_properties(swigfaiss PROPERTIES SUFFIX .so) set_target_properties(swigfaiss_avx2 PROPERTIES SUFFIX .so) set_target_properties(swigfaiss_avx512 PROPERTIES SUFFIX .so) + set_target_properties(swigfaiss_avx512_spr PROPERTIES SUFFIX .so) set_target_properties(swigfaiss_sve PROPERTIES SUFFIX .so) set_target_properties(faiss_example_external_module PROPERTIES SUFFIX .so) else() @@ -179,6 +200,7 @@ else() target_compile_options(swigfaiss PRIVATE /bigobj) target_compile_options(swigfaiss_avx2 PRIVATE /bigobj) target_compile_options(swigfaiss_avx512 PRIVATE /bigobj) + target_compile_options(swigfaiss_avx512_spr PRIVATE /bigobj) target_compile_options(swigfaiss_sve PRIVATE /bigobj) target_compile_options(faiss_example_external_module PRIVATE /bigobj) endif() @@ -188,6 +210,7 @@ if(FAISS_ENABLE_GPU) target_link_libraries(swigfaiss PRIVATE hip::host) target_link_libraries(swigfaiss_avx2 PRIVATE hip::host) target_link_libraries(swigfaiss_avx512 PRIVATE hip::host) + target_link_libraries(swigfaiss_avx512_spr PRIVATE hip::host) target_link_libraries(faiss_example_external_module 
PRIVATE hip::host) else() find_package(CUDAToolkit REQUIRED) @@ -197,6 +220,7 @@ if(FAISS_ENABLE_GPU) target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$:cuvs::cuvs>) target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$:cuvs::cuvs>) target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$:cuvs::cuvs>) + target_link_libraries(swigfaiss_avx512_spr PRIVATE CUDA::cudart $<$:cuvs::cuvs>) target_link_libraries(swigfaiss_sve PRIVATE CUDA::cudart $<$:cuvs::cuvs>) endif() endif() @@ -224,6 +248,13 @@ target_link_libraries(swigfaiss_avx512 PRIVATE OpenMP::OpenMP_CXX ) +target_link_libraries(swigfaiss_avx512_spr PRIVATE + faiss_avx512_spr + Python::Module + Python::NumPy + OpenMP::OpenMP_CXX +) + target_link_libraries(swigfaiss_sve PRIVATE faiss_sve Python::Module @@ -244,6 +275,7 @@ target_link_libraries(faiss_example_external_module PRIVATE target_include_directories(swigfaiss PRIVATE ${PROJECT_SOURCE_DIR}/../..) target_include_directories(swigfaiss_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/../..) target_include_directories(swigfaiss_avx512 PRIVATE ${PROJECT_SOURCE_DIR}/../..) +target_include_directories(swigfaiss_avx512_spr PRIVATE ${PROJECT_SOURCE_DIR}/../..) target_include_directories(swigfaiss_sve PRIVATE ${PROJECT_SOURCE_DIR}/../..) target_include_directories(faiss_example_external_module PRIVATE ${PROJECT_SOURCE_DIR}/../..) 
@@ -270,6 +302,7 @@ target_include_directories(faiss_python_callbacks PRIVATE ${Python_INCLUDE_DIRS} target_link_libraries(swigfaiss PRIVATE faiss_python_callbacks) target_link_libraries(swigfaiss_avx2 PRIVATE faiss_python_callbacks) target_link_libraries(swigfaiss_avx512 PRIVATE faiss_python_callbacks) +target_link_libraries(swigfaiss_avx512_spr PRIVATE faiss_python_callbacks) target_link_libraries(swigfaiss_sve PRIVATE faiss_python_callbacks) target_link_libraries(faiss_example_external_module PRIVATE faiss_python_callbacks) diff --git a/faiss/python/loader.py b/faiss/python/loader.py index 9f5be7d2ed..caef9e5512 100644 --- a/faiss/python/loader.py +++ b/faiss/python/loader.py @@ -67,6 +67,9 @@ def is_sve_supported(): result.add("AVX2") if "avx512" in numpy.distutils.cpuinfo.cpu.info[0].get('flags', ""): result.add("AVX512") + if "avx512_fp16" in numpy.distutils.cpuinfo.cpu.info[0].get('flags', ""): + # avx512_fp16 is supported starting SPR + result.add("AVX512_SPR") if is_sve_supported(): result.add("SVE") for f in os.getenv("FAISS_DISABLE_CPU_FEATURES", "").split(", \t\n\r"): @@ -92,6 +95,18 @@ def is_sve_supported(): instruction_sets.add(opt_level) loaded = False +has_AVX512_SPR = any("AVX512_SPR" in x.upper() for x in instruction_sets) +if has_AVX512_SPR: + try: + logger.info("Loading faiss with AVX512-SPR support.") + from .swigfaiss_avx512_spr import * + logger.info("Successfully loaded faiss with AVX512-SPR support.") + loaded = True + except ImportError as e: + logger.info(f"Could not load library with AVX512-SPR support due to:\n{e!r}") + # reset so that we load without AVX512 below + loaded = False + has_AVX512 = any("AVX512" in x.upper() for x in instruction_sets) if has_AVX512: try: diff --git a/faiss/python/setup.py b/faiss/python/setup.py index 89c7671f7f..90c6846a58 100644 --- a/faiss/python/setup.py +++ b/faiss/python/setup.py @@ -28,6 +28,7 @@ swigfaiss_generic_lib = f"{prefix}_swigfaiss{ext}" swigfaiss_avx2_lib = f"{prefix}_swigfaiss_avx2{ext}" 
swigfaiss_avx512_lib = f"{prefix}_swigfaiss_avx512{ext}" +swigfaiss_avx512_spr_lib = f"{prefix}_swigfaiss_avx512_spr{ext}" callbacks_lib = f"{prefix}libfaiss_python_callbacks{ext}" swigfaiss_sve_lib = f"{prefix}_swigfaiss_sve{ext}" faiss_example_external_module_lib = f"_faiss_example_external_module{ext}" @@ -35,6 +36,7 @@ found_swigfaiss_generic = os.path.exists(swigfaiss_generic_lib) found_swigfaiss_avx2 = os.path.exists(swigfaiss_avx2_lib) found_swigfaiss_avx512 = os.path.exists(swigfaiss_avx512_lib) +found_swigfaiss_avx512_spr = os.path.exists(swigfaiss_avx512_spr_lib) found_callbacks = os.path.exists(callbacks_lib) found_swigfaiss_sve = os.path.exists(swigfaiss_sve_lib) found_faiss_example_external_module_lib = os.path.exists( @@ -42,10 +44,10 @@ ) assert ( - found_swigfaiss_generic or found_swigfaiss_avx2 or found_swigfaiss_avx512 or found_swigfaiss_sve or found_faiss_example_external_module_lib + found_swigfaiss_generic or found_swigfaiss_avx2 or found_swigfaiss_avx512 or found_swigfaiss_avx512_spr or found_swigfaiss_sve or found_faiss_example_external_module_lib ), ( f"Could not find {swigfaiss_generic_lib} or " - f"{swigfaiss_avx2_lib} or {swigfaiss_avx512_lib} or {swigfaiss_sve_lib} or {faiss_example_external_module_lib}. " + f"{swigfaiss_avx2_lib} or {swigfaiss_avx512_lib} or {swigfaiss_avx512_spr_lib} or {swigfaiss_sve_lib} or {faiss_example_external_module_lib}. " f"Faiss may not be compiled yet." 
) @@ -64,6 +66,11 @@ shutil.copyfile("swigfaiss_avx512.py", "faiss/swigfaiss_avx512.py") shutil.copyfile(swigfaiss_avx512_lib, f"faiss/_swigfaiss_avx512{ext}") +if found_swigfaiss_avx512_spr: + print(f"Copying {swigfaiss_avx512_spr_lib}") + shutil.copyfile("swigfaiss_avx512_spr.py", "faiss/swigfaiss_avx512_spr.py") + shutil.copyfile(swigfaiss_avx512_spr_lib, f"faiss/_swigfaiss_avx512_spr{ext}") + if found_callbacks: print(f"Copying {callbacks_lib}") shutil.copyfile(callbacks_lib, f"faiss/{callbacks_lib}") diff --git a/tests/test_contrib.py b/tests/test_contrib.py index a7b0b09155..ba185f92b2 100644 --- a/tests/test_contrib.py +++ b/tests/test_contrib.py @@ -573,8 +573,7 @@ def test_ivf_train_2level(self): # normally 47 / 200 differences ndiff = (Iref != Inew).sum() - self.assertLess(ndiff, 51) - + self.assertLess(ndiff, 53) class TestBigBatchSearch(unittest.TestCase): From ab8cb9cc20415fa3cf2a72a55f2b68e9ea9fea9a Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 23 Dec 2024 10:40:54 -0800 Subject: [PATCH 3/8] Link cuVS Docs (#4084) Summary: Small updates to the ReadMe files. More detailed description in a follow up PR for the wiki. 
Remove the cuvs conda CI checks Pull Request resolved: https://github.com/facebookresearch/faiss/pull/4084 Reviewed By: mengdilin Differential Revision: D67602013 Pulled By: mnorris11 fbshipit-source-id: f7c40440d278f00195bcad2dbdd2187325f40662 --- .github/workflows/build-pull-request.yml | 30 ------------------ INSTALL.md | 39 ++++++++++++++++++------ README.md | 2 +- faiss/gpu/impl/CuvsIVFFlat.cu | 2 ++ faiss/gpu/impl/CuvsIVFPQ.cu | 2 ++ 5 files changed, 34 insertions(+), 41 deletions(-) diff --git a/.github/workflows/build-pull-request.yml b/.github/workflows/build-pull-request.yml index b312c71d37..bc0d2d625a 100644 --- a/.github/workflows/build-pull-request.yml +++ b/.github/workflows/build-pull-request.yml @@ -143,36 +143,6 @@ jobs: fetch-tags: true - name: Build and Package (conda) uses: ./.github/actions/build_conda - linux-x86_64-GPU-CUVS-CUDA11-8-0-conda: - name: Linux x86_64 GPU w/ cuVS conda (CUDA 11.8.0) - runs-on: 4-core-ubuntu-gpu-t4 - env: - CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - uses: ./.github/actions/build_conda - with: - cuvs: "ON" - cuda: "11.8.0" - linux-x86_64-GPU-CUVS-CUDA12-4-0-conda: - name: Linux x86_64 GPU w/ cuVS conda (CUDA 12.4.0) - runs-on: 4-core-ubuntu-gpu-t4 - env: - CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - fetch-depth: 0 - fetch-tags: true - - uses: ./.github/actions/build_conda - with: - cuvs: "ON" - cuda: "12.4.0" windows-x86_64-conda: name: Windows x86_64 (conda) needs: linux-x86_64-cmake diff --git a/INSTALL.md b/INSTALL.md index c5f76d47d5..26b51a80b1 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -6,7 +6,7 @@ pre-release nightly builds. 
- The CPU-only faiss-cpu conda package is currently available on Linux (x86-64 and aarch64), OSX (arm64 only), and Windows (x86-64) - faiss-gpu, containing both CPU and GPU indices, is available on Linux (x86-64 only) for CUDA 11.4 and 12.1 -- faiss-gpu-raft containing both CPU and GPU indices provided by NVIDIA RAFT, is available on Linux (x86-64 only) for CUDA 11.8 and 12.1. +- faiss-gpu-raft [^1] package containing GPU indices provided by [NVIDIA RAFT](https://github.com/rapidsai/raft/) version 24.06, is available on Linux (x86-64 only) for CUDA 11.8 and 12.4. To install the latest stable release: @@ -23,10 +23,9 @@ $ conda install -c pytorch -c nvidia -c rapidsai -c conda-forge faiss-gpu-raft=1 # GPU(+CPU) version using AMD ROCm not yet available ``` -For faiss-gpu, the nvidia channel is required for CUDA, which is not -published in the main anaconda channel. +For faiss-gpu, the nvidia channel is required for CUDA, which is not published in the main anaconda channel. -For faiss-gpu-raft, the nvidia, rapidsai and conda-forge channels are required. +For faiss-gpu-raft, the rapidsai, conda-forge and nvidia channels are required. 
Nightly pre-release packages can be installed as follows: @@ -37,8 +36,11 @@ $ conda install -c pytorch/label/nightly faiss-cpu # GPU(+CPU) version $ conda install -c pytorch/label/nightly -c nvidia faiss-gpu=1.9.0 -# GPU(+CPU) version with NVIDIA RAFT -conda install -c pytorch -c nvidia -c rapidsai -c conda-forge faiss-gpu-raft=1.9.0 pytorch pytorch-cuda numpy +# GPU(+CPU) version with NVIDIA cuVS (package built with CUDA 12.4) +conda install -c pytorch -c rapidsai -c conda-forge -c nvidia pytorch/label/nightly::faiss-gpu-cuvs 'cuda-version>=12.0,<=12.5' + +# GPU(+CPU) version with NVIDIA cuVS (package built with CUDA 11.8) +conda install -c pytorch -c rapidsai -c conda-forge -c nvidia pytorch/label/nightly::faiss-gpu-cuvs 'cuda-version>=11.4,<=11.8' # GPU(+CPU) version using AMD ROCm not yet available ``` @@ -68,7 +70,7 @@ $ conda install -c conda-forge faiss-cpu # GPU version $ conda install -c conda-forge faiss-gpu -# AMD ROCm version not yet available +# NVIDIA cuVS and AMD ROCm version not yet available ``` You can tell which channel your conda packages come from by using `conda list`. @@ -95,6 +97,8 @@ The optional requirements are: - the CUDA toolkit, - for AMD GPUs: - AMD ROCm, +- for using NVIDIA cuVS implementations: + - libcuvs=24.12 - for the python bindings: - python 3, - numpy, @@ -103,6 +107,19 @@ The optional requirements are: Indications for specific configurations are available in the [troubleshooting section of the wiki](https://github.com/facebookresearch/faiss/wiki/Troubleshooting). +### Building with NVIDIA cuVS + +The libcuvs dependency should be installed via conda: +1. With CUDA 12.0 - 12.5: +``` +conda install -c rapidsai -c conda-forge -c nvidia libcuvs=24.12 'cuda-version>=12.0,<=12.5' +``` +2. With CUDA 11.4 - 11.8 +``` +conda install -c rapidsai -c conda-forge -c nvidia libcuvs=24.12 'cuda-version>=11.4,<=11.8' +``` +For more ways to install cuVS 24.12, refer to the [RAPIDS Installation Guide](https://docs.rapids.ai/install). 
+ ## Step 1: invoking CMake ``` shell @@ -118,9 +135,9 @@ Several options can be passed to CMake, among which: values are `ON` and `OFF`), - `-DFAISS_ENABLE_PYTHON=OFF` in order to disable building python bindings (possible values are `ON` and `OFF`), - - `-DFAISS_ENABLE_CUVS=ON` in order to enable building the cuVS implementations - of the IVF-Flat and IVF-PQ GPU-accelerated indices (default is `OFF`, possible - values are `ON` and `OFF`) + - `-DFAISS_ENABLE_CUVS=ON` in order to use the NVIDIA cuVS implementations + of the IVF-Flat, IVF-PQ and [CAGRA](https://arxiv.org/pdf/2308.15136) GPU-accelerated indices (default is `OFF`, possible, values are `ON` and `OFF`). + Note: `-DFAISS_ENABLE_GPU` must be set to `ON` when enabling this option. - `-DBUILD_TESTING=OFF` in order to disable building C++ tests, - `-DBUILD_SHARED_LIBS=ON` in order to build a shared library (possible values are `ON` and `OFF`), @@ -302,3 +319,5 @@ and you can run $ python demos/demo_auto_tune.py ``` to test the GPU code. + +[^1]: The vector search and clustering algorithms in NVIDIA RAFT have been formally migrated to [NVIDIA cuVS](https://github.com/rapidsai/cuvs). This package is being renamed to `faiss-gpu-cuvs` in the next stable release, which will use these GPU implementations from the pre-compiled `libcuvs=24.12` binary. diff --git a/README.md b/README.md index f00f4d7a3c..468ba59ab6 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ The GPU implementation can accept input from either CPU or GPU memory. On a serv ## Installing -Faiss comes with precompiled libraries for Anaconda in Python, see [faiss-cpu](https://anaconda.org/pytorch/faiss-cpu) and [faiss-gpu](https://anaconda.org/pytorch/faiss-gpu). The library is mostly implemented in C++, the only dependency is a [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) implementation. Optional GPU support is provided via CUDA or AMD ROCm, and the Python interface is also optional. It compiles with cmake. 
See [INSTALL.md](INSTALL.md) for details. +Faiss comes with precompiled libraries for Anaconda in Python, see [faiss-cpu](https://anaconda.org/pytorch/faiss-cpu), [faiss-gpu](https://anaconda.org/pytorch/faiss-gpu) and [faiss-gpu-cuvs](https://anaconda.org/pytorch/faiss-gpu-cuvs). The library is mostly implemented in C++, the only dependency is a [BLAS](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) implementation. Optional GPU support is provided via CUDA or AMD ROCm, and the Python interface is also optional. The backend GPU implementations of NVIDIA [cuVS](https://github.com/rapidsai/cuvs) can also be enabled optionally. It compiles with cmake. See [INSTALL.md](INSTALL.md) for details. ## How Faiss works diff --git a/faiss/gpu/impl/CuvsIVFFlat.cu b/faiss/gpu/impl/CuvsIVFFlat.cu index 0de7100c72..2cccee8605 100644 --- a/faiss/gpu/impl/CuvsIVFFlat.cu +++ b/faiss/gpu/impl/CuvsIVFFlat.cu @@ -291,6 +291,8 @@ void CuvsIVFFlat::searchPreassigned( Tensor& outIndices, bool storePairs) { // TODO: Fill this in! + // Reference issue: https://github.com/facebookresearch/faiss/issues/3243 + FAISS_THROW_MSG("searchPreassigned is not implemented for cuVS index"); } void CuvsIVFFlat::updateQuantizer(Index* quantizer) { diff --git a/faiss/gpu/impl/CuvsIVFPQ.cu b/faiss/gpu/impl/CuvsIVFPQ.cu index 2fc94de0f0..1e2fef225d 100644 --- a/faiss/gpu/impl/CuvsIVFPQ.cu +++ b/faiss/gpu/impl/CuvsIVFPQ.cu @@ -229,6 +229,8 @@ void CuvsIVFPQ::searchPreassigned( Tensor& outIndices, bool storePairs) { // TODO: Fill this in! 
+ // Reference issue: https://github.com/facebookresearch/faiss/issues/3243 + FAISS_THROW_MSG("searchPreassigned is not implemented for cuVS index"); } size_t CuvsIVFPQ::getGpuListEncodingSize_(idx_t listId) { From 3c8dc4194907e9b911551d5a009468106f8b9c7f Mon Sep 17 00:00:00 2001 From: Satyendra Mishra Date: Mon, 23 Dec 2024 11:27:51 -0800 Subject: [PATCH 4/8] Set KnnDescriptor.desc_name in the Benchmarking core framework in FAISS like other descriptors (#4109) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/4109 Set KnnDescriptor.desc_name in the Benchmarking core framework in FAISS like other descriptors Reviewed By: mnorris11 Differential Revision: D67539874 fbshipit-source-id: 09ffb76296f466ae2d3b0eb551917f429bc7300f --- benchs/bench_fw/descriptors.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchs/bench_fw/descriptors.py b/benchs/bench_fw/descriptors.py index f6164f54e2..8b1d65a505 100644 --- a/benchs/bench_fw/descriptors.py +++ b/benchs/bench_fw/descriptors.py @@ -341,6 +341,8 @@ def __hash__(self): return hash(str(self)) def get_name(self): + if self.desc_name is not None: + return self.desc_name name = self.index_desc.get_name() name += IndexBaseDescriptor.param_dict_to_name(self.search_params) name += self.query_dataset.get_filename(KnnDescriptor.FILENAME_PREFIX) @@ -350,6 +352,7 @@ def get_name(self): name += "rec." else: name += "knn." 
+ self.desc_name = name return name def flat_name(self): From ab479a1ca75142606ed94683d538dc358e28e69d Mon Sep 17 00:00:00 2001 From: Junjie Qi Date: Thu, 26 Dec 2024 21:33:45 -0800 Subject: [PATCH 5/8] enable quiet mode for conda install (#4112) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/4112 Reviewed By: gtwang01 Differential Revision: D67662530 Pulled By: junjieqi fbshipit-source-id: 576198d79e947009025641bfa94b3ad9ea2fe1b4 --- .github/actions/build_conda/action.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/build_conda/action.yml b/.github/actions/build_conda/action.yml index 95c2d60c1f..ff860007b2 100644 --- a/.github/actions/build_conda/action.yml +++ b/.github/actions/build_conda/action.yml @@ -34,8 +34,8 @@ runs: - name: Install conda build tools shell: ${{ steps.choose_shell.outputs.shell }} run: | - conda install -y "conda!=24.11.0" - conda install -y "conda-build!=24.11.0" + conda install -y -q "conda!=24.11.0" + conda install -y -q "conda-build!=24.11.0" - name: Fix CI failure shell: ${{ steps.choose_shell.outputs.shell }} if: runner.os != 'Windows' From 0cbc2a885cde923d80c4bf9c9d6f4d81665f3f64 Mon Sep 17 00:00:00 2001 From: Mulugeta Mammo Date: Fri, 27 Dec 2024 09:44:28 -0800 Subject: [PATCH 6/8] Use _mm512_popcnt_epi64 to speedup hamming distance evaluation. (#4020) Summary: The `_mm512_popcnt_epi64` intrinsic is used to accelerate Hamming distance calculations in `HammingComputerDefault` and `HammingComputer64`. Benchmarking with [bench_hamming_computer](https://github.com/facebookresearch/faiss/blob/main/benchs/bench_hamming_computer.cpp) on AWS [r7i](https://aws.amazon.com/ec2/instance-types/r7i/) instance shows a performance improvement of up to 30% compared to AVX-2. 
This PR depends on [PR#4025](https://github.com/facebookresearch/faiss/pull/4025) Pull Request resolved: https://github.com/facebookresearch/faiss/pull/4020 Reviewed By: junjieqi Differential Revision: D67650183 Pulled By: mengdilin fbshipit-source-id: 17e5b68570dced1fea0b885dd4e67c17dfc7bece --- faiss/CMakeLists.txt | 1 + faiss/utils/hamming_distance/avx512-inl.h | 490 ++++++++++++++++++++++ faiss/utils/hamming_distance/hamdis-inl.h | 3 + 3 files changed, 494 insertions(+) create mode 100644 faiss/utils/hamming_distance/avx512-inl.h diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index 535710b829..6e9eb883a7 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -230,6 +230,7 @@ set(FAISS_HEADERS utils/hamming_distance/hamdis-inl.h utils/hamming_distance/neon-inl.h utils/hamming_distance/avx2-inl.h + utils/hamming_distance/avx512-inl.h ) if(NOT WIN32) diff --git a/faiss/utils/hamming_distance/avx512-inl.h b/faiss/utils/hamming_distance/avx512-inl.h new file mode 100644 index 0000000000..2b2302e8d5 --- /dev/null +++ b/faiss/utils/hamming_distance/avx512-inl.h @@ -0,0 +1,490 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifndef HAMMING_AVX512_INL_H +#define HAMMING_AVX512_INL_H + +// AVX512 version +// The _mm512_popcnt_epi64 intrinsic is used to accelerate Hamming distance +// calculations in HammingComputerDefault and HammingComputer64. This intrinsic +// is not available in the default FAISS avx512 build mode but is only +// available in the avx512_spr build mode, which targets Intel(R) Sapphire +// Rapids. 
+
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
+#include <faiss/impl/platform_macros.h>
+
+#include <immintrin.h>
+
+namespace faiss {
+
+/* Elementary Hamming distance computation: unoptimized */
+template <size_t nbits, typename T>
+inline T hamming(const uint8_t* bs1, const uint8_t* bs2) {
+    const size_t nbytes = nbits / 8;
+    size_t i;
+    T h = 0;
+    for (i = 0; i < nbytes; i++) {
+        h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]];
+    }
+    return h;
+}
+
+/* Hamming distances for multiples of 64 bits */
+template <size_t nbits>
+inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) {
+    const size_t nwords = nbits / 64;
+    size_t i;
+    hamdis_t h = 0;
+    for (i = 0; i < nwords; i++) {
+        h += popcount64(bs1[i] ^ bs2[i]);
+    }
+    return h;
+}
+
+/* specialized (optimized) functions */
+template <>
+inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) {
+    return popcount64(pa[0] ^ pb[0]);
+}
+
+template <>
+inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) {
+    return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]);
+}
+
+template <>
+inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) {
+    return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) +
+            popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]);
+}
+
+/* Hamming distances for multiple of 64 bits */
+inline hamdis_t hamming(
+        const uint64_t* bs1,
+        const uint64_t* bs2,
+        size_t nwords) {
+    hamdis_t h = 0;
+    for (size_t i = 0; i < nwords; i++) {
+        h += popcount64(bs1[i] ^ bs2[i]);
+    }
+    return h;
+}
+
+/******************************************************************
+ * The HammingComputer series of classes compares a single code of
+ * size 4 to 32 to incoming codes. They are intended for use as a
+ * template class where it would be inefficient to switch on the code
+ * size in the inner loop. Hopefully the compiler will inline the
+ * hamming() functions and put the a0, a1, ... in registers.
+ ******************************************************************/ + +struct HammingComputer4 { + uint32_t a0; + + HammingComputer4() {} + + HammingComputer4(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, int code_size) { + assert(code_size == 4); + a0 = *(uint32_t*)a; + } + + inline int hamming(const uint8_t* b) const { + return popcount64(*(uint32_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 4; + } +}; + +struct HammingComputer8 { + uint64_t a0; + + HammingComputer8() {} + + HammingComputer8(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, int code_size) { + assert(code_size == 8); + a0 = *(uint64_t*)a; + } + + inline int hamming(const uint8_t* b) const { + return popcount64(*(uint64_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +struct HammingComputer16 { + uint64_t a0, a1; + + HammingComputer16() {} + + HammingComputer16(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 16); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +// when applied to an array, 1/2 of the 64-bit accesses are unaligned. +// This incurs a penalty of ~10% wrt. fully aligned accesses. 
+struct HammingComputer20 { + uint64_t a0, a1; + uint32_t a2; + + HammingComputer20() {} + + HammingComputer20(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 20); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(*(uint32_t*)(b + 2) ^ a2); + } + + inline static constexpr int get_code_size() { + return 20; + } +}; + +struct HammingComputer32 { + uint64_t a0, a1, a2, a3; + + HammingComputer32() {} + + HammingComputer32(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 32); + const uint64_t* a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +struct HammingComputer64 { + uint64_t a0, a1, a2, a3, a4, a5, a6, a7; + const uint64_t* a; + + HammingComputer64() {} + + HammingComputer64(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, int code_size) { + assert(code_size == 64); + a = (uint64_t*)a8; + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a4 = a[4]; + a5 = a[5]; + a6 = a[6]; + a7 = a[7]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; +#ifdef __AVX512VPOPCNTDQ__ + __m512i vxor = + _mm512_xor_si512(_mm512_loadu_si512(a), _mm512_loadu_si512(b)); + __m512i vpcnt = _mm512_popcnt_epi64(vxor); + // reduce performs better than adding the lower and higher parts + return _mm512_reduce_add_epi32(vpcnt); +#else + return popcount64(b[0] ^ a0) + 
popcount64(b[1] ^ a1) +
+                popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) +
+                popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) +
+                popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7);
+#endif
+    }
+
+    inline static constexpr int get_code_size() {
+        return 64;
+    }
+};
+
+struct HammingComputerDefault {
+    const uint8_t* a8;
+    int quotient8;
+    int remainder8;
+
+    HammingComputerDefault() {}
+
+    HammingComputerDefault(const uint8_t* a8, int code_size) {
+        set(a8, code_size);
+    }
+
+    void set(const uint8_t* a8_2, int code_size) {
+        this->a8 = a8_2;
+        quotient8 = code_size / 8;
+        remainder8 = code_size % 8;
+    }
+
+    int hamming(const uint8_t* b8) const {
+        int accu = 0;
+
+        const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a8);
+        const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b8);
+
+        int i = 0;
+#ifdef __AVX512VPOPCNTDQ__
+        int quotient64 = quotient8 / 8;
+        for (; i < quotient64; ++i) {
+            __m512i vxor = _mm512_xor_si512(
+                    _mm512_loadu_si512(&a64[i * 8]),
+                    _mm512_loadu_si512(&b64[i * 8]));
+            __m512i vpcnt = _mm512_popcnt_epi64(vxor);
+            // reduce performs better than adding the lower and higher parts
+            accu += _mm512_reduce_add_epi32(vpcnt);
+        }
+        i *= 8;
+#endif
+        int len = quotient8 - i;
+        switch (len & 7) {
+            default:
+                while (len > 7) {
+                    len -= 8;
+                    accu += popcount64(a64[i] ^ b64[i]);
+                    i++;
+                    [[fallthrough]];
+                    case 7:
+                        accu += popcount64(a64[i] ^ b64[i]);
+                        i++;
+                        [[fallthrough]];
+                    case 6:
+                        accu += popcount64(a64[i] ^ b64[i]);
+                        i++;
+                        [[fallthrough]];
+                    case 5:
+                        accu += popcount64(a64[i] ^ b64[i]);
+                        i++;
+                        [[fallthrough]];
+                    case 4:
+                        accu += popcount64(a64[i] ^ b64[i]);
+                        i++;
+                        [[fallthrough]];
+                    case 3:
+                        accu += popcount64(a64[i] ^ b64[i]);
+                        i++;
+                        [[fallthrough]];
+                    case 2:
+                        accu += popcount64(a64[i] ^ b64[i]);
+                        i++;
+                        [[fallthrough]];
+                    case 1:
+                        accu += popcount64(a64[i] ^ b64[i]);
+                        i++;
+                }
+        }
+        if (remainder8) {
+            const uint8_t* a = a8 + 8 * quotient8;
+            const uint8_t* b = b8 + 8 * quotient8;
+            switch (remainder8) {
+                case 7:
+                    accu += hamdis_tab_ham_bytes[a[6] ^ b[6]];
+
[[fallthrough]]; + case 6: + accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; + [[fallthrough]]; + case 5: + accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; + [[fallthrough]]; + case 4: + accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; + [[fallthrough]]; + case 3: + accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; + [[fallthrough]]; + case 2: + accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; + [[fallthrough]]; + case 1: + accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; + [[fallthrough]]; + default: + break; + } + } + + return accu; + } + + inline int get_code_size() const { + return quotient8 * 8 + remainder8; + } +}; + +/*************************************************************************** + * generalized Hamming = number of bytes that are different between + * two codes. + ***************************************************************************/ + +inline int generalized_hamming_64(uint64_t a) { + a |= a >> 1; + a |= a >> 2; + a |= a >> 4; + a &= 0x0101010101010101UL; + return popcount64(a); +} + +struct GenHammingComputer8 { + uint64_t a0; + + GenHammingComputer8(const uint8_t* a, int code_size) { + assert(code_size == 8); + a0 = *(uint64_t*)a; + } + + inline int hamming(const uint8_t* b) const { + return generalized_hamming_64(*(uint64_t*)b ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +// I'm not sure whether this version is faster of slower, tbh +// todo: test on different CPUs +struct GenHammingComputer16 { + __m128i a; + + GenHammingComputer16(const uint8_t* a8, int code_size) { + assert(code_size == 16); + a = _mm_loadu_si128((const __m128i_u*)a8); + } + + inline int hamming(const uint8_t* b8) const { + const __m128i b = _mm_loadu_si128((const __m128i_u*)b8); + const __m128i cmp = _mm_cmpeq_epi8(a, b); + const auto movemask = _mm_movemask_epi8(cmp); + return 16 - popcount32(movemask); + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +struct GenHammingComputer32 { + __m256i a; + + GenHammingComputer32(const uint8_t* 
a8, int code_size) { + assert(code_size == 32); + a = _mm256_loadu_si256((const __m256i_u*)a8); + } + + inline int hamming(const uint8_t* b8) const { + const __m256i b = _mm256_loadu_si256((const __m256i_u*)b8); + const __m256i cmp = _mm256_cmpeq_epi8(a, b); + const uint32_t movemask = _mm256_movemask_epi8(cmp); + return 32 - popcount32(movemask); + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +// A specialized version might be needed for the very long +// GenHamming code_size. In such a case, one may accumulate +// counts using _mm256_sub_epi8 and then compute a horizontal +// sum (using _mm256_sad_epu8, maybe, in blocks of no larger +// than 256 * 32 bytes). + +struct GenHammingComputerM8 { + const uint64_t* a; + int n; + + GenHammingComputerM8(const uint8_t* a8, int code_size) { + assert(code_size % 8 == 0); + a = (uint64_t*)a8; + n = code_size / 8; + } + + int hamming(const uint8_t* b8) const { + const uint64_t* b = (uint64_t*)b8; + int accu = 0; + + int i = 0; + int n4 = (n / 4) * 4; + for (; i < n4; i += 4) { + const __m256i av = _mm256_loadu_si256((const __m256i_u*)(a + i)); + const __m256i bv = _mm256_loadu_si256((const __m256i_u*)(b + i)); + const __m256i cmp = _mm256_cmpeq_epi8(av, bv); + const uint32_t movemask = _mm256_movemask_epi8(cmp); + accu += 32 - popcount32(movemask); + } + + for (; i < n; i++) + accu += generalized_hamming_64(a[i] ^ b[i]); + return accu; + } + + inline int get_code_size() const { + return n * 8; + } +}; + +} // namespace faiss + +#endif diff --git a/faiss/utils/hamming_distance/hamdis-inl.h b/faiss/utils/hamming_distance/hamdis-inl.h index eac7054317..0c9ac1ba63 100644 --- a/faiss/utils/hamming_distance/hamdis-inl.h +++ b/faiss/utils/hamming_distance/hamdis-inl.h @@ -16,6 +16,9 @@ #ifdef __aarch64__ // ARM compilers may produce inoptimal code for Hamming distance somewhy. 
#include +#elif __AVX512F__ +// offers better performance where __AVX512VPOPCNTDQ__ is supported +#include #elif __AVX2__ // better versions for GenHammingComputer #include From 9590ad27460f65fe3dae3ba61d8b5e3e8d03265f Mon Sep 17 00:00:00 2001 From: Maria Lomeli Date: Mon, 6 Jan 2025 09:48:32 -0800 Subject: [PATCH 7/8] PQ with pytorch (#4116) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/4116 This diff implements Product Quantization using Pytorch only. Reviewed By: mdouze Differential Revision: D67766798 fbshipit-source-id: fe2d44a674fc2056f7e2082e9765052c98fdc8f8 --- contrib/torch/clustering.py | 1 + contrib/torch/quantization.py | 61 +++++++++++++++++++++++++++++------ tests/torch_test_contrib.py | 33 ++++++++++++++++--- 3 files changed, 82 insertions(+), 13 deletions(-) diff --git a/contrib/torch/clustering.py b/contrib/torch/clustering.py index 7438534605..8ff5cb24c3 100644 --- a/contrib/torch/clustering.py +++ b/contrib/torch/clustering.py @@ -13,6 +13,7 @@ # the kmeans can produce both torch and numpy centroids from faiss.contrib.clustering import kmeans + class DatasetAssign: """Wrapper for a tensor that offers a function to assign the vectors to centroids. All other implementations offer the same interface""" diff --git a/contrib/torch/quantization.py b/contrib/torch/quantization.py index a1d8f7dd8a..2ae6599a0c 100644 --- a/contrib/torch/quantization.py +++ b/contrib/torch/quantization.py @@ -7,33 +7,47 @@ This contrib module contains Pytorch code for quantization. 
""" -import numpy as np import torch import faiss - -from faiss.contrib import torch_utils +import math +from faiss.contrib.torch import clustering +# the kmeans can produce both torch and numpy centroids class Quantizer: def __init__(self, d, code_size): + """ + d: dimension of vectors + code_size: nb of bytes of the code (per vector) + """ self.d = d self.code_size = code_size def train(self, x): + """ + takes a n-by-d array and peforms training + """ pass def encode(self, x): + """ + takes a n-by-d float array, encodes to an n-by-code_size uint8 array + """ pass - def decode(self, x): + def decode(self, codes): + """ + takes a n-by-code_size uint8 array, returns a n-by-d array + """ pass class VectorQuantizer(Quantizer): def __init__(self, d, k): - code_size = int(torch.ceil(torch.log2(k) / 8)) + + code_size = int(math.ceil(torch.log2(k) / 8)) Quantizer.__init__(d, code_size) self.k = k @@ -42,12 +56,41 @@ def train(self, x): class ProductQuantizer(Quantizer): - def __init__(self, d, M, nbits): - code_size = int(torch.ceil(M * nbits / 8)) - Quantizer.__init__(d, code_size) + """ M: number of subvectors, d%M == 0 + nbits: number of bits that each vector is encoded into + """ + assert d % M == 0 + assert nbits == 8 # todo: implement other nbits values + code_size = int(math.ceil(M * nbits / 8)) + Quantizer.__init__(self, d, code_size) self.M = M self.nbits = nbits + self.code_size = code_size def train(self, x): - pass + nc = 2 ** self.nbits + sd = self.d // self.M + dev = x.device + dtype = x.dtype + self.codebook = torch.zeros((self.M, nc, sd), device=dev, dtype=dtype) + for m in range(self.M): + xsub = x[:, m * self.d // self.M: (m + 1) * self.d // self.M] + data = clustering.DatasetAssign(xsub.contiguous()) + self.codebook[m] = clustering.kmeans(2 ** self.nbits, data) + + def encode(self, x): + codes = torch.zeros((x.shape[0], self.code_size), dtype=torch.uint8) + for m in range(self.M): + xsub = x[:, m * self.d // self.M:(m + 1) * self.d // self.M] + _, I = 
faiss.knn(xsub.contiguous(), self.codebook[m], 1) + codes[:, m] = I.ravel() + return codes + + def decode(self, codes): + idxs = [codes[:, m].long() for m in range(self.M)] + vectors = [self.codebook[m, idxs[m], :] for m in range(self.M)] + stacked_vectors = torch.stack(vectors, dim=1) + cbd = self.codebook.shape[-1] + x_rec = stacked_vectors.reshape(-1, cbd * self.M) + return x_rec diff --git a/tests/torch_test_contrib.py b/tests/torch_test_contrib.py index 26c381b3cc..3eb6c6c43c 100644 --- a/tests/torch_test_contrib.py +++ b/tests/torch_test_contrib.py @@ -4,13 +4,14 @@ # LICENSE file in the root directory of this source tree. import torch # usort: skip -import unittest # usort: skip -import numpy as np # usort: skip +import unittest # usort: skip +import numpy as np # usort: skip -import faiss # usort: skip +import faiss # usort: skip import faiss.contrib.torch_utils # usort: skip from faiss.contrib import datasets -from faiss.contrib.torch import clustering +from faiss.contrib.torch import clustering, quantization + @@ -400,3 +401,27 @@ def test_python_kmeans(self): # 33498.332 33380.477 # print(err, err2) 1/0 self.assertLess(err2, err * 1.1) + + +class TestQuantization(unittest.TestCase): + def test_python_product_quantization(self): + """ Test the python implementation of product quantization """ + d = 64 + n = 10000 + cs = 4 + nbits = 8 + M = 4 + x = np.random.random(size=(n, d)).astype('float32') + pq = faiss.ProductQuantizer(d, cs, nbits) + pq.train(x) + codes = pq.compute_codes(x) + x2 = pq.decode(codes) + diff = ((x - x2)**2).sum() + # vs pure pytorch impl + xt = torch.from_numpy(x) + my_pq = quantization.ProductQuantizer(d, M, nbits) + my_pq.train(xt) + my_codes = my_pq.encode(xt) + xt2 = my_pq.decode(my_codes) + my_diff = ((xt - xt2)**2).sum() + self.assertLess(abs(diff - my_diff), 100) From 162e6ce1cd3e4848e6a934c93653edf39dcbe47a Mon Sep 17 00:00:00 2001 From: Alexandr Guzhva Date: Mon, 6 Jan 2025 13:23:22 -0800 Subject: [PATCH 8/8] add 
range_search() to IndexRefine (#4022)

Summary:
This is very convenient to have `range_search()` in `IndexRefine`.

Unlike the plain `search()` method, `range_search()` just reevaluates the computed distances from the baseline index. The labels are not re-sorted according to new distances, because this is not listed as a requirement in a method description
https://github.com/facebookresearch/faiss/blob/adb188411a98c3af5b7295c7016e5f46fee9eb07/faiss/Index.h#L150-L161
https://github.com/facebookresearch/faiss/blob/adb188411a98c3af5b7295c7016e5f46fee9eb07/faiss/impl/AuxIndexStructures.h#L35

Pull Request resolved: https://github.com/facebookresearch/faiss/pull/4022

Reviewed By: mnorris11

Differential Revision: D66116082

Pulled By: gtwang01

fbshipit-source-id: 915aca2570d5863c876c9497d4c885e270b9b220
---
 faiss/IndexRefine.cpp | 39 ++++++++++++++++++++++++++++++++
 faiss/IndexRefine.h   |  7 ++++++
 tests/test_refine.py  | 52 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/faiss/IndexRefine.cpp b/faiss/IndexRefine.cpp
index 8bc429a5e9..6f1f588e2e 100644
--- a/faiss/IndexRefine.cpp
+++ b/faiss/IndexRefine.cpp
@@ -166,6 +166,45 @@ void IndexRefine::search(
     }
 }
 
+void IndexRefine::range_search(
+        idx_t n,
+        const float* x,
+        float radius,
+        RangeSearchResult* result,
+        const SearchParameters* params_in) const {
+    const IndexRefineSearchParameters* params = nullptr;
+    if (params_in) {
+        params = dynamic_cast<const IndexRefineSearchParameters*>(params_in);
+        FAISS_THROW_IF_NOT_MSG(
+                params, "IndexRefine params have incorrect type");
+    }
+
+    SearchParameters* base_index_params =
+            (params != nullptr) ?
params->base_index_params : nullptr; + + base_index->range_search(n, x, radius, result, base_index_params); + +#pragma omp parallel if (n > 1) + { + std::unique_ptr dc( + refine_index->get_distance_computer()); + +#pragma omp for + for (idx_t i = 0; i < n; i++) { + dc->set_query(x + i * d); + + // reevaluate distances + const size_t idx_start = result->lims[i]; + const size_t idx_end = result->lims[i + 1]; + + for (size_t j = idx_start; j < idx_end; j++) { + const auto label = result->labels[j]; + result->distances[j] = (*dc)(label); + } + } + } +} + void IndexRefine::reconstruct(idx_t key, float* recons) const { refine_index->reconstruct(key, recons); } diff --git a/faiss/IndexRefine.h b/faiss/IndexRefine.h index 9ad4e4be29..255271695f 100644 --- a/faiss/IndexRefine.h +++ b/faiss/IndexRefine.h @@ -54,6 +54,13 @@ struct IndexRefine : Index { idx_t* labels, const SearchParameters* params = nullptr) const override; + void range_search( + idx_t n, + const float* x, + float radius, + RangeSearchResult* result, + const SearchParameters* params = nullptr) const override; + // reconstruct is routed to the refine_index void reconstruct(idx_t key, float* recons) const override; diff --git a/tests/test_refine.py b/tests/test_refine.py index f272584245..9b9ce73d0d 100644 --- a/tests/test_refine.py +++ b/tests/test_refine.py @@ -8,7 +8,7 @@ import unittest import faiss -from faiss.contrib import datasets +from faiss.contrib import datasets, evaluation class TestDistanceComputer(unittest.TestCase): @@ -119,3 +119,53 @@ def test_rflat(self): def test_refine_sq8(self): # this case uses the IndexRefine class self.do_test("IVF8,PQ2x4np,Refine(SQ8)") + + +class TestIndexRefineRangeSearch(unittest.TestCase): + + def do_test(self, factory_string): + d = 32 + radius = 8 + + ds = datasets.SyntheticDataset(d, 1024, 512, 256) + + index = faiss.index_factory(d, factory_string) + index.train(ds.get_train()) + index.add(ds.get_database()) + xq = ds.get_queries() + xb = ds.get_database() + + 
# perform a range_search + lims_1, D1, I1 = index.range_search(xq, radius) + + # create a baseline (FlatL2) + index_flat = faiss.IndexFlatL2(d) + index_flat.train(ds.get_train()) + index_flat.add(ds.get_database()) + + lims_ref, Dref, Iref = index_flat.range_search(xq, radius) + + # add a refine index on top of the index + index_r = faiss.IndexRefine(index, index_flat) + lims_2, D2, I2 = index_r.range_search(xq, radius) + + # validate: refined range_search() keeps indices untouched + precision_1, recall_1 = evaluation.range_PR(lims_ref, Iref, lims_1, I1) + + precision_2, recall_2 = evaluation.range_PR(lims_ref, Iref, lims_2, I2) + + self.assertAlmostEqual(recall_1, recall_2) + + # validate: refined range_search() updates distances, and new distances are correct L2 distances + for iq in range(0, ds.nq): + start_lim = lims_2[iq] + end_lim = lims_2[iq + 1] + for i_lim in range(start_lim, end_lim): + idx = I2[i_lim] + l2_dis = np.sum(np.square(xq[iq : iq + 1,] - xb[idx : idx + 1,])) + + self.assertAlmostEqual(l2_dis, D2[i_lim], places=4) + + + def test_refine_1(self): + self.do_test("SQ4")