diskann cufe and pyanns (#65)
* diskann cufe and pyanns

* diskann fix

* diskann
iliujunn authored Jan 6, 2025
1 parent 3e3f2f1 commit 47fef00
Showing 366 changed files with 56,701 additions and 9 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/cmake.yml
@@ -23,8 +23,9 @@ jobs:
- name: Install toolchains
run: |
sudo apt-get update
sudo apt install gcc g++ cmake python3 pip libboost-dev libboost-all-dev libunwind-dev libgoogle-glog-dev libgflags-dev intel-mkl
# - name: Install torch
sudo apt install gcc g++ cmake python3 pip libboost-dev libboost-all-dev libunwind-dev libgoogle-glog-dev libgflags-dev intel-mkl libaio-dev libgoogle-perftools-dev libmkl-full-dev
# - name: Install torch
# run: |
# sudo pip3 install torch==1.13.0 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
- name: Install torch
@@ -33,7 +34,7 @@ jobs:
- name: Configure CMake
# Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make.
# See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type
run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DCMAKE_PREFIX_PATH='/usr/local/lib/python3.10/dist-packages/torch/share/cmake' -DENABLE_HDF5=ON -DENABLE_PYBIND=ON -DENABLE_PUCK=ON -DENABLE_SPTAG=ON
run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} -DCMAKE_PREFIX_PATH='/usr/local/lib/python3.10/dist-packages/torch/share/cmake' -DENABLE_HDF5=ON -DENABLE_PYBIND=ON -DENABLE_PUCK=ON -DENABLE_SPTAG=ON -DENABLE_DiskANN=ON
- name: Build
# Build your program with the given configuration
run: cmake --build ${{github.workspace}}/build --config ${{env.BUILD_TYPE}}
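For reference, the same configure and build steps can be reproduced locally. A minimal sketch follows, assuming CMake, the apt packages installed above, and a CPU build of torch are present; `Release` stands in for `env.BUILD_TYPE`, and the torch prefix path copies the workflow's Python 3.10 layout:

```python
# Sketch: mirror the CI configure + build steps with DiskANN enabled.
import subprocess

subprocess.check_call([
    "cmake", "-B", "build",
    "-DCMAKE_BUILD_TYPE=Release",
    "-DCMAKE_PREFIX_PATH=/usr/local/lib/python3.10/dist-packages/torch/share/cmake",
    "-DENABLE_HDF5=ON", "-DENABLE_PYBIND=ON",
    "-DENABLE_PUCK=ON", "-DENABLE_SPTAG=ON", "-DENABLE_DiskANN=ON",
])
subprocess.check_call(["cmake", "--build", "build", "--config", "Release"])
```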
98 changes: 97 additions & 1 deletion CMakeLists.txt
@@ -4,6 +4,8 @@ project(CANDYBENCH CXX)
include (cmake/FindCuda.cmake)
include (cmake/FindTorch.cmake)
find_package(Torch REQUIRED)
find_package(Python3 REQUIRED COMPONENTS Development)
include_directories(${Python3_INCLUDE_DIRS})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)


@@ -87,6 +89,26 @@ option(ENABLE_PYBIND
"Enable original pybind and build CANDYBENCH python"
OFF
)
option(ENABLE_DiskANN
"Enable DiskANN support"
OFF
)

if (NOT MSVC)
set(DISKANN_ASYNC_LIB aio)
endif()

set(ENV{TCMALLOC_LARGE_ALLOC_REPORT_THRESHOLD} 500000000000)
# set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -DDEBUG -O0 -fsanitize=address -fsanitize=leak -fsanitize=undefined")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -DDEBUG -Wall -Wextra")
if (NOT PYBIND)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Ofast -DNDEBUG -march=native -mtune=native -ftree-vectorize")
else()
#-Ofast is super problematic for python. see: https://moyix.blogspot.com/2022/09/someones-been-messing-with-my-subnormals.html
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -march=native -mtune=native -ftree-vectorize")
add_compile_options(-fPIC)
endif()
add_compile_options(-march=native -Wall -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free -fopenmp -fopenmp-simd -funroll-loops -Wfatal-errors -DUSE_AVX2)
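The Release flags deliberately drop `-Ofast` for the Python (pybind) build, because `-Ofast` implies `-ffast-math`, which can set FTZ/DAZ for the whole process and silently flush subnormals in unrelated numeric code (see the linked post). A quick sketch for checking whether a running Python process has been affected, assuming numpy is installed:

```python
# Sketch: detect whether subnormals are being flushed to zero (FTZ/DAZ),
# which -Ofast/-ffast-math can enable process-wide via a loaded extension.
import numpy as np

tiny = np.float32(1e-45)   # rounds to the smallest positive subnormal float32
print("subnormals flushed to zero:", bool(tiny == np.float32(0.0)))
# Expected False in a normal build; True if a fast-math-compiled extension
# has enabled FTZ/DAZ in this process.
```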

#OPTIONAL OPENCL
if (NOT ENABLE_OPENCL)
@@ -166,6 +188,42 @@ configure_file(
"${PROJECT_BINARY_DIR}/include/sptag_config.h"
)

file(GLOB_RECURSE DISKANN_SRC_FILES
"thirdparty/DiskANN/src/*.cpp"
"thirdparty/DiskANN/python/src/*.cpp"
)
set(DiskANN_BUILD_DIR ${CMAKE_BINARY_DIR}/DiskANN_build)
# Configure build output directories specifically for DiskANN
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${DiskANN_BUILD_DIR}/bin)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${DiskANN_BUILD_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${DiskANN_BUILD_DIR}/lib)
# Note: this non-recursive glob overrides the recursive DISKANN_SRC_FILES glob defined above.
file(GLOB DISKANN_SRC_FILES "thirdparty/DiskANN/src/*.cpp" )
file(GLOB DISKANN_HEADER_FILES "thirdparty/DiskANN/include/*.h")
if (NOT ENABLE_DiskANN)
message(STATUS "I will NOT include diskann LIBS")
set(CANDY_DiskANN 0)
else ()
set(CANDY_DiskANN 1)

message(STATUS "I will include support for DiskANN")

# Add DiskANN
find_package(OpenMP REQUIRED)


add_subdirectory(thirdparty/DiskANN)
include_directories(thirdparty/DiskANN/include)
# add_sources(
# thirdparty/DiskANN/src/pq_flash_index.cpp
#
# )
set(LIBRARIES ${LIBRARIES} diskann)
endif ()
configure_file(
"${PROJECT_SOURCE_DIR}/include/diskann_config.h.in"
"${PROJECT_BINARY_DIR}/include/diskann_config.h"
)

# Set PUCK build directory
set(PUCK_BUILD_DIR ${CMAKE_BINARY_DIR}/PUCK_build)

@@ -251,22 +309,60 @@ add_library(CANDYBENCH SHARED ${CANDY_SOURCE_FILES} ${CANDY_HEADER_FILES} ${CMAK
set_property(TARGET CANDYBENCH PROPERTY CXX_STANDARD 20)
target_include_directories(CANDYBENCH PUBLIC "include")

# Set the MKL library paths
set(MKL_INCLUDE_DIR "/usr/include/mkl")
set(MKL_LIB_DIR "/usr/lib/x86_64-linux-gnu")
#set(MPI_INCLUDE_PATH "/usr/include/openmpi-x86_64")
#set(MPI_LIBRARIES "/usr/lib/x86_64-linux-gnu/openmpi/lib/libmpi.so")

set(MKL_LIBRARIES
"${MKL_LIB_DIR}/libmkl_intel_lp64.so"
"${MKL_LIB_DIR}/libmkl_sequential.so"
"${MKL_LIB_DIR}/libmkl_core.so"
)
target_include_directories(CANDYBENCH PUBLIC ${MKL_INCLUDE_DIR})
# Link MKL and the other required libraries
target_link_libraries(CANDYBENCH PUBLIC
${MKL_LIBRARIES}
iomp5
pthread
m
dl
)
target_compile_options(CANDYBENCH PUBLIC
-m64
-DUSE_MKL
-I${MKL_INCLUDE_DIR}
)
link_directories(${MKL_LIB_DIR})

#these 2 ugly lines are for puck
#add_definitions(-DFINTEGER=int)
target_include_directories(CANDYBENCH PUBLIC "/usr/include/mkl")
#target_include_directories(CANDYBENCH PUBLIC "/usr/include/mkl")

target_include_directories(CANDYBENCH PUBLIC "${CMAKE_CURRENT_BINARY_DIR}")
target_include_directories(CANDYBENCH PUBLIC "thirdparty/papi_build/include")
target_include_directories(CANDYBENCH PUBLIC "thirdparty/")
target_include_directories(CANDYBENCH PUBLIC "thirdparty/puck")
target_include_directories(CANDYBENCH PUBLIC "thirdparty/puck/puck")
target_include_directories(CANDYBENCH PUBLIC "thirdparty/DiskANN")
target_include_directories(CANDYBENCH PUBLIC "thirdparty/DiskANN/python/include")
target_include_directories(CANDYBENCH PUBLIC "thirdparty/DiskANN/include")
target_sources(CANDYBENCH PUBLIC ${DISKANN_SRC_FILES})
target_sources(CANDYBENCH PUBLIC thirdparty/DiskANN/python/src/dynamic_memory_index.cpp)
if(OpenMP_CXX_FOUND)
target_link_libraries(CANDYBENCH PRIVATE OpenMP::OpenMP_CXX)
endif()
target_link_options(CANDYBENCH PUBLIC "-Wl,-rpath,./")
target_link_options(CANDYBENCH PUBLIC "-Wl,-rpath,./")
message(WARNING "CANDY GFLAGS_LIBRARY_DIR: ${GFLAGS_LIBRARIES}")
message(WARNING "CANDY GLOG_LIBRARY_DIR: ${GLOG_LIBRARIES}")
#target_link_libraries(CANDYBENCH PUBLIC ${GFLAGS_LIBRARIES} ${GLOG_LIBRARIES})
target_link_libraries(CANDYBENCH PUBLIC ${LIBRARIES})
install(TARGETS CANDYBENCH LIBRARY DESTINATION .)
find_library(LIBAIO_LIBRARIES NAMES aio)
message(STATUS "LIBAIO_LIBRARIES: ${LIBAIO_LIBRARIES}")
target_link_libraries(CANDYBENCH PUBLIC ${LIBAIO_LIBRARIES})
#add_subdirectory(test)
#Add benchmarks with command
#add_subdirectory(benchmark)
2 changes: 1 addition & 1 deletion buildCPUOnly.sh
@@ -4,7 +4,7 @@ echo "First, make sure you have sudo"
sudo ls
echo "Installing others..."
sudo apt install -y liblapack-dev libblas-dev
sudo apt-get install -y graphviz libboost-all-dev swig libgtest-dev
sudo apt-get install -y graphviz libboost-all-dev swig libgtest-dev libaio-dev libgoogle-perftools-dev libmkl-full-dev
sudo apt-get install libgoogle-glog-dev libgflags-dev
pip install matplotlib pandas==2.0.0
pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cpu
2 changes: 1 addition & 1 deletion buildWithCuda.sh
@@ -39,7 +39,7 @@ fi
echo "Installing others..."
sudo apt install -y liblapack-dev libblas-dev
sudo apt-get install -y graphviz libboost-all-dev swig libgflags-dev libgtest-dev
sudo apt-get install -y libcudnn8 libcudnn8-dev
sudo apt-get install -y libcudnn8 libcudnn8-dev libaio-dev libgoogle-perftools-dev libmkl-full-dev
pip install matplotlib pandas==2.0.0
pip install torch==2.4.0
echo "Build CANDY and PyCandy"
4 changes: 4 additions & 0 deletions include/diskann_config.h.in
@@ -0,0 +1,4 @@
#ifndef CANDY_DISKANN_CONFIG_H_IN_H_
#define CANDY_DISKANN_CONFIG_H_IN_H_
#define CANDY_DISKANN @CANDY_DiskANN@
#endif
10 changes: 9 additions & 1 deletion setup.py
@@ -30,6 +30,8 @@ def run(self):

def build_extension(self, ext):
extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug
cfg = "Debug" if debug else "Release"
os.system("python3 -c 'import torch;print(torch.utils.cmake_prefix_path)' >> 1.txt")
with open('1.txt', 'r') as file:
torchCmake = file.read().rstrip('\n')
@@ -48,7 +50,13 @@ def build_extension(self, ext):
'-DENABLE_PYBIND=ON',
'-DCMAKE_INSTALL_PREFIX=/usr/local/lib',
'-DENABLE_PAPI=OFF',
'-DENABLE_PUCK=ON'
'-DENABLE_PUCK=ON',
'-DENABLE_DiskANN=ON',
'-DPYBIND=ON',
f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}{os.sep}",
f"-DPYTHON_EXECUTABLE={sys.executable}",
f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm
f"-DVERSION_INFO={self.distribution.get_version()}" # commented out, we want this set in the CMake file
]

cfg = 'Debug' if self.debug else 'Release'
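For reference, a minimal sketch of building the extension with these options, assuming the toolchain and apt packages from the build scripts are already installed; the DEBUG environment variable added above selects the CMake configuration:

```python
# Sketch: build/install PyCANDYAlgo with the cmake_args shown above.
# DEBUG=1 would switch the configuration to Debug; unset or 0 gives Release.
import os
import subprocess
import sys

env = dict(os.environ, DEBUG="0")
subprocess.check_call([sys.executable, "-m", "pip", "install", ".", "-v"], env=env)
```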
93 changes: 91 additions & 2 deletions src/PyCANDY.cpp
@@ -16,12 +16,16 @@
#if CANDY_PAPI == 1
#include <Utils/ThreadPerfPAPI.hpp>
#endif

#include "defaults.h"
#include "distance.h"
#include <DiskANN/python/include/dynamic_memory_index.h>
#include <DiskANN/python/include/builder.h>
#include <faiss/index_factory.h>
#include<puck/pyapi_wrapper/py_api_wrapper.h>

namespace py = pybind11;
using namespace INTELLI;
using namespace pybind11::literals;
using namespace CANDY;
torch::Tensor add_tensors(torch::Tensor a, torch::Tensor b) {
return a + b;
@@ -142,6 +146,52 @@ void update_gflag(const char* gflag_key, const char* gflag_val) {
google::SetCommandLineOption(gflag_key, gflag_val);
}

struct Variant
{
std::string disk_builder_name;
std::string memory_builder_name;
std::string dynamic_memory_index_name;
std::string static_memory_index_name;
std::string static_disk_index_name;
};
const Variant FloatVariant{"build_disk_float_index", "build_memory_float_index", "DynamicMemoryFloatIndex",
"StaticMemoryFloatIndex", "StaticDiskFloatIndex"};

const Variant UInt8Variant{"build_disk_uint8_index", "build_memory_uint8_index", "DynamicMemoryUInt8Index",
"StaticMemoryUInt8Index", "StaticDiskUInt8Index"};

const Variant Int8Variant{"build_disk_int8_index", "build_memory_int8_index", "DynamicMemoryInt8Index",
"StaticMemoryInt8Index", "StaticDiskInt8Index"};
template <typename T> inline void add_variant(py::module_ &m, const Variant &variant)
{


m.def(variant.memory_builder_name.c_str(), &diskannpy::build_memory_index<T>, "distance_metric"_a,
"data_file_path"_a, "index_output_path"_a, "graph_degree"_a, "complexity"_a, "alpha"_a, "num_threads"_a,
"use_pq_build"_a, "num_pq_bytes"_a, "use_opq"_a, "filter_complexity"_a = 0, "use_tags"_a = false);
py::class_<diskannpy::DynamicMemoryIndex<T>>(m, variant.dynamic_memory_index_name.c_str())
.def(py::init<const diskann::AlgoType ,const diskann::Metric, const size_t, const size_t, const uint32_t, const uint32_t, const bool,
const uint32_t, const float, const uint32_t, const uint32_t, const uint32_t, const uint32_t,
const uint32_t, const bool>(),
"algo_type"_a,"distance_metric"_a, "dimensions"_a, "max_vectors"_a, "complexity"_a, "graph_degree"_a,
"saturate_graph"_a = diskann::defaults::SATURATE_GRAPH,
"max_occlusion_size"_a = diskann::defaults::MAX_OCCLUSION_SIZE, "alpha"_a = diskann::defaults::ALPHA,
"num_threads"_a = diskann::defaults::NUM_THREADS,
"filter_complexity"_a = diskann::defaults::FILTER_LIST_SIZE,
"num_frozen_points"_a = diskann::defaults::NUM_FROZEN_POINTS_DYNAMIC, "initial_search_complexity"_a = 0,
"search_threads"_a = 0, "concurrent_consolidation"_a = true)
.def("search", &diskannpy::DynamicMemoryIndex<T>::search, "query"_a, "knn"_a, "complexity"_a)
.def("load", &diskannpy::DynamicMemoryIndex<T>::load, "index_path"_a)
.def("batch_search", &diskannpy::DynamicMemoryIndex<T>::batch_search, "queries"_a, "num_queries"_a, "knn"_a,
"complexity"_a, "num_threads"_a)
.def("batch_insert", &diskannpy::DynamicMemoryIndex<T>::batch_insert, "vectors"_a, "ids"_a, "num_inserts"_a,
"num_threads"_a)
.def("save", &diskannpy::DynamicMemoryIndex<T>::save, "save_path"_a = "", "compact_before_save"_a = false)
.def("insert", &diskannpy::DynamicMemoryIndex<T>::insert, "vector"_a, "id"_a)
.def("mark_deleted", &diskannpy::DynamicMemoryIndex<T>::mark_deleted, "id"_a)
.def("consolidate_delete", &diskannpy::DynamicMemoryIndex<T>::consolidate_delete);

}
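For illustration, a minimal sketch of driving these bindings from Python once the module below is built. Sizes and values are made up, the class and argument names follow the `add_variant` registrations above, and the return convention follows diskannpy's wrappers:

```python
# Sketch: dynamic in-memory float index via the PyCANDYAlgo.diskannpy submodule.
import numpy as np
import PyCANDYAlgo

dp = PyCANDYAlgo.diskannpy

index = dp.DynamicMemoryFloatIndex(
    algo_type=dp.AlgoType.DISKANN,
    distance_metric=dp.Metric.L2,
    dimensions=128,
    max_vectors=10_000,
    complexity=64,
    graph_degree=32,
)

vectors = np.random.rand(1_000, 128).astype(np.float32)
ids = np.arange(1, 1_001, dtype=np.uint32)   # positive uint32 tags (0 is typically reserved)
index.batch_insert(vectors, ids, num_inserts=1_000, num_threads=4)

query = np.random.rand(128).astype(np.float32)
# diskannpy's search returns (neighbor ids, distances) as numpy arrays.
neighbor_ids, distances = index.search(query, knn=10, complexity=64)
```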
#define COMPILED_TIME (__DATE__ " " __TIME__)
PYBIND11_MODULE(PyCANDYAlgo, m) {
/**
@@ -253,7 +303,46 @@ PYBIND11_MODULE(PyCANDYAlgo, m) {
m_puck.def("update_gflag", &py_puck_api::update_gflag, "A function to update gflag");



auto m_diskann = m.def_submodule("diskannpy","diskann interface from microsoft.");
m_diskann.def("add_tensors", &add_tensors, "A function that adds two tensors");



py::module_ default_values = m_diskann.def_submodule(
"defaults",
"A collection of the default values used for common diskann operations. `GRAPH_DEGREE` and `COMPLEXITY` are not"
" set as defaults, but some semi-reasonable default values are selected for your convenience. We urge you to "
"investigate their meaning and adjust them for your use cases.");

default_values.attr("ALPHA") = diskann::defaults::ALPHA;
default_values.attr("NUM_THREADS") = diskann::defaults::NUM_THREADS;
default_values.attr("MAX_OCCLUSION_SIZE") = diskann::defaults::MAX_OCCLUSION_SIZE;
default_values.attr("FILTER_COMPLEXITY") = diskann::defaults::FILTER_LIST_SIZE;
default_values.attr("NUM_FROZEN_POINTS_STATIC") = diskann::defaults::NUM_FROZEN_POINTS_STATIC;
default_values.attr("NUM_FROZEN_POINTS_DYNAMIC") = diskann::defaults::NUM_FROZEN_POINTS_DYNAMIC;
default_values.attr("SATURATE_GRAPH") = diskann::defaults::SATURATE_GRAPH;
default_values.attr("GRAPH_DEGREE") = diskann::defaults::MAX_DEGREE;
default_values.attr("COMPLEXITY") = diskann::defaults::BUILD_LIST_SIZE;
default_values.attr("PQ_DISK_BYTES") = (uint32_t)0;
default_values.attr("USE_PQ_BUILD") = false;
default_values.attr("NUM_PQ_BYTES") = (uint32_t)0;
default_values.attr("USE_OPQ") = false;
add_variant<float>(m_diskann, FloatVariant);
add_variant<uint8_t>(m_diskann, UInt8Variant);
add_variant<int8_t>(m_diskann, Int8Variant);

py::enum_<diskann::Metric>(m_diskann, "Metric")
.value("L2", diskann::Metric::L2)
.value("INNER_PRODUCT", diskann::Metric::INNER_PRODUCT)
.value("COSINE", diskann::Metric::COSINE)
.export_values();
py::enum_<diskann::AlgoType>(m_diskann, "AlgoType")
.value("DISKANN", diskann::AlgoType::DISKANN)
.value("CUFE", diskann::AlgoType::CUFE)
.value("PYANNS", diskann::AlgoType::PYANNS)
.export_values();
m_diskann.attr("defaults") = default_values;
m.attr("diskannpy") = m_diskann;



6 changes: 6 additions & 0 deletions thirdparty/DiskANN/.clang-format
@@ -0,0 +1,6 @@
---
BasedOnStyle: Microsoft
---
Language: Cpp
SortIncludes: false
...
14 changes: 14 additions & 0 deletions thirdparty/DiskANN/.gitattributes
@@ -0,0 +1,14 @@
# Set the default behavior, in case people don't have core.autocrlf set.
* text=auto

# Explicitly declare text files you want to always be normalized and converted
# to native line endings on checkout.
*.c text
*.h text

# Declare files that will always have CRLF line endings on checkout.
*.sln text eol=crlf

# Denote all files that are truly binary and should not be modified.
*.png binary
*.jpg binary
40 changes: 40 additions & 0 deletions thirdparty/DiskANN/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,40 @@
---
name: Bug report
about: Bug reports help us improve! Thanks for submitting yours!
title: "[BUG] "
labels: bug
assignees: ''

---

## Expected Behavior
Tell us what should happen

## Actual Behavior
Tell us what happens instead

## Example Code
Please see [How to create a Minimal, Reproducible example](https://stackoverflow.com/help/minimal-reproducible-example) for some guidance on creating the best possible example of the problem
```bash

```

## Dataset Description
Please tell us about the shape and datatype of your data, (e.g. 128 dimensions, 12.3 billion points, floats)
- Dimensions:
- Number of Points:
- Data type:

## Error
```
Paste the full error, with any sensitive information minimally redacted and marked $$REDACTED$$
```

## Your Environment
* Operating system (e.g. Windows 11 Pro, Ubuntu 22.04.1 LTS)
* DiskANN version (or commit built from)

## Additional Details
Any other contextual information you might feel is important.

2 changes: 2 additions & 0 deletions thirdparty/DiskANN/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,2 @@
blank_issues_enabled: false
