Skip to content

Commit

Permalink
Support diskann index for vector field (milvus-io#19093)
Browse files Browse the repository at this point in the history
Signed-off-by: xige-16 <[email protected]>

Signed-off-by: xige-16 <[email protected]>
  • Loading branch information
xige-16 authored Sep 21, 2022
1 parent 9d508df commit 4288401
Show file tree
Hide file tree
Showing 150 changed files with 4,747 additions and 2,903 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/code-checker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ jobs:
restore-keys: ubuntu20.04-go-mod-
- name: Code Check
env:
CHECK_BUILDER: "1"
# CHECK_BUILDER: "1"
OS_NAME: "ubuntu20.04"
run: |
./build/builder.sh /bin/bash -c "make check-proto-product && make verifiers"
centos:
Expand Down
2 changes: 1 addition & 1 deletion build/docker/milvus/ubuntu18.04/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ FROM milvusdb/openblas:ubuntu18.04-20210428 AS openblas
FROM ubuntu:bionic-20200921

RUN apt-get update && \
apt-get install -y --no-install-recommends libtbb-dev libzstd-dev gfortran netcat iputils-ping ca-certificates && \
apt-get install -y --no-install-recommends libtbb-dev libzstd-dev gfortran netcat iputils-ping ca-certificates uuid-dev libaio-dev libboost-program-options-dev && \
apt-get remove --purge -y && \
rm -rf /var/lib/apt/lists/*

Expand Down
5 changes: 4 additions & 1 deletion build/docker/milvus/ubuntu20.04/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@ FROM milvusdb/openblas:ubuntu20.04-20220914-179ea77 AS openblas
#FROM alpine
FROM ubuntu:focal-20220426

ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=UTC

RUN apt-get update && \
apt-get install -y --no-install-recommends libtbb-dev gfortran netcat iputils-ping ca-certificates liblapack3 && \
apt-get install -y --no-install-recommends curl libtbb-dev gfortran netcat iputils-ping ca-certificates liblapack3 libzstd-dev uuid-dev libaio-dev libboost-program-options-dev libboost-filesystem-dev && \
apt-get remove --purge -y && \
rm -rf /var/lib/apt/lists/*

Expand Down
4 changes: 4 additions & 0 deletions configs/milvus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@ queryNode:
cacheSize: 32 # GB, default 32 GB, `cacheSize` is the memory used for caching data for faster query. The `cacheSize` must be less than system memory size.
port: 21123
loadMemoryUsageFactor: 3 # The multiply factor of calculating the memory usage while loading segments
enableDisk: true # enable querynode load disk index, and search on disk index
maxDiskUsagePercentage: 95

stats:
publishInterval: 1000 # Interval for querynode to report node information (milliseconds)
Expand Down Expand Up @@ -238,6 +240,8 @@ indexCoord:

indexNode:
port: 21121
enableDisk: true # enable index node build disk vector index
maxDiskUsagePercentage: 95

scheduler:
buildParallel: 1
Expand Down
1 change: 1 addition & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ services:
# - "19530:19530"
environment:
<<: *ccache
OS_NAME: ${OS_NAME}
PULSAR_ADDRESS: ${PULSAR_ADDRESS}
ETCD_ENDPOINTS: ${ETCD_ENDPOINTS}
MINIO_ADDRESS: ${MINIO_ADDRESS}
Expand Down
4 changes: 4 additions & 0 deletions internal/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,10 @@ if ( BUILD_UNIT_TEST STREQUAL "ON" AND BUILD_COVERAGE STREQUAL "ON" )
)
endif ()

# When the BUILD_DISK_ANN option is "ON", forward it to the compiler as a
# preprocessor definition (-DBUILD_DISK_ANN=ON) so sources can test for it.
if ( BUILD_DISK_ANN STREQUAL "ON" )
ADD_DEFINITIONS(-DBUILD_DISK_ANN=${BUILD_DISK_ANN})
endif ()

# Warning: add_subdirectory(src) must be after append_flags("-ftest-coverage"),
# otherwise cpp code coverage tool will miss src folder
add_subdirectory( thirdparty )
Expand Down
189 changes: 189 additions & 0 deletions internal/core/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
#!/bin/bash

# Number of parallel compile jobs handed to `make -j` at the end of the script.
# It can be supplied from the environment, e.g. `jobs=12 ./build.sh ...`;
# only when the variable is not set at all do we probe the machine.
if [[ -z ${jobs+x} ]]; then
    if command -v nproc &> /dev/null; then
        # Linux
        jobs=$(nproc)
    elif command -v sysctl &> /dev/null; then
        # macOS
        jobs=$(sysctl -n hw.logicalcpu)
    else
        # Neither probe tool is available: use a fixed fallback.
        jobs=4
    fi
fi

# Work out the directory that physically contains this script, following any
# chain of symlinks. The result (SCRIPTS_DIR) is used further down as the base
# of the default install prefix and as the CMake source directory.
SOURCE="${BASH_SOURCE[0]}"
while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
# DIR must be taken from the current link BEFORE SOURCE is replaced by its target.
DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
SOURCE="$(readlink "$SOURCE")"
[[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located
done
# `cd -P` resolves symlinked directory components, so this is a physical path.
SCRIPTS_DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"

# Default build settings. Most of them are overridden by the command-line
# options parsed below and then forwarded to CMake as -D definitions.
BUILD_OUTPUT_DIR="./cmake_build" # directory the build runs in (-o)
BUILD_TYPE="Release" # CMAKE_BUILD_TYPE (-t)
BUILD_UNITTEST="OFF" # build unit tests (-u)
INSTALL_PREFIX="${SCRIPTS_DIR}/output" # CMAKE_INSTALL_PREFIX (-p)
MAKE_CLEAN="OFF" # run `make clean` and exit instead of building (-r)
BUILD_COVERAGE="OFF" # code coverage build (-c)
PROFILING="OFF" # passed as ENABLE_CPU_PROFILING (-z)
RUN_CPPLINT="OFF" # run the lint/format/tidy targets instead of building (-l)
CUDA_COMPILER=/usr/local/cuda/bin/nvcc # passed as CMAKE_CUDA_COMPILER; no option changes it
GPU_VERSION="OFF" # defaults to CPU version (-g switches to GPU)
WITH_PROMETHEUS="ON" # passed as MILVUS_WITH_PROMETHEUS (-e turns it off)
CUDA_ARCH="DEFAULT" # passed as MILVUS_CUDA_ARCH (-s)
CUSTOM_THIRDPARTY_PATH="" # passed as CUSTOM_THIRDPARTY_DOWNLOAD_PATH (-f)
BUILD_DISK_ANN="OFF" # passed as BUILD_DISK_ANN to CMake

# Parse command-line options into the build settings.
#
# Letters followed by ':' in the optstring take an argument (available in
# $OPTARG); the rest are plain switches. Every accepted letter must have a
# branch below: the final `?)` pattern is a glob that matches ANY single
# character, so an option without its own branch is reported as unknown and
# aborts the script, exactly like an option getopts itself rejects.
while getopts "p:t:s:f:o:ulrcghzen" arg; do
    case $arg in
    f)
        CUSTOM_THIRDPARTY_PATH=$OPTARG
        ;;
    p)
        INSTALL_PREFIX=$OPTARG
        ;;
    o)
        BUILD_OUTPUT_DIR=$OPTARG
        ;;
    t)
        BUILD_TYPE=$OPTARG # BUILD_TYPE
        ;;
    u)
        echo "Build and run unittest cases"
        BUILD_UNITTEST="ON"
        ;;
    l)
        RUN_CPPLINT="ON"
        ;;
    r)
        # Only request a clean when there is an existing build directory.
        # Note this checks BUILD_OUTPUT_DIR as it is at this point, so pass
        # -o before -r when using a non-default output directory.
        if [[ -d "${BUILD_OUTPUT_DIR}" ]]; then
            MAKE_CLEAN="ON"
        fi
        ;;
    c)
        BUILD_COVERAGE="ON"
        ;;
    z)
        PROFILING="ON"
        ;;
    g)
        GPU_VERSION="ON"
        ;;
    e)
        WITH_PROMETHEUS="OFF"
        ;;
    s)
        CUDA_ARCH=$OPTARG
        ;;
    n)
        # Opt in to the DiskANN build (the default is OFF).
        BUILD_DISK_ANN="ON"
        ;;
    h) # help
        echo "
parameter:
-f: custom paths of thirdparty downloaded files(default: NULL)
-p: install prefix(default: ${SCRIPTS_DIR}/output)
-o: build output directory(default: ./cmake_build)
-t: build type(default: Release)
-u: building unit test options(default: OFF)
-l: run cpplint, clang-format and clang-tidy(default: OFF)
-r: remove previous build directory(default: OFF)
-c: code coverage(default: OFF)
-z: profiling(default: OFF)
-g: build GPU version(default: OFF)
-e: build without prometheus(default: OFF)
-n: build with DiskANN support(default: OFF)
-s: build with CUDA arch(default:DEFAULT), for example '-gencode=compute_61,code=sm_61;-gencode=compute_75,code=sm_75'
-h: help
usage:
./build.sh -p \${INSTALL_PREFIX} -o \${BUILD_OUTPUT_DIR} -t \${BUILD_TYPE} -s \${CUDA_ARCH} -f \${CUSTOM_THIRDPARTY_PATH} [-u] [-l] [-r] [-c] [-z] [-g] [-e] [-n] [-h]
"
        exit 0
        ;;
    ?)
        echo "ERROR! unknown argument"
        exit 1
        ;;
    esac
done

# Create the build directory if necessary and enter it. Every make/cmake
# command from here on operates on the current directory.
if [[ ! -d "${BUILD_OUTPUT_DIR}" ]]; then
    # -p so that a nested path given with -o can be created in one go.
    if ! mkdir -p "${BUILD_OUTPUT_DIR}"; then
        echo "ERROR! failed to create build directory ${BUILD_OUTPUT_DIR}"
        exit 1
    fi
fi

# Stop here if the directory cannot be entered; otherwise `make clean` and
# cmake below would run in whatever directory the script was started from.
if ! cd "${BUILD_OUTPUT_DIR}"; then
    echo "ERROR! failed to enter build directory ${BUILD_OUTPUT_DIR}"
    exit 1
fi

# remove make cache since build.sh -l use default variables
# force update the variables each time
# (output and failures are deliberately ignored: on a fresh directory there
# is no Makefile yet and this is expected to fail)
make rebuild_cache >/dev/null 2>&1


# With -r the script only cleans the existing build tree and then exits.
if [[ ${MAKE_CLEAN} == "ON" ]]; then
    echo "Runing make clean in ${BUILD_OUTPUT_DIR} ..."
    make clean
    exit 0
fi

# Select the toolchain according to the kernel name reported by `uname -s`.
unameOut="$(uname -s)"
if [[ "${unameOut}" == Darwin* ]]; then
    # macOS: compile with the LLVM installation located via Homebrew and add
    # its include/lib directories, plus the libomp ones, to the search paths.
    llvm_prefix="$(brew --prefix llvm)"
    export CLANG_TOOLS_PATH="${llvm_prefix}/bin"
    export CC="${llvm_prefix}/bin/clang"
    export CXX="${llvm_prefix}/bin/clang++"
    export LDFLAGS="-L${llvm_prefix}/lib -L/usr/local/opt/libomp/lib"
    export CXXFLAGS="-I${llvm_prefix}/include -I/usr/local/include -I/usr/local/opt/libomp/include"
else
    # Any other system: keep the environment's compiler, just report the OS.
    echo "==System:${unameOut}"
fi

# Assemble and run the CMake configure step.
#
# The command is kept in an array so that each -D definition stays a single
# argument even if a path or value contains whitespace.
#
# KNOWHERE_GPU_VERSION: SUPPORT_GPU is not set anywhere in this script, so it
# is only honoured if it comes from the environment; otherwise follow the
# GPU_VERSION switch (-g) instead of passing an empty value.
CMAKE_CMD=(
    cmake
    "-DBUILD_UNIT_TEST=${BUILD_UNITTEST}"
    "-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}"
    "-DCMAKE_BUILD_TYPE=${BUILD_TYPE}"
    "-DOpenBLAS_SOURCE=AUTO"
    "-DCMAKE_CUDA_COMPILER=${CUDA_COMPILER}"
    "-DBUILD_COVERAGE=${BUILD_COVERAGE}"
    "-DENABLE_CPU_PROFILING=${PROFILING}"
    "-DMILVUS_GPU_VERSION=${GPU_VERSION}"
    "-DMILVUS_WITH_PROMETHEUS=${WITH_PROMETHEUS}"
    "-DMILVUS_CUDA_ARCH=${CUDA_ARCH}"
    "-DCUSTOM_THIRDPARTY_DOWNLOAD_PATH=${CUSTOM_THIRDPARTY_PATH}"
    "-DKNOWHERE_GPU_VERSION=${SUPPORT_GPU:-${GPU_VERSION}}"
    "-DBUILD_DISK_ANN=${BUILD_DISK_ANN}"
    "${SCRIPTS_DIR}"
)
echo "${CMAKE_CMD[*]}"
# Do not fall through to make with a missing or stale configuration.
if ! "${CMAKE_CMD[@]}"; then
    echo "ERROR! cmake configuration failed"
    exit 1
fi


# Either run the three static checks (-l) or do the real build and install.
# The checks run in order and the first failing one aborts with status 1.
if [[ ${RUN_CPPLINT} == "ON" ]]; then
    # cpplint check
    if ! make lint; then
        echo "ERROR! cpplint check failed"
        exit 1
    fi
    echo "cpplint check passed!"

    # clang-format check
    if ! make check-clang-format; then
        echo "ERROR! clang-format check failed"
        exit 1
    fi
    echo "clang-format check passed!"

    # clang-tidy check
    if ! make check-clang-tidy; then
        echo "ERROR! clang-tidy check failed"
        exit 1
    fi
    echo "clang-tidy check passed!"
else
    # compile and build, using the job count determined at the top of the script
    if ! make -j ${jobs} install; then
        exit 1
    fi
fi
4 changes: 0 additions & 4 deletions internal/core/src/common/CDataType.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@
#include <string>

namespace milvus {
/// Returns true when `dtype` is one of the two vector data types
/// (FloatVector or BinaryVector), false for every other value.
inline bool
IsVectorType(CDataType dtype) {
    switch (dtype) {
        case CDataType::FloatVector:
        case CDataType::BinaryVector:
            return true;
        default:
            return false;
    }
}

template <typename T, typename = std::enable_if_t<std::is_fundamental_v<T> || std::is_same_v<T, std::string>>>
inline CDataType
Expand Down
9 changes: 2 additions & 7 deletions internal/core/src/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,30 +14,25 @@ milvus_add_pkg_config("milvus_common")
set(COMMON_SRC
Schema.cpp
SystemProperty.cpp
vector_index_c.cpp
binary_set_c.cpp
init_c.cpp
)

add_library(milvus_common SHARED ${COMMON_SRC})

if ( MSYS )
target_link_libraries(milvus_common
milvus_utils
milvus_config
milvus_log
knowhere
milvus_proto
yaml-cpp
boost_bitset_ext
arrow
parquet
)
else()
target_link_libraries(milvus_common
milvus_utils
milvus_config
milvus_log
knowhere
milvus_proto
yaml-cpp
boost_bitset_ext
arrow
Expand Down
4 changes: 4 additions & 0 deletions internal/core/src/common/Consts.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ const milvus::PkType INVALID_PK; // of std::monostate if not set.
const int64_t START_USER_FIELDID = 100;
const char MAX_LENGTH[] = "max_length";

// const fieldID (rowID and timestamp)
const milvus::FieldId RowFieldID = milvus::FieldId(0);
const milvus::FieldId TimestampFieldID = milvus::FieldId(1);

// fill followed extra info to binlog file
const char ORIGIN_SIZE_KEY[] = "original_size";
const char INDEX_BUILD_ID_KEY[] = "indexBuildID";
Expand Down
7 changes: 0 additions & 7 deletions internal/core/src/common/LoadInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,6 @@
#include "common/CDataType.h"
#include "knowhere/index/Index.h"

// Plain aggregate bundling what is passed around when an index is loaded:
// a field id, that field's CDataType, string key/value index parameters and
// the knowhere index object itself.
// NOTE(review): field_id and field_type have no default initializers, so they
// are indeterminate unless the creator of the struct sets them.
struct LoadIndexInfo {
int64_t field_id;
CDataType field_type;
// Index parameters as string key -> string value pairs.
std::map<std::string, std::string> index_params;
// Shared handle to the knowhere index.
knowhere::IndexPtr index;
};

// NOTE: field_id can be system field
// NOTE: Refer to common/SystemProperty.cpp for details
// TODO: use arrow to pass field data instead of proto
Expand Down
34 changes: 34 additions & 0 deletions internal/core/src/common/QueryInfo.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <memory>

#include "common/Types.h"

namespace milvus {
// Aggregate of the settings for one search request. It is usually handled
// through SearchInfoPtr (std::shared_ptr<SearchInfo>), declared right after it.
// NOTE(review): no member has a default initializer, so the int64_t members
// are indeterminate after default construction; set every field before use.
struct SearchInfo {
// presumably the number of results to return per query -- TODO confirm
int64_t topk_;
// presumably the number of decimal places result distances are rounded to -- TODO confirm
int64_t round_decimal_;
// Field the search targets.
FieldId field_id_;
// Metric type (alias of knowhere::MetricType in common/Types.h).
MetricType metric_type_;
// Additional search parameters (Config is a JSON object type in common/Types.h).
Config search_params_;
};

using SearchInfoPtr = std::shared_ptr<SearchInfo>;

} // namespace milvus
17 changes: 15 additions & 2 deletions internal/core/src/common/Types.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
#include <variant>

#include "knowhere/index/vector_index/helpers/IndexParameter.h"
#include <knowhere/index/IndexType.h>
#include "knowhere/common/BinarySet.h"
#include "knowhere/common/Dataset.h"
#include "pb/schema.pb.h"
#include "pb/segcore.pb.h"
#include "pb/plan.pb.h"
Expand Down Expand Up @@ -109,6 +112,16 @@ using BitsetTypeOpt = std::optional<BitsetType>;
template <typename Type>
using FixedVector = boost::container::vector<Type>;

const FieldId RowFieldID = FieldId(0);
const FieldId TimestampFieldID = FieldId(1);
using Config = nlohmann::json;
using TargetBitmap = boost::dynamic_bitset<>;
using TargetBitmapPtr = std::unique_ptr<TargetBitmap>;

using BinarySet = knowhere::BinarySet;
using DatasetPtr = knowhere::DatasetPtr;
using MetricType = knowhere::MetricType;
// TODO: define a Milvus-specific index type (covering both vector and scalar index types) instead of aliasing knowhere's
using IndexType = knowhere::IndexType;
// TODO: define a Milvus-specific index mode and add a conversion function from the Milvus index mode to the knowhere index mode
using IndexMode = knowhere::IndexMode;

} // namespace milvus
Loading

0 comments on commit 4288401

Please sign in to comment.