From 4a6705b3a4bcea89461f68c356637cd2687a6d60 Mon Sep 17 00:00:00 2001 From: Rajvaibhav Rahane Date: Mon, 3 Mar 2025 12:50:58 -0800 Subject: [PATCH] Add GPU Index Config Files Signed-off-by: Rajvaibhav Rahane --- .../core/common/models/__init__.py | 21 ++++++ .../common/models/index_builder/__init__.py | 22 ++++++ .../index_builder/gpu_index_build_config.py | 26 +++++++ .../index_builder/gpu_index_cagra_config.py | 34 +++++++++ .../models/index_builder/graph_build_algo.py | 13 ++++ .../index_builder/index_hnsw_cagra_config.py | 28 +++++++ .../ivf_pq_build_cagra_config.py | 50 +++++++++++++ .../ivf_pq_search_cagra_config.py | 14 ++++ .../core/index_builder/__init__.py | 6 ++ .../index_builder/index_config_builder.py | 73 +++++++++++++++++++ .../index_builder/index_config_director.py | 22 ++++++ 11 files changed, 309 insertions(+) create mode 100644 remote_vector_index_builder/core/common/models/index_builder/__init__.py create mode 100755 remote_vector_index_builder/core/common/models/index_builder/gpu_index_build_config.py create mode 100755 remote_vector_index_builder/core/common/models/index_builder/gpu_index_cagra_config.py create mode 100755 remote_vector_index_builder/core/common/models/index_builder/graph_build_algo.py create mode 100755 remote_vector_index_builder/core/common/models/index_builder/index_hnsw_cagra_config.py create mode 100755 remote_vector_index_builder/core/common/models/index_builder/ivf_pq_build_cagra_config.py create mode 100755 remote_vector_index_builder/core/common/models/index_builder/ivf_pq_search_cagra_config.py create mode 100644 remote_vector_index_builder/core/index_builder/index_config_builder.py create mode 100644 remote_vector_index_builder/core/index_builder/index_config_director.py diff --git a/remote_vector_index_builder/core/common/models/__init__.py b/remote_vector_index_builder/core/common/models/__init__.py index fe22b86..e82ee53 100644 --- a/remote_vector_index_builder/core/common/models/__init__.py +++ 
b/remote_vector_index_builder/core/common/models/__init__.py @@ -4,3 +4,24 @@ # The OpenSearch Contributors require contributions made to # this file be licensed under the Apache-2.0 license or a # compatible open source license. + +from .index_build_parameters import SpaceType +from .index_builder.gpu_index_cagra_config import ( + GPUIndexCagraConfig, + IVFPQSearchCagraConfig, + IVFPQBuildCagraConfig, +) +from .index_builder.index_hnsw_cagra_config import IndexHNSWCagraConfig +from .index_builder.gpu_index_build_config import GPUIndexBuildConfig + +from .index_builder.graph_build_algo import GraphBuildAlgo + +__all__ = [ + "SpaceType", + "GPUIndexCagraConfig", + "IVFPQSearchCagraConfig", + "IVFPQBuildCagraConfig", + "IndexHNSWCagraConfig", + "GPUIndexBuildConfig", + "GraphBuildAlgo", +] diff --git a/remote_vector_index_builder/core/common/models/index_builder/__init__.py b/remote_vector_index_builder/core/common/models/index_builder/__init__.py new file mode 100644 index 0000000..4894fb7 --- /dev/null +++ b/remote_vector_index_builder/core/common/models/index_builder/__init__.py @@ -0,0 +1,22 @@ +# Copyright OpenSearch Contributors +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. 
+ +from .gpu_index_cagra_config import GPUIndexCagraConfig +from .ivf_pq_search_cagra_config import IVFPQSearchCagraConfig +from .ivf_pq_build_cagra_config import IVFPQBuildCagraConfig +from .index_hnsw_cagra_config import IndexHNSWCagraConfig +from .gpu_index_build_config import GPUIndexBuildConfig +from .graph_build_algo import GraphBuildAlgo + +__all__ = [ + "GPUIndexCagraConfig", + "IVFPQSearchCagraConfig", + "IVFPQBuildCagraConfig", + "IndexHNSWCagraConfig", + "GPUIndexBuildConfig", + "GraphBuildAlgo", +] diff --git a/remote_vector_index_builder/core/common/models/index_builder/gpu_index_build_config.py b/remote_vector_index_builder/core/common/models/index_builder/gpu_index_build_config.py new file mode 100755 index 0000000..fcce13a --- /dev/null +++ b/remote_vector_index_builder/core/common/models/index_builder/gpu_index_build_config.py @@ -0,0 +1,26 @@ +# Copyright OpenSearch Contributors +# SPDX-License-Identifier: Apache-2.0 +# +# The OpenSearch Contributors require contributions made to +# this file be licensed under the Apache-2.0 license or a +# compatible open source license. 
from dataclasses import dataclass, field

from ..index_build_parameters import SpaceType

from .gpu_index_cagra_config import GPUIndexCagraConfig
from .index_hnsw_cagra_config import IndexHNSWCagraConfig


@dataclass
class GPUIndexBuildConfig:
    """Aggregate of the settings handed to a GPU index build.

    Attributes:
        index_hnsw_cagra_config: HNSW-side settings. Each instance gets its
            own default ``IndexHNSWCagraConfig`` unless one is supplied.
        gpu_index_cagra_config: GPU CAGRA-side settings. Each instance gets
            its own default ``GPUIndexCagraConfig`` unless one is supplied.
        metric: Type of metric the GPU index is created with
            (``SpaceType.L2`` by default).
    """

    index_hnsw_cagra_config: IndexHNSWCagraConfig = field(
        default_factory=IndexHNSWCagraConfig,
    )
    gpu_index_cagra_config: GPUIndexCagraConfig = field(
        default_factory=GPUIndexCagraConfig,
    )
    metric: SpaceType = SpaceType.L2


# ---- patch continues: new file (mode 100755) ----
# remote_vector_index_builder/core/common/models/index_builder/gpu_index_cagra_config.py
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
from dataclasses import dataclass, field

from .graph_build_algo import GraphBuildAlgo
from .ivf_pq_build_cagra_config import IVFPQBuildCagraConfig
from .ivf_pq_search_cagra_config import IVFPQSearchCagraConfig


@dataclass
class GPUIndexCagraConfig:
    """Settings for the GPU CAGRA index.

    Attributes:
        intermediate_graph_degree: Degree of the input graph for pruning.
        graph_degree: Degree of the output graph.
        graph_build_algo: ANN algorithm used to build the knn graph
            (``GraphBuildAlgo.IVF_PQ`` by default).
        store_dataset: Defaults to ``False``.
            NOTE(review): semantics are not documented in this file --
            confirm against the GPU index library before relying on it.
        device: GPU device on which the index is resident.
        ivf_pq_build_config: IVF-PQ build settings; a fresh
            ``IVFPQBuildCagraConfig`` is created per instance by default.
        ivf_pq_search_config: IVF-PQ search settings; a fresh
            ``IVFPQSearchCagraConfig`` is created per instance by default.
    """

    intermediate_graph_degree: int = 64
    graph_degree: int = 32
    graph_build_algo: GraphBuildAlgo = GraphBuildAlgo.IVF_PQ
    store_dataset: bool = False
    device: int = 0
    ivf_pq_build_config: IVFPQBuildCagraConfig = field(
        default_factory=IVFPQBuildCagraConfig,
    )
    ivf_pq_search_config: IVFPQSearchCagraConfig = field(
        default_factory=IVFPQSearchCagraConfig,
    )


# ---- patch continues: new file (mode 100755) ----
# remote_vector_index_builder/core/common/models/index_builder/graph_build_algo.py
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
from enum import Enum


class GraphBuildAlgo(Enum):
    """Closed set of algorithm names; each member's value equals its name."""

    IVF_PQ = "IVF_PQ"
    NN_DESCENT = "NN_DESCENT"


# ---- patch continues: new file (mode 100755) ----
# remote_vector_index_builder/core/common/models/index_builder/index_hnsw_cagra_config.py
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

from dataclasses import dataclass


@dataclass
class IndexHNSWCagraConfig:
    """Settings for the IndexHNSWCagra index.

    Attributes:
        ef_search: Expansion factor at search time.
        ef_construction: Expansion factor at construction time.
        base_level_only: When true, the index is immutable. The knn graph
            is copied from GpuIndexCagra to the base level of
            IndexHNSWCagra without adding upper levels; the HNSW index can
            then be searched, but vectors can no longer be added.
        own_fields: When true, the internal ``storage`` Index variable is
            deleted when the destructor is called.
    """

    ef_search: int = 256
    ef_construction: int = 40
    base_level_only: bool = True
    own_fields: bool = True


# ---- patch continues: new file (mode 100755) ----
# remote_vector_index_builder/core/common/models/index_builder/ivf_pq_build_cagra_config.py
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
from dataclasses import dataclass


@dataclass
class IVFPQBuildCagraConfig:
    """IVF-PQ index-building parameters.

    Construction validates the PQ constraints documented on the fields so
    that an impossible combination is reported here, with a readable
    message, rather than later by the GPU library.

    Raises:
        ValueError: If ``pq_bits`` is not one of 4, 5, 6, 7, 8, or if
            ``pq_dim * pq_bits`` is not a multiple of 8.
    """

    # The number of inverted lists (clusters).
    # Hint: the number of vectors per cluster (`n_rows / n_lists`) should be
    # approximately 1,000 to 10,000.
    n_lists: int = 1000

    # The number of iterations searching for kmeans centers (index building).
    kmeans_n_iters: int = 10

    # The fraction of data to use during iterative kmeans building.
    kmeans_trainset_fraction: float = 0.1

    # The bit length of the vector element after compression by PQ.
    # Possible values: [4, 5, 6, 7, 8].
    # Hint: the smaller the 'pq_bits', the smaller the index size and the
    # better the search performance, but the lower the recall.
    pq_bits: int = 8

    # The dimensionality of the vector after compression by PQ. When zero, an
    # optimal value is selected using a heuristic.
    # NB: `pq_dim * pq_bits` must be a multiple of 8.
    # Hint: a smaller 'pq_dim' results in a smaller index size and better
    # search performance, but lower recall. If 'pq_bits' is 8, 'pq_dim' can be
    # set to any number, but multiples of 8 are desirable for good
    # performance. If 'pq_bits' is not 8, 'pq_dim' should be a multiple of 8.
    # For good performance, it is desirable that 'pq_dim' is a multiple of 32.
    # Ideally 'pq_dim' should also be a divisor of the dataset dim.
    pq_dim: int = 16

    # When disabled, the algorithm allocates more space than necessary for
    # individual clusters. This amortizes the cost of memory allocation and
    # reduces the number of data copies during repeated calls to `extend`
    # (extending the database).
    #
    # When enabled (the default of this config), the algorithm always
    # allocates the minimum amount of memory required to store the given
    # number of records, i.e. it uses as little GPU memory for the database
    # as possible.
    conservative_memory_allocation: bool = True

    def __post_init__(self) -> None:
        """Reject PQ settings that violate the documented constraints."""
        if self.pq_bits not in (4, 5, 6, 7, 8):
            raise ValueError(
                f"pq_bits must be one of 4, 5, 6, 7, 8; got {self.pq_bits!r}"
            )
        # pq_dim == 0 ("pick heuristically") trivially satisfies this rule.
        if (self.pq_dim * self.pq_bits) % 8 != 0:
            raise ValueError(
                "pq_dim * pq_bits must be a multiple of 8; got "
                f"pq_dim={self.pq_dim!r}, pq_bits={self.pq_bits!r}"
            )


# ---- patch continues: new file (mode 100755) ----
# remote_vector_index_builder/core/common/models/index_builder/ivf_pq_search_cagra_config.py
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

from dataclasses import dataclass


@dataclass
class IVFPQSearchCagraConfig:
    """IVF-PQ search parameters."""

    # The number of clusters to search.
    n_probes: int = 30


# ---- patch continues: modified file ----
# remote_vector_index_builder/core/index_builder/__init__.py
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.
# ---- patch continues: new file (mode 100644) ----
# remote_vector_index_builder/core/index_builder/index_config_builder.py
from typing import Any, Dict, Optional
from remote_vector_index_builder.core.common.models import (
    IndexHNSWCagraConfig,
    GPUIndexCagraConfig,
    SpaceType,
    IVFPQBuildCagraConfig,
    IVFPQSearchCagraConfig,
    GraphBuildAlgo,
    GPUIndexBuildConfig,
)


class IndexConfigBuilder:
    """Fluent builder that assembles a ``GPUIndexBuildConfig`` from plain dicts.

    Each ``set_*`` method replaces the corresponding part and returns the
    builder so calls can be chained; ``build()`` fills in defaults for any
    part that was never set.
    """

    def __init__(self):
        self._hnsw_config: Optional[IndexHNSWCagraConfig] = None
        self._gpu_config: Optional[GPUIndexCagraConfig] = None
        self._metric: SpaceType = SpaceType("l2")  # default metric

    def set_hnsw_config(self, params: Dict[str, Any]) -> "IndexConfigBuilder":
        """Set the HNSW part from ``params``.

        Args:
            params: Keyword arguments for ``IndexHNSWCagraConfig``. An empty
                or ``None`` value selects the all-defaults config.

        Returns:
            This builder.
        """
        self._hnsw_config = (
            IndexHNSWCagraConfig(**params) if params else IndexHNSWCagraConfig()
        )
        return self

    def set_gpu_config(self, params: Dict[str, Any]) -> "IndexConfigBuilder":
        """Set the GPU CAGRA part from ``params``.

        Three keys are interpreted specially and converted before being
        passed on: ``"ivf_pq_build_params"`` (dict for
        ``IVFPQBuildCagraConfig``), ``"ivf_pq_search_params"`` (dict for
        ``IVFPQSearchCagraConfig``) and ``"graph_build_algo"`` (value of a
        ``GraphBuildAlgo`` member). Every other key is forwarded unchanged
        as a keyword argument to ``GPUIndexCagraConfig``.

        The caller's dict is never modified.

        Args:
            params: GPU config parameters. An empty or ``None`` value
                selects the all-defaults config.

        Returns:
            This builder.
        """
        if not params:
            self._gpu_config = GPUIndexCagraConfig()
            return self

        # Work on a shallow copy: the special keys are popped below, and
        # popping from the caller's dict would make a second call with the
        # same input silently fall back to defaults for those keys.
        remaining = dict(params)

        build_params = remaining.pop("ivf_pq_build_params", None)
        if build_params:
            ivf_pq_build_config = IVFPQBuildCagraConfig(**build_params)
        else:
            ivf_pq_build_config = IVFPQBuildCagraConfig()

        search_params = remaining.pop("ivf_pq_search_params", None)
        if search_params:
            ivf_pq_search_config = IVFPQSearchCagraConfig(**search_params)
        else:
            ivf_pq_search_config = IVFPQSearchCagraConfig()

        algo_value = remaining.pop("graph_build_algo", None)
        if algo_value:
            graph_build_algo = GraphBuildAlgo(algo_value)
        else:
            graph_build_algo = GraphBuildAlgo.IVF_PQ

        self._gpu_config = GPUIndexCagraConfig(
            **remaining,
            graph_build_algo=graph_build_algo,
            ivf_pq_build_config=ivf_pq_build_config,
            ivf_pq_search_config=ivf_pq_search_config,
        )
        return self

    def set_metric(self, metric: str) -> "IndexConfigBuilder":
        """Set the metric by converting ``metric`` with ``SpaceType(metric)``.

        Returns:
            This builder.
        """
        self._metric = SpaceType(metric)
        return self

    def build(self) -> GPUIndexBuildConfig:
        """Return the assembled config, defaulting any part not yet set."""
        if self._hnsw_config is None:
            self._hnsw_config = IndexHNSWCagraConfig()
        if self._gpu_config is None:
            self._gpu_config = GPUIndexCagraConfig()

        return GPUIndexBuildConfig(
            index_hnsw_cagra_config=self._hnsw_config,
            gpu_index_cagra_config=self._gpu_config,
            metric=self._metric,
        )


# ---- patch continues: new file (mode 100644) ----
# remote_vector_index_builder/core/index_builder/index_config_director.py
from typing import Dict, Any
from remote_vector_index_builder.core.common.models.index_builder.gpu_index_build_config import (
    GPUIndexBuildConfig,
)
from remote_vector_index_builder.core.index_builder.index_config_builder import (
    IndexConfigBuilder,
)


class IndexConfigDirector:
    """Director class to construct index configurations using the builder"""

    def __init__(self, builder: IndexConfigBuilder):
        self._builder = builder

    def construct_config(self, config_params: Dict[str, Any]) -> GPUIndexBuildConfig:
        """Build a ``GPUIndexBuildConfig`` from a nested parameter dict.

        Reads the optional keys ``"hnsw_config"``, ``"gpu_config"`` and
        ``"metric"`` (defaulting to ``{}``, ``{}`` and ``"l2"``). All three
        builder parts are set on every call, so nothing carries over from
        a previous call on the same director.
        """
        hnsw_params = config_params.get("hnsw_config", {})
        gpu_params = config_params.get("gpu_config", {})
        metric = config_params.get("metric", "l2")

        return (
            self._builder.set_hnsw_config(hnsw_params)
            .set_gpu_config(gpu_params)
            .set_metric(metric)
            .build()
        )