Skip to content

Commit

Permalink
Implement Faiss GPU Index Builder functionality
Browse files Browse the repository at this point in the history
Signed-off-by: Rajvaibhav Rahane <[email protected]>
  • Loading branch information
Rajrahane committed Mar 5, 2025
1 parent 4a6705b commit 6a7a658
Show file tree
Hide file tree
Showing 6 changed files with 516 additions and 4 deletions.
195 changes: 195 additions & 0 deletions remote_vector_index_builder/core/index_builder/faiss_index_builder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

from remote_vector_index_builder.core.common.models import (
GPUIndexBuildConfig,
GPUIndexCagraConfig,
IndexHNSWCagraConfig,
SpaceType,
)
from remote_vector_index_builder.core.index_builder.faiss_index_config_builder import (
FaissIndexConfigBuilder,
)
import faiss
from remote_vector_index_builder.core.common.models.index_build_parameters import (
IndexBuildParameters,
)
from remote_vector_index_builder.core.common.models.vectors_dataset import (
VectorsDataset,
)
from remote_vector_index_builder.core.index_builder.index_config_builder import (
IndexConfigBuilder,
)
from remote_vector_index_builder.core.index_builder.index_config_director import (
IndexConfigDirector,
)
from remote_vector_index_builder.core.index_builder.models.create_gpu_index_response import (
CreateGPUIndexResponse,
)
from remote_vector_index_builder.core.index_builder.index_builder_utils import (
calculate_ivf_pq_n_lists,
configure_metric,
get_omp_num_threads,
)


class FaissIndexBuilder:
"""
Class exposing the the build_gpu_index method for building a CPU read compatible Faiis GPU Index
"""

def __init__(self):
self.gpu_resources = faiss.StandardGpuResources()
self.omp_num_threads = get_omp_num_threads()

def _create_gpu_index(
self,
vectorsDataset: VectorsDataset,
dataset_dimension: int,
faissGPUIndexCagraConfig: faiss.GpuIndexCagraConfig,
space_type: SpaceType = SpaceType.L2,
):
# Configure the distance metric
metric = configure_metric(space_type)

# Create GPU CAGRA index with specified configuration
faiss_gpu_index_cagra = faiss.GpuIndexCagra(
self.gpu_resources, dataset_dimension, metric, faissGPUIndexCagraConfig
)

# Create ID mapping layer to preserve document IDs
faiss_id_map_index = faiss.IndexIDMap(faiss_gpu_index_cagra)
# Add vectors and their corresponding IDs to the index
faiss_id_map_index.add_with_ids(vectorsDataset.vectors, vectorsDataset.doc_ids)

# Clean up memory by releasing vector dataset
vectorsDataset.free_vectors_space()
return CreateGPUIndexResponse(
gpu_index_cagra=faiss_gpu_index_cagra, id_map_index=faiss_id_map_index
)

def _create_and_write_cpu_index_to_file(
self,
create_gpu_index_response: CreateGPUIndexResponse,
index_hnsw_cagra_config: IndexHNSWCagraConfig,
cpuIndexOutputFilePath: str,
):
"""
Method to Create and Write the CPU compatible Index from a GPU Index
Args:
create_gpu_index_response (CreateGPUIndexResponse): datamodel containing the GPU Index and Dataset ID Maps
index_hnsw_cagra_config (IndexHNSWCagraConfig): CPU Search Index config
cpuIndexFileOutputpath: Complete File path to write the CPU Index to
"""

# Initialize CPU Index
cpuIndex = faiss.IndexHNSWCagra()

# Configure HNSW Search parameters
cpuIndex.hnsw.efConstruction = index_hnsw_cagra_config.ef_construction
cpuIndex.hnsw.efSearch = index_hnsw_cagra_config.ef_search
cpuIndex.base_level_only = index_hnsw_cagra_config.base_level_only
cpuIndex.own_fields = index_hnsw_cagra_config.own_fields

# Copy GPU index to CPU index
create_gpu_index_response.gpu_index_cagra.copyTo(cpuIndex)

# Update the ID map index with the CPU index
create_gpu_index_response.id_map_index.index = cpuIndex

# TODO: Investigate what issues may arise while writing index to local file
# Write the final index to disk
faiss.write_index(
create_gpu_index_response.id_map_index, cpuIndexOutputFilePath
)

# Clean up CPU index after writing to file
del cpuIndex

def _create_gpu_index_build_config(self, **kwargs) -> GPUIndexBuildConfig:
"""
Create an index configuration using the builder pattern.
Args:
**kwargs: Configuration parameters including 'hnsw_config', 'gpu_config', and 'metric'.
Returns:
GPUIndexBuildConfig: The constructed index configuration.
Raises:
ValueError: If required configuration parameters are missing.
"""
builder = IndexConfigBuilder()
director = IndexConfigDirector(builder)
return director.construct_config(kwargs)

def _create_faiss_gpu_index_config(self, config: GPUIndexCagraConfig):
"""
Create an faiss index configuration using the builder pattern.
Args:
config: GPUIndexCagraConfig: The core datamodel in remote_vector_index_builder
Returns:
faiss.GpuIndexCagraConfig: The equivalent faiss config
"""
faissIndexBuilder = FaissIndexConfigBuilder()
return faissIndexBuilder.with_gpu_config(config).build_gpu_index_cagra_config()

def build_gpu_index(
self,
vectorsDataset: VectorsDataset,
indexBuildParameters: IndexBuildParameters,
cpuIndexOutputFilePath: str,
):
"""
Creates a GPU Index for the specified vectors dataset, coonverts into CPU compatible Index
and writes the CPU Index to disc
Args:
vectorsDataset: The set of vectors to index
indexBuildParameters: The API Index Build parameters
cpuIndexOutputFilePath: The complete file path on disc to write the cpuIndex to.
"""
# Set number of threads for parallel processing
faiss.omp_set_num_threads(self.omp_num_threads)

# Create a structured GPUIndexBuildConfig having defaults, from a partial dictionary set with index params
index_build_config = self._create_gpu_index_build_config(
hnsw_config={},
gpu_config={
"ivf_pq_build_params": {
"pq_dim": indexBuildParameters.dimension,
"n_lists": calculate_ivf_pq_n_lists(indexBuildParameters.doc_count),
}
},
metric=indexBuildParameters.index_parameters.space_type,
)

# Create a faiis equivalent version of gpu index build config
faiss_gpu_index_cagra_config = self._create_faiss_gpu_index_config(
index_build_config.gpu_index_cagra_config
)

index_hnsw_cagra_config = index_build_config.index_hnsw_cagra_config

# create a GPU Index from the faiss config and vector dataset
create_gpu_index_response = self._create_gpu_index(
vectorsDataset,
indexBuildParameters.dimension,
faiss_gpu_index_cagra_config,
index_build_config.metric,
)

# Convert the GPU Index to CPU Index and write to disk
self._create_and_write_cpu_index_to_file(
create_gpu_index_response, index_hnsw_cagra_config, cpuIndexOutputFilePath
)

# Clean up GPU index response
del create_gpu_index_response
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
from typing import Optional
import faiss
from remote_vector_index_builder.core.common.models import (
GPUIndexCagraConfig,
IVFPQBuildCagraConfig,
IVFPQSearchCagraConfig,
GraphBuildAlgo,
)


class FaissIndexConfigBuilder:
"""
A builder class for configuring FAISS GPU indexes
This class helps construct configuration objects for GPU-based FAISS indexes, including:
- GPU index Cagra configuration
- IVF-PQ (Inverted File System with Product Quantization) Cagra build parameters
- IVF-PQ Cagra search parameters
"""

def __init__(self):
"""
Initialize the builder with default configuration values set to None.
These configurations will be set later using builder methods.
"""
self._gpu_config: Optional[GPUIndexCagraConfig] = None
self._ivf_pq_build_config: Optional[IVFPQBuildCagraConfig] = None
self._ivf_pq_search_config: Optional[IVFPQSearchCagraConfig] = None

def _configure_build_algo(self, graph_build_algo: GraphBuildAlgo):
"""
Maps the graph building algorithm enum to the corresponding FAISS implementation.
Args:
graph_build_algo: The algorithm type to use for building the graph
Returns:
The corresponding FAISS graph building algorithm implementation
Defaults to IVF_PQ if the specified algorithm is not found
"""
switcher = {GraphBuildAlgo.IVF_PQ: faiss.graph_build_algo_IVF_PQ}
return switcher.get(graph_build_algo, faiss.graph_build_algo_IVF_PQ)

def _create_ivf_pq_build_config(self) -> faiss.IVFPQBuildCagraConfig:
"""
Creates and configures the equivalent FAISS IVFPQBuildCagraConfig from the
IVFPQBuildCagraConfig core datamodel.
Returns:
A configured FAISS IVFPQBuildCagraConfig object with parameters for:
- kmeans training set fraction
- kmeans iteration count
- Product Quantization bits and dimensions
- Number of inverted lists (kmeans clusters)
- Memory allocation strategy
"""

if not self._ivf_pq_build_config:
self._ivf_pq_build_config = IVFPQBuildCagraConfig()

config = faiss.IVFPQBuildCagraConfig()
config.kmeans_trainset_fraction = (
self._ivf_pq_build_config.kmeans_trainset_fraction
)
config.kmeans_n_iters = self._ivf_pq_build_config.kmeans_n_iters
config.pq_bits = self._ivf_pq_build_config.pq_bits
config.pq_dim = self._ivf_pq_build_config.pq_dim
config.n_lists = self._ivf_pq_build_config.n_lists
config.conservative_memory_allocation = (
self._ivf_pq_build_config.conservative_memory_allocation
)
return config

def _create_ivf_pq_search_config(self) -> faiss.IVFPQSearchCagraConfig:
"""
Creates and configures the equivalent FAISS IVFPQSearchCagraConfig from the
IVFPQSearchCagraConfig core datamodel.
Returns:
A configured FAISS IVFPQSearchCagraConfig object with search parameters for:
- n_probs The number of clusters to search
"""
if not self._ivf_pq_search_config:
self._ivf_pq_search_config = IVFPQSearchCagraConfig()

config = faiss.IVFPQSearchCagraConfig()
config.n_probes = self._ivf_pq_search_config.n_probes
return config

def with_gpu_config(
self, gpu_config: GPUIndexCagraConfig
) -> "FaissIndexConfigBuilder":
"""
Sets the GPUIndexCagraConfig for the index builder.
Args:
gpu_config: GPUIndexCagraConfig core datamodel
Returns:
Self reference for method chaining
"""
self._gpu_config = gpu_config
if gpu_config:
self._ivf_pq_build_config = gpu_config.ivf_pq_build_config
self._ivf_pq_search_config = gpu_config.ivf_pq_search_config
return self

def build_gpu_index_cagra_config(self) -> faiss.GpuIndexCagraConfig:
"""
Builds and returns the complete FAISS GPUIndexCagraConfig
Configures -
- Basic GPUIndex Cagra Config parameters
- IVF-PQ Build Cagra Config parameters
- IVF-PQ Search Cagra Config paramters
Returns:
A fully configured faiss GPUIndexCagraConfig object ready for index creation
"""
if not self._gpu_config:
self._gpu_config = GPUIndexCagraConfig()
self._ivf_pq_build_config = self._gpu_config.ivf_pq_build_config
self._ivf_pq_search_config = self._gpu_config.ivf_pq_search_config

gpu_index_cagra_config = faiss.GpuIndexCagraConfig()
gpu_index_cagra_config.intermediate_graph_degree = (
self._gpu_config.intermediate_graph_degree
)
gpu_index_cagra_config.graph_degree = self._gpu_config.graph_degree
gpu_index_cagra_config.store_dataset = self._gpu_config.store_dataset

gpu_index_cagra_config.build_algo = self._configure_build_algo(
self._gpu_config.graph_build_algo
)
gpu_index_cagra_config.device = self._gpu_config.device

if self._ivf_pq_build_config:
gpu_index_cagra_config.ivf_pq_build_params = (
self._create_ivf_pq_build_config()
)

if self._ivf_pq_search_config:
gpu_index_cagra_config.ivf_pq_search_params = (
self._create_ivf_pq_search_config()
)

return gpu_index_cagra_config
Loading

0 comments on commit 6a7a658

Please sign in to comment.