Skip to content

Commit

Permalink
add mongodb config
Browse files Browse the repository at this point in the history
Signed-off-by: zhuwenxing <[email protected]>
  • Loading branch information
zhuwenxing committed Jan 17, 2025
1 parent 811564a commit c8c1fb1
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 12 deletions.
3 changes: 2 additions & 1 deletion install.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import os
import argparse
import os
import subprocess


def docker_tag_base() -> str:
    """Return the base name used for Docker tags: ``'vdbbench'``."""
    return 'vdbbench'

Expand Down
17 changes: 13 additions & 4 deletions vectordb_bench/backend/clients/mongodb/config.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
from enum import Enum

from pydantic import BaseModel, SecretStr

from ..api import DBCaseConfig, DBConfig, IndexType, MetricType


class QuantizationType(Enum):
    """Quantization choices for the MongoDB vector index.

    Each member's value is the string emitted under the ``"quantization"``
    key of the index definition built by ``MongoDBIndexConfig.index_param``.
    """

    NONE = "none"
    BINARY = "binary"
    SCALAR = "scalar"


class MongoDBConfig(DBConfig, BaseModel):
connection_string: SecretStr = "mongodb+srv://<user>:<password>@<cluster_name>.heatl.mongodb.net"
database: str = "vdb_bench"
Expand All @@ -16,9 +24,9 @@ def to_dict(self) -> dict:

class MongoDBIndexConfig(BaseModel, DBCaseConfig):
index: IndexType = IndexType.HNSW # MongoDB uses HNSW for vector search
metric_type: MetricType | None = None
num_candidates: int | None = 1500 # Default numCandidates for vector search
exact_search: bool = False # Whether to use exact (ENN) search
metric_type: MetricType = MetricType.COSINE
num_candidates_ratio: int = 10 # Default numCandidates ratio for vector search
quantization: QuantizationType = QuantizationType.NONE # Quantization type if applicable

def parse_metric(self) -> str:
if self.metric_type == MetricType.L2:
Expand All @@ -36,9 +44,10 @@ def index_param(self) -> dict:
"similarity": self.parse_metric(),
"numDimensions": None, # Will be set in MongoDB class
"path": "vector", # Vector field name
"quantization": self.quantization.value,
}
],
}

def search_param(self) -> dict:
return {"numCandidates": self.num_candidates if not self.exact_search else None, "exact": self.exact_search}
return {"num_candidates_ratio": self.num_candidates_ratio}
9 changes: 4 additions & 5 deletions vectordb_bench/backend/clients/mongodb/mongodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def _create_index(self) -> None:
break
log.info(f"index deleting {indices}")
except Exception:
log.exception("Error dropping index")
log.exception(f"Error dropping index {index_name}")
try:
# Create vector search index
search_index = SearchIndexModel(definition=index_params, name=index_name, type="vectorSearch")
Expand All @@ -104,7 +104,7 @@ def _create_index(self) -> None:
log.info(f"Created index on {self.id_field} field")

except Exception:
log.exception("Error creating index")
log.exception(f"Error creating index {index_name}")
raise

def _wait_for_index_ready(self, index_name: str, check_interval: int = 5) -> None:
Expand Down Expand Up @@ -167,16 +167,15 @@ def search_embedding(
else:
# Set numCandidates based on k value and data size
# For 50K dataset, use higher multiplier for better recall
num_candidates = min(10000, max(k * 20, search_params["numCandidates"] or 0))
num_candidates = min(10000, k * search_params["num_candidates_ratio"])
vector_search["numCandidates"] = num_candidates

# Add filter if specified
if filters:
log.info(f"Applying filter: {filters}")
vector_search["filter"] = {
"id": {"gt": filters["id"]},
"id": {"gte": filters["id"]},
}

pipeline = [
{"$vectorSearch": vector_search},
{
Expand Down
32 changes: 32 additions & 0 deletions vectordb_bench/frontend/config/dbCaseConfigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1041,6 +1041,26 @@ class CaseConfigInput(BaseModel):
)


# Case-config input for the MongoDB quantization parameter: an option
# selector whose choices are the strings "none", "scalar" and "binary".
CaseConfigParamInput_MongoDBQuantizationType = CaseConfigInput(
label=CaseConfigParamType.mongodb_quantization_type,
inputType=InputType.Option,
inputConfig={
"options": ["none", "scalar", "binary"],
},
)


# Case-config input for the MongoDB num_candidates_ratio parameter: a numeric
# input configured with min 10, max 20 and value 10.
# NOTE(review): "value" is presumably the initial/default value shown to the
# user -- confirm against how CaseConfigInput consumes inputConfig.
CaseConfigParamInput_MongoDBNumCandidatesRatio = CaseConfigInput(
label=CaseConfigParamType.mongodb_num_candidates_ratio,
inputType=InputType.Number,
inputConfig={
"min": 10,
"max": 20,
"value": 10,
},
)


MilvusLoadConfig = [
CaseConfigParamInput_IndexType,
CaseConfigParamInput_M,
Expand Down Expand Up @@ -1224,6 +1244,14 @@ class CaseConfigInput(BaseModel):
CaseConfigParamInput_NumCandidates_AliES,
]

# Inputs offered for MongoDB load cases: only the quantization type.
MongoDBLoadingConfig = [
CaseConfigParamInput_MongoDBQuantizationType,
]
# Inputs offered for MongoDB performance cases: the quantization type plus
# the num-candidates ratio.
MongoDBPerformanceConfig = [
CaseConfigParamInput_MongoDBQuantizationType,
CaseConfigParamInput_MongoDBNumCandidatesRatio,
]

CASE_CONFIG_MAP = {
DB.Milvus: {
CaseLabel.Load: MilvusLoadConfig,
Expand Down Expand Up @@ -1272,4 +1300,8 @@ class CaseConfigInput(BaseModel):
CaseLabel.Load: AliyunOpensearchLoadingConfig,
CaseLabel.Performance: AliyunOpenSearchPerformanceConfig,
},
DB.MongoDB: {
CaseLabel.Load: MongoDBLoadingConfig,
CaseLabel.Performance: MongoDBPerformanceConfig,
},
}
17 changes: 15 additions & 2 deletions vectordb_bench/log_util.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
import logging
from logging import config
from pathlib import Path


def init(log_level: str):
# Create logs directory if it doesn't exist
log_dir = Path("logs")
log_dir.mkdir(exist_ok=True)

log_config = {
"version": 1,
"disable_existing_loggers": False,
Expand All @@ -24,15 +29,23 @@ def init(log_level: str):
"class": "logging.StreamHandler",
"formatter": "default",
},
"file": {
"class": "logging.handlers.RotatingFileHandler",
"formatter": "default",
"filename": "logs/vectordb_bench.log",
"maxBytes": 10485760, # 10MB
"backupCount": 5,
"encoding": "utf8"
}
},
"loggers": {
"vectordb_bench": {
"handlers": ["console"],
"handlers": ["console", "file"],
"level": log_level,
"propagate": False,
},
"no_color": {
"handlers": ["no_color_console"],
"handlers": ["no_color_console", "file"],
"level": log_level,
"propagate": False,
},
Expand Down
4 changes: 4 additions & 0 deletions vectordb_bench/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ class CaseConfigParamType(Enum):
numSearchThreads = "num_search_threads"
maxNumPrefetchDatasets = "max_num_prefetch_datasets"

# mongodb params
mongodb_quantization_type = "quantization"
mongodb_num_candidates_ratio = "num_candidates_ratio"


class CustomizedCase(BaseModel):
    """Model that declares no fields of its own beyond ``BaseModel``."""

    pass
Expand Down

0 comments on commit c8c1fb1

Please sign in to comment.