diff --git a/install.py b/install.py index f683a37b..5807485f 100644 --- a/install.py +++ b/install.py @@ -1,7 +1,8 @@ -import os import argparse +import os import subprocess + def docker_tag_base(): return 'vdbbench' diff --git a/vectordb_bench/backend/clients/mongodb/config.py b/vectordb_bench/backend/clients/mongodb/config.py index cc09471a..a2d8ca57 100644 --- a/vectordb_bench/backend/clients/mongodb/config.py +++ b/vectordb_bench/backend/clients/mongodb/config.py @@ -1,8 +1,16 @@ +from enum import Enum + from pydantic import BaseModel, SecretStr from ..api import DBCaseConfig, DBConfig, IndexType, MetricType +class QuantizationType(Enum): + NONE = "none" + BINARY = "binary" + SCALAR = "scalar" + + class MongoDBConfig(DBConfig, BaseModel): connection_string: SecretStr = "mongodb+srv://:@.heatl.mongodb.net" database: str = "vdb_bench" @@ -16,9 +24,9 @@ def to_dict(self) -> dict: class MongoDBIndexConfig(BaseModel, DBCaseConfig): index: IndexType = IndexType.HNSW # MongoDB uses HNSW for vector search - metric_type: MetricType | None = None - num_candidates: int | None = 1500 # Default numCandidates for vector search - exact_search: bool = False # Whether to use exact (ENN) search + metric_type: MetricType = MetricType.COSINE + num_candidates_ratio: int = 10 # Default numCandidates ratio for vector search + quantization: QuantizationType = QuantizationType.NONE # Quantization type if applicable def parse_metric(self) -> str: if self.metric_type == MetricType.L2: @@ -36,9 +44,10 @@ def index_param(self) -> dict: "similarity": self.parse_metric(), "numDimensions": None, # Will be set in MongoDB class "path": "vector", # Vector field name + "quantization": self.quantization.value, } ], } def search_param(self) -> dict: - return {"numCandidates": self.num_candidates if not self.exact_search else None, "exact": self.exact_search} + return {"num_candidates_ratio": self.num_candidates_ratio} diff --git a/vectordb_bench/backend/clients/mongodb/mongodb.py b/vectordb_bench/backend/clients/mongodb/mongodb.py index dddcc9a4..0bbfd5d9 100644 --- a/vectordb_bench/backend/clients/mongodb/mongodb.py +++ b/vectordb_bench/backend/clients/mongodb/mongodb.py @@ -90,7 +90,7 @@ def _create_index(self) -> None: break log.info(f"index deleting {indices}") except Exception: - log.exception("Error dropping index") + log.exception(f"Error dropping index {index_name}") try: # Create vector search index search_index = SearchIndexModel(definition=index_params, name=index_name, type="vectorSearch") @@ -104,7 +104,7 @@ def _create_index(self) -> None: log.info(f"Created index on {self.id_field} field") except Exception: - log.exception("Error creating index") + log.exception(f"Error creating index {index_name}") raise def _wait_for_index_ready(self, index_name: str, check_interval: int = 5) -> None: @@ -167,16 +167,15 @@ def search_embedding( else: # Set numCandidates based on k value and data size # For 50K dataset, use higher multiplier for better recall - num_candidates = min(10000, max(k * 20, search_params["numCandidates"] or 0)) + num_candidates = min(10000, k * search_params["num_candidates_ratio"]) vector_search["numCandidates"] = num_candidates # Add filter if specified if filters: log.info(f"Applying filter: {filters}") vector_search["filter"] = { - "id": {"gt": filters["id"]}, + "id": {"gte": filters["id"]}, } - pipeline = [ {"$vectorSearch": vector_search}, { diff --git a/vectordb_bench/frontend/config/dbCaseConfigs.py b/vectordb_bench/frontend/config/dbCaseConfigs.py index e004f2ba..13858f87 100644 --- a/vectordb_bench/frontend/config/dbCaseConfigs.py +++ b/vectordb_bench/frontend/config/dbCaseConfigs.py @@ -1041,6 +1041,26 @@ class CaseConfigInput(BaseModel): ) +CaseConfigParamInput_MongoDBQuantizationType = CaseConfigInput( + label=CaseConfigParamType.mongodb_quantization_type, + inputType=InputType.Option, + inputConfig={ + "options": ["none", "scalar", "binary"], + }, +) + + +CaseConfigParamInput_MongoDBNumCandidatesRatio = CaseConfigInput( + label=CaseConfigParamType.mongodb_num_candidates_ratio, + inputType=InputType.Number, + inputConfig={ + "min": 10, + "max": 20, + "value": 10, + }, +) + + MilvusLoadConfig = [ CaseConfigParamInput_IndexType, CaseConfigParamInput_M, @@ -1224,6 +1244,14 @@ class CaseConfigInput(BaseModel): CaseConfigParamInput_NumCandidates_AliES, ] +MongoDBLoadingConfig = [ + CaseConfigParamInput_MongoDBQuantizationType, +] +MongoDBPerformanceConfig = [ + CaseConfigParamInput_MongoDBQuantizationType, + CaseConfigParamInput_MongoDBNumCandidatesRatio, +] + CASE_CONFIG_MAP = { DB.Milvus: { CaseLabel.Load: MilvusLoadConfig, @@ -1272,4 +1300,8 @@ class CaseConfigInput(BaseModel): CaseLabel.Load: AliyunOpensearchLoadingConfig, CaseLabel.Performance: AliyunOpenSearchPerformanceConfig, }, + DB.MongoDB: { + CaseLabel.Load: MongoDBLoadingConfig, + CaseLabel.Performance: MongoDBPerformanceConfig, + }, } diff --git a/vectordb_bench/log_util.py b/vectordb_bench/log_util.py index d7568813..a55c9cf3 100644 --- a/vectordb_bench/log_util.py +++ b/vectordb_bench/log_util.py @@ -1,8 +1,13 @@ import logging from logging import config +from pathlib import Path def init(log_level: str): + # Create logs directory if it doesn't exist + log_dir = Path("logs") + log_dir.mkdir(exist_ok=True) + log_config = { "version": 1, "disable_existing_loggers": False, @@ -24,15 +29,23 @@ def init(log_level: str): "class": "logging.StreamHandler", "formatter": "default", }, + "file": { + "class": "logging.handlers.RotatingFileHandler", + "formatter": "default", + "filename": "logs/vectordb_bench.log", + "maxBytes": 10485760, # 10MB + "backupCount": 5, + "encoding": "utf8" + } }, "loggers": { "vectordb_bench": { - "handlers": ["console"], + "handlers": ["console", "file"], "level": log_level, "propagate": False, }, "no_color": { - "handlers": ["no_color_console"], + "handlers": ["no_color_console", "file"], "level": log_level, "propagate": False, }, diff --git a/vectordb_bench/models.py b/vectordb_bench/models.py index 49bb04ae..bf71ebb8 100644 --- a/vectordb_bench/models.py +++ b/vectordb_bench/models.py @@ -88,6 +88,10 @@ class CaseConfigParamType(Enum): numSearchThreads = "num_search_threads" maxNumPrefetchDatasets = "max_num_prefetch_datasets" + # mongodb params + mongodb_quantization_type = "quantization" + mongodb_num_candidates_ratio = "num_candidates_ratio" + class CustomizedCase(BaseModel): pass