Add MongoDB case config (quantization type, numCandidates ratio) and rotating file logging

Signed-off-by: zhuwenxing <[email protected]>
zilliztech · Jan 17, 2025 · c8c1fb1 · c8c1fb1
1 parent 811564a
commit c8c1fb1
Show file tree

Hide file tree

Showing 6 changed files with 70 additions and 12 deletions.
diff --git a/install.py b/install.py
@@ -1,7 +1,8 @@
-import os
 import argparse
+import os
 import subprocess
 
+
 def docker_tag_base():
     return 'vdbbench'
 

diff --git a/vectordb_bench/backend/clients/mongodb/config.py b/vectordb_bench/backend/clients/mongodb/config.py
@@ -1,8 +1,16 @@
+from enum import Enum
+
 from pydantic import BaseModel, SecretStr
 
 from ..api import DBCaseConfig, DBConfig, IndexType, MetricType
 
 
+class QuantizationType(Enum):  # .value is written into the vector index definition's "quantization" field
+    NONE = "none"
+    BINARY = "binary"
+    SCALAR = "scalar"
+
+
 class MongoDBConfig(DBConfig, BaseModel):
     connection_string: SecretStr = "mongodb+srv://<user>:<password>@<cluster_name>.heatl.mongodb.net"
     database: str = "vdb_bench"
@@ -16,9 +24,9 @@ def to_dict(self) -> dict:
 
 class MongoDBIndexConfig(BaseModel, DBCaseConfig):
     index: IndexType = IndexType.HNSW  # MongoDB uses HNSW for vector search
-    metric_type: MetricType | None = None
-    num_candidates: int | None = 1500  # Default numCandidates for vector search
-    exact_search: bool = False  # Whether to use exact (ENN) search
+    metric_type: MetricType = MetricType.COSINE
+    num_candidates_ratio: int = 10  # Default numCandidates ratio for vector search
+    quantization: QuantizationType = QuantizationType.NONE  # Quantization type if applicable
 
     def parse_metric(self) -> str:
         if self.metric_type == MetricType.L2:
@@ -36,9 +44,10 @@ def index_param(self) -> dict:
                     "similarity": self.parse_metric(),
                     "numDimensions": None,  # Will be set in MongoDB class
                     "path": "vector",  # Vector field name
+                    "quantization": self.quantization.value,
                 }
             ],
         }
 
     def search_param(self) -> dict:
-        return {"numCandidates": self.num_candidates if not self.exact_search else None, "exact": self.exact_search}
+        return {"num_candidates_ratio": self.num_candidates_ratio}  # search_embedding reads this key as k * ratio, capped at 10000
diff --git a/vectordb_bench/backend/clients/mongodb/mongodb.py b/vectordb_bench/backend/clients/mongodb/mongodb.py
@@ -90,7 +90,7 @@ def _create_index(self) -> None:
                             break
                         log.info(f"index deleting {indices}")
                 except Exception:
-                    log.exception("Error dropping index")
+                    log.exception(f"Error dropping index {index_name}")
         try:
             # Create vector search index
             search_index = SearchIndexModel(definition=index_params, name=index_name, type="vectorSearch")
@@ -104,7 +104,7 @@ def _create_index(self) -> None:
             log.info(f"Created index on {self.id_field} field")
 
         except Exception:
-            log.exception("Error creating index")
+            log.exception(f"Error creating index {index_name}")
             raise
 
     def _wait_for_index_ready(self, index_name: str, check_interval: int = 5) -> None:
@@ -167,16 +167,15 @@ def search_embedding(
         else:
             # Set numCandidates based on k value and data size
             # For 50K dataset, use higher multiplier for better recall
-            num_candidates = min(10000, max(k * 20, search_params["numCandidates"] or 0))
+            num_candidates = min(10000, k * search_params["num_candidates_ratio"])
             vector_search["numCandidates"] = num_candidates
 
         # Add filter if specified
         if filters:
             log.info(f"Applying filter: {filters}")
             vector_search["filter"] = {
-                "id": {"gt": filters["id"]},
+                "id": {"gte": filters["id"]},
             }
-
         pipeline = [
             {"$vectorSearch": vector_search},
             {

diff --git a/vectordb_bench/frontend/config/dbCaseConfigs.py b/vectordb_bench/frontend/config/dbCaseConfigs.py
@@ -1041,6 +1041,26 @@ class CaseConfigInput(BaseModel):
 )
 
 
+CaseConfigParamInput_MongoDBQuantizationType = CaseConfigInput(  # UI selector for the MongoDB "quantization" case parameter
+    label=CaseConfigParamType.mongodb_quantization_type,
+    inputType=InputType.Option,
+    inputConfig={
+        "options": ["none", "scalar", "binary"],  # must stay in sync with QuantizationType values in mongodb/config.py
+    },
+)
+
+
+CaseConfigParamInput_MongoDBNumCandidatesRatio = CaseConfigInput(  # UI number input for the "num_candidates_ratio" case parameter
+    label=CaseConfigParamType.mongodb_num_candidates_ratio,
+    inputType=InputType.Number,
+    inputConfig={
+        "min": 10,
+        "max": 20,
+        "value": 10,  # same as the MongoDBIndexConfig.num_candidates_ratio default
+    },
+)
+
+
 MilvusLoadConfig = [
     CaseConfigParamInput_IndexType,
     CaseConfigParamInput_M,
@@ -1224,6 +1244,14 @@ class CaseConfigInput(BaseModel):
     CaseConfigParamInput_NumCandidates_AliES,
 ]
 
+MongoDBLoadingConfig = [  # registered under CaseLabel.Load for DB.MongoDB in CASE_CONFIG_MAP
+    CaseConfigParamInput_MongoDBQuantizationType,
+]
+MongoDBPerformanceConfig = [  # registered under CaseLabel.Performance for DB.MongoDB in CASE_CONFIG_MAP
+    CaseConfigParamInput_MongoDBQuantizationType,
+    CaseConfigParamInput_MongoDBNumCandidatesRatio,
+]
+
 CASE_CONFIG_MAP = {
     DB.Milvus: {
         CaseLabel.Load: MilvusLoadConfig,
@@ -1272,4 +1300,8 @@ class CaseConfigInput(BaseModel):
         CaseLabel.Load: AliyunOpensearchLoadingConfig,
         CaseLabel.Performance: AliyunOpenSearchPerformanceConfig,
     },
+    DB.MongoDB: {
+        CaseLabel.Load: MongoDBLoadingConfig,
+        CaseLabel.Performance: MongoDBPerformanceConfig,
+    },
 }
diff --git a/vectordb_bench/log_util.py b/vectordb_bench/log_util.py
@@ -1,8 +1,13 @@
 import logging
 from logging import config
+from pathlib import Path
 
 
 def init(log_level: str):
+    # Create logs directory if it doesn't exist
+    log_dir = Path("logs")
+    log_dir.mkdir(exist_ok=True)
+
     log_config = {
         "version": 1,
         "disable_existing_loggers": False,
@@ -24,15 +29,23 @@ def init(log_level: str):
                 "class": "logging.StreamHandler",
                 "formatter": "default",
             },
+            "file": {
+                "class": "logging.handlers.RotatingFileHandler",
+                "formatter": "default",
+                "filename": "logs/vectordb_bench.log",
+                "maxBytes": 10485760,  # 10MB
+                "backupCount": 5,
+                "encoding": "utf8"
+            }
         },
         "loggers": {
             "vectordb_bench": {
-                "handlers": ["console"],
+                "handlers": ["console", "file"],
                 "level": log_level,
                 "propagate": False,
             },
             "no_color": {
-                "handlers": ["no_color_console"],
+                "handlers": ["no_color_console", "file"],
                 "level": log_level,
                 "propagate": False,
             },

diff --git a/vectordb_bench/models.py b/vectordb_bench/models.py
@@ -88,6 +88,10 @@ class CaseConfigParamType(Enum):
     numSearchThreads = "num_search_threads"
     maxNumPrefetchDatasets = "max_num_prefetch_datasets"
 
+    # mongodb params
+    mongodb_quantization_type = "quantization"
+    mongodb_num_candidates_ratio = "num_candidates_ratio"
+
 
 class CustomizedCase(BaseModel):
     pass