Add timeout by dataset size

Signed-off-by: yangxuan <[email protected]>
alwayslove2013 · Jul 4, 2023 · 13fdd8d · 13fdd8d
1 parent b060486
commit 13fdd8d
Show file tree

Hide file tree

Showing 13 changed files with 172 additions and 109 deletions.
diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,5 @@ __pycache__
 __MACOSX
 .DS_Store
 build/
+venv/
+.idea/
diff --git a/tests/test_models.py b/tests/test_models.py
@@ -65,6 +65,6 @@ def test_test_result_merge(self):
 
     def test_test_result_display(self):
         result_dir = config.RESULTS_LOCAL_DIR
-        for json_file in result_dir.glob("*.json"):
+        for json_file in result_dir.glob("result*.json"):
             res = TestResult.read_file(json_file)
             res.display()
diff --git a/vectordb_bench/__init__.py b/vectordb_bench/__init__.py
@@ -18,12 +18,23 @@ class config:
     USE_SHUFFLED_DATA = env.bool("USE_SHUFFLED_DATA", True)
 
     RESULTS_LOCAL_DIR = pathlib.Path(__file__).parent.joinpath("results")
-    CASE_TIMEOUT_IN_SECOND = 24 * 60 * 60
+
+    CAPACITY_TIMEOUT_IN_SECONDS =  24 * 3600 # 24h
+    LOAD_TIMEOUT_1M             = 1.5 * 3600 # 1.5h
+    LOAD_TIMEOUT_10M            =  15 * 3600 # 15h
+    LOAD_TIMEOUT_100M           = 150 * 3600 # 6.25d
+
+    OPTIMIZE_TIMEOUT_1M         =  15 * 60   # 15min
+    OPTIMIZE_TIMEOUT_10M        = 2.5 * 3600 # 2.5h
+    OPTIMIZE_TIMEOUT_100M       =  25 * 3600 # 1.04d
 
 
     def display(self) -> str:
-        tmp = [i for i in inspect.getmembers(self)
-            if not inspect.ismethod(i[1]) and not i[0].startswith('_') \
+        tmp = [
+            i for i in inspect.getmembers(self)
+            if not inspect.ismethod(i[1])
+            and not i[0].startswith('_')
+            and "TIMEOUT" not in i[0]
         ]
         return tmp
 

diff --git a/vectordb_bench/backend/cases.py b/vectordb_bench/backend/cases.py
@@ -2,8 +2,10 @@
 import logging
 from enum import Enum, auto
 
+from vectordb_bench import config
+from vectordb_bench.base import BaseModel
+
 from .dataset import Dataset, DatasetManager
-from ..base import BaseModel
 
 
 log = logging.getLogger(__name__)
@@ -75,6 +77,9 @@ class Case(BaseModel):
     description: str
     dataset: DatasetManager
 
+    load_timeout: float | int
+    optimize_timeout: float | int | None
+
     filter_rate: float | None
 
     @property
@@ -92,6 +97,8 @@ def filters(self) -> dict | None:
 class CapacityCase(Case, BaseModel):
     label: CaseLabel = CaseLabel.Load
     filter_rate: float | None = None
+    load_timeout: float | int = config.CAPACITY_TIMEOUT_IN_SECONDS
+    optimize_timeout: float | int | None = None
 
 
 class PerformanceCase(Case, BaseModel):
@@ -121,6 +128,8 @@ class Performance10M(PerformanceCase):
     name: str = "Search Performance Test (10M Dataset, 768 Dim)"
     description: str = """This case tests the search performance of a vector database with a large dataset (<b>Cohere 10M vectors</b>, 768 dimensions) at varying parallel levels.
 Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_10M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_10M
 
 
 class Performance1M(PerformanceCase):
@@ -129,6 +138,8 @@ class Performance1M(PerformanceCase):
     name: str = "Search Performance Test (1M Dataset, 768 Dim)"
     description: str = """This case tests the search performance of a vector database with a medium dataset (<b>Cohere 1M vectors</b>, 768 dimensions) at varying parallel levels.
 Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_1M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1M
 
 
 class Performance10M1P(PerformanceCase):
@@ -138,6 +149,8 @@ class Performance10M1P(PerformanceCase):
     name: str = "Filtering Search Performance Test (10M Dataset, 768 Dim, Filter 1%)"
     description: str = """This case tests the search performance of a vector database with a large dataset (<b>Cohere 10M vectors</b>, 768 dimensions) under a low filtering rate (<b>1% vectors</b>), at varying parallel levels.
 Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_10M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_10M
 
 
 class Performance1M1P(PerformanceCase):
@@ -147,6 +160,8 @@ class Performance1M1P(PerformanceCase):
     name: str = "Filtering Search Performance Test (1M Dataset, 768 Dim, Filter 1%)"
     description: str = """This case tests the search performance of a vector database with a medium dataset (<b>Cohere 1M vectors</b>, 768 dimensions) under a low filtering rate (<b>1% vectors</b>), at varying parallel levels.
 Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_1M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1M
 
 
 class Performance10M99P(PerformanceCase):
@@ -156,6 +171,8 @@ class Performance10M99P(PerformanceCase):
     name: str = "Filtering Search Performance Test (10M Dataset, 768 Dim, Filter 99%)"
     description: str = """This case tests the search performance of a vector database with a large dataset (<b>Cohere 10M vectors</b>, 768 dimensions) under a high filtering rate (<b>99% vectors</b>), at varying parallel levels.
 Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_10M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_10M
 
 
 class Performance1M99P(PerformanceCase):
@@ -165,6 +182,8 @@ class Performance1M99P(PerformanceCase):
     name: str = "Filtering Search Performance Test (1M Dataset, 768 Dim, Filter 99%)"
     description: str = """This case tests the search performance of a vector database with a medium dataset (<b>Cohere 1M vectors</b>, 768 dimensions) under a high filtering rate (<b>99% vectors</b>), at varying parallel levels.
 Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_1M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_1M
 
 
 
@@ -175,6 +194,8 @@ class Performance100M(PerformanceCase):
     name: str = "Search Performance Test (100M Dataset, 768 Dim)"
     description: str = """This case tests the search performance of a vector database with a large 100M dataset (<b>LAION 100M vectors</b>, 768 dimensions), at varying parallel levels.
 Results will show index building time, recall, and maximum QPS."""
+    load_timeout: float | int = config.LOAD_TIMEOUT_100M
+    optimize_timeout: float | int | None = config.OPTIMIZE_TIMEOUT_100M
 
 
 type2case = {

diff --git a/vectordb_bench/backend/clients/api.py b/vectordb_bench/backend/clients/api.py
@@ -73,7 +73,7 @@ class VectorDB(ABC):
 
     In each process, the benchmark cases ensure VectorDB.init() calls before any other methods operations
 
-    insert_embeddings, search_embedding, and, ready_to_search will be timed for each call.
+    insert_embeddings, search_embedding, and optimize will be timed for each call.
 
     Examples:
         >>> milvus = Milvus()
@@ -166,13 +166,14 @@ def search_embedding(
 
     # TODO: remove
     @abstractmethod
-    def ready_to_search(self):
-        """ready_to_search will be called between insertion and search in performance cases.
+    def optimize(self):
+        """optimize will be called between insertion and search in performance cases.
 
         Should be blocked until the vectorDB is ready to be tested on
         heavy performance cases.
 
-        Time(insert the dataset) + Time(ready_to_search) will be recorded as "load_duration" metric
+        Time(insert the dataset) + Time(optimize) will be recorded as "load_duration" metric
+        Optimize's execution time is limited; the limit depends on the case.
         """
         raise NotImplementedError
 

diff --git a/vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py b/vectordb_bench/backend/clients/elastic_cloud/elastic_cloud.py
@@ -143,8 +143,8 @@ def search_embedding(
             log.warning(f"Failed to search: {self.indice} error: {str(e)}")
             raise e from None
 
-    def ready_to_search(self):
-        """ready_to_search will be called between insertion and search in performance cases."""
+    def optimize(self):
+        """optimize will be called between insertion and search in performance cases."""
         pass
 
     def ready_to_load(self):

diff --git a/vectordb_bench/backend/clients/milvus/milvus.py b/vectordb_bench/backend/clients/milvus/milvus.py
@@ -53,7 +53,7 @@ def __init__(
             log.info(f"{self.name} create collection: {self.collection_name}")
 
             # Create the collection
-            coll = Collection(
+            Collection(
                 name=self.collection_name,
                 schema=CollectionSchema(fields),
                 consistency_level="Session",
@@ -107,6 +107,14 @@ def _pre_load(self, coll: Collection):
 
     def _optimize(self):
         log.info(f"{self.name} optimizing before search")
+        try:
+            self.col.load()
+        except Exception as e:
+            log.warning(f"{self.name} optimize error: {e}")
+            raise e from None
+
+    def _post_insert(self):
+        log.info(f"{self.name} post insert before optimize")
         try:
             self.col.flush()
             self.col.compact()
@@ -119,10 +127,6 @@ def _optimize(self):
                 index_name=self._index_name,
             )
             utility.wait_for_index_building_complete(self.collection_name)
-            self.col.load()
-            #  self.col.load(_refresh=True)
-            #  utility.wait_for_loading_complete(self.collection_name)
-            #  import time; time.sleep(10)
         except Exception as e:
             log.warning(f"{self.name} optimize error: {e}")
             raise e from None
@@ -132,7 +136,7 @@ def ready_to_load(self):
         self._pre_load(self.col)
         pass
 
-    def ready_to_search(self):
+    def optimize(self):
         assert self.col, "Please call self.init() before"
         self._optimize()
 
@@ -157,6 +161,8 @@ def insert_embeddings(
                 ]
                 res = self.col.insert(insert_data, **kwargs)
                 insert_count += len(res.primary_keys)
+            if kwargs.get("last_batch"):
+                self._post_insert()
         except MilvusException as e:
             log.warning("Failed to insert data")
             return (insert_count, e)

diff --git a/vectordb_bench/backend/clients/pinecone/pinecone.py b/vectordb_bench/backend/clients/pinecone/pinecone.py
@@ -69,7 +69,7 @@ def init(self) -> None:
     def ready_to_load(self):
         pass
 
-    def ready_to_search(self):
+    def optimize(self):
         pass
 
     def insert_embeddings(

diff --git a/vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py b/vectordb_bench/backend/clients/qdrant_cloud/qdrant_cloud.py
@@ -74,7 +74,7 @@ def ready_to_load(self):
         pass
 
 
-    def ready_to_search(self):
+    def optimize(self):
         assert self.qdrant_client, "Please call self.init() before"
         # wait for vectors to be fully indexed
         SECONDS_WAITING_FOR_INDEXING_API_CALL = 5

diff --git a/vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py b/vectordb_bench/backend/clients/weaviate_cloud/weaviate_cloud.py
@@ -70,7 +70,7 @@ def ready_to_load(self):
         """Should call insert first, do nothing"""
         pass
 
-    def ready_to_search(self):
+    def optimize(self):
         assert self.client.schema.exists(self.collection_name)
         self.client.schema.update_config(self.collection_name, {"vectorIndexConfig": self.case_config.search_param() } )
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,3 +7,5 @@ __pycache__ @@
     __MACOSX
     .DS_Store
     build/
+    venv/
+    .idea/