Initial commit

alwayslove2013 · Jun 14, 2023 · 99cb037 · 99cb037
commit 99cb037
Show file tree

Hide file tree

Showing 82 changed files with 9,050 additions and 0 deletions.
diff --git a/.env.example b/.env.example
@@ -0,0 +1,11 @@
+# LOG_LEVEL=
+# LOG_PATH=
+# LOG_NAME=
+# TIMEZONE=
+
+# NUM_PER_BATCH=
+# DEFAULT_DATASET_URL=
+
+DATASET_LOCAL_DIR="/tmp/vector_db_bench/dataset"
+
+# DROP_OLD = True
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,8 @@
+*.sw[op]
+*.egg-info
+dist/
+__pycache__
+.env
+.data/
+__MACOSX
+.DS_Store
diff --git a/.ruff.toml b/.ruff.toml
@@ -0,0 +1,49 @@
+# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
+# Enable flake8-bugbear (`B`) rules.
+select = ["E", "F", "B"]
+ignore = [
+    "E501", # (line length violations)
+]
+
+# Allow autofix for all enabled rules (when `--fix` is provided).
+fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM",
+    "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT",
+    "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT",
+]
+unfixable = []
+
+# Exclude a variety of commonly ignored directories.
+exclude = [
+    ".bzr",
+    ".direnv",
+    ".eggs",
+    ".git",
+    ".hg",
+    ".mypy_cache",
+    ".nox",
+    ".pants.d",
+    ".pytype",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    "__pypackages__",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "node_modules",
+    "venv",
+    "__pycache__",
+    "__init__.py",
+]
+
+# Allow unused variables when underscore-prefixed.
+dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
+
+# Assume Python 3.11.
+target-version = "py311"
+
+[mccabe]
+# Unlike Flake8, default to a complexity level of 10.
+max-complexity = 10
diff --git a/.streamlit/config.toml b/.streamlit/config.toml
@@ -0,0 +1,4 @@
+[theme]
+primaryColor="#3670F2"
+secondaryBackgroundColor="#F0F2F6"
+base="light"
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Zilliztech
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,50 @@
+requires: `python >= 3.11`
+
+## 1. Quick Start
+### Installation
+```shell
+$ pip install vector_db_bench
+```
+
+### Run
+```shell
+$ init_bench
+```
+
+### View app in browser
+
+Local URL: http://localhost:8501
+
+## 2. How to run test server
+
+### Install requirements
+``` shell
+pip install -e '.[test]'
+```
+
+### Run test server
+```
+$ python -m vector_db_bench
+```
+
+OR:
+
+```shell
+$ init_bench
+```
+
+## 3. How to check coding styles
+
+```shell
+$ ruff check vector_db_bench
+```
+
+Add `--fix` if you want to fix the coding styles automatically
+```shell
+$ ruff check vector_db_bench --fix
+```
+
+## 4. How to run unit tests
+```
+pytest -sv tests/
+```
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,52 @@
+[build-system]
+requires = ["setuptools>=67.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "vector_db_bench"
+# authors = [
+#   { name="", email="" },
+# ]
+description = ""
+readme = "README.md"
+requires-python = ">=3.10"
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+]
+dependencies = [
+    "pytz",
+    "streamlit-autorefresh",
+    "streamlit>=1.23.0",
+    "streamlit_extras",
+    "grpcio==1.53.0", # for qdrant-client and pymilvus
+    "grpcio-tools==1.53.0", # for qdrant-client and pymilvus
+    "pymilvus", # with pandas, numpy, ujson
+    "qdrant-client",
+    "pinecone-client",
+    "weaviate-client",
+    "elasticsearch",
+    "plotly",
+    "pydantic==v1.10.7", # for qdrant-client
+    "environs",
+    "scikit-learn",
+    "s3fs",
+    "psutil",
+]
+version = "0.0.1"
+
+[project.optional-dependencies]
+test = [
+    "ruff",
+    "pytest",
+]
+
+
+# [project.urls]
+# "Homepage"          = ""
+# "Docs: User Guide"  = ""
+# "Source Code"       = ""
+
+[project.scripts]
+init_bench = "vector_db_bench.__main__:main"
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,5 @@
+import os
+import sys
+
+from os.path import dirname, abspath
+sys.path.append(dirname(dirname(abspath(__file__))))  # add the parent of tests/ (the repo root) to the import path
diff --git a/tests/pytest.ini b/tests/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+filterwarnings = 
+    ignore::UserWarning
diff --git a/tests/test_bench_runner.py b/tests/test_bench_runner.py
@@ -0,0 +1,60 @@
+import time
+import logging
+from vector_db_bench.interface import BenchMarkRunner
+from vector_db_bench.models import (
+    DB, IndexType, CaseType, TaskConfig, CaseConfig,
+)
+
+log = logging.getLogger(__name__)
+
+class TestBenchRunner:  # Exercises BenchMarkRunner (get_results/run/stop_running) and TaskConfig JSON serialisation.
+    def test_get_results(self):  # Fetch results from a fresh runner; the value is only logged, not asserted.
+        runner = BenchMarkRunner()
+
+        result = runner.get_results()
+        log.info(f"test result: {result}")
+
+    def test_performance_case_whole(self):  # Run one Milvus/Flat PerformanceSZero task, then log the results.
+        runner = BenchMarkRunner()
+
+        task_config=TaskConfig(
+            db=DB.Milvus,
+            db_config=DB.Milvus.config(),
+            db_case_config=DB.Milvus.case_config_cls(index=IndexType.Flat)(),
+            case_config=CaseConfig(case_id=CaseType.PerformanceSZero),
+        )
+
+        runner.run([task_config])
+        runner._sync_running_task()  # presumably blocks until the submitted task finishes; confirm against BenchMarkRunner
+        result = runner.get_results()
+        log.info(f"test result: {result}")
+
+    def test_performance_case_clean(self):  # Start the same task, then stop it after a short delay.
+        runner = BenchMarkRunner()
+
+        task_config=TaskConfig(
+            db=DB.Milvus,
+            db_config=DB.Milvus.config(),
+            db_case_config=DB.Milvus.case_config_cls(index=IndexType.Flat)(),
+            case_config=CaseConfig(case_id=CaseType.PerformanceSZero),
+        )
+
+        runner.run([task_config])
+        time.sleep(3)  # let the run proceed for 3 seconds before stop_running() is called
+        runner.stop_running()
+
+    def test_performance_case_no_error(self):  # Build a ZillizCloud TaskConfig and round-trip it through JSON; no runner is used.
+        task_config=TaskConfig(
+            db=DB.ZillizCloud,
+            db_config=DB.ZillizCloud.config(uri="xxx", user="abc", password="1234"),  # placeholder credentials
+            db_case_config=DB.ZillizCloud.case_config_cls()(),
+            case_config=CaseConfig(case_id=CaseType.PerformanceSZero),
+        )
+
+        t = task_config.copy()
+        d = t.json(exclude={'db_config': {'password', 'api_key'}})  # serialise without db_config's password/api_key
+        log.info(f"{d}")
+
+        import ujson  # imported locally; no other test in this file uses it
+        loads = ujson.loads(d)  # parse the JSON string back; the result is only logged
+        log.info(f"{loads}")
diff --git a/tests/test_case.py b/tests/test_case.py
@@ -0,0 +1,99 @@
+import pytest
+import logging
+import vector_db_bench.backend.dataset as ds
+from vector_db_bench.models import DB, IndexType
+from vector_db_bench.backend import cases
+from vector_db_bench.backend.clients.milvus import Milvus
+from vector_db_bench.backend.clients.weaviate import Weaviate
+
+log  = logging.getLogger(__name__)
+class TestCases:  # Builds and runs load/performance cases; presumably needs reachable backends (Milvus by default), TODO confirm.
+    def test_init_LoadCase(self):  # Construct a LoadSDimCase and log its fields; run() is not called.
+        c = cases.LoadSDimCase(run_id=1, db_class=Milvus)  # NOTE(review): other tests pass db_configs=(...); confirm db_class is still accepted
+        log.debug(f"c: {c}, {c.dict().keys()}")
+
+    def test_case_type(self):  # Log CaseType.LoadLDim; no assertion is made.
+        from vector_db_bench.models import CaseType
+        log.debug(f"{CaseType.LoadLDim}")
+
+    def test_performance_case_small_zero(self):  # PerformanceSZero on Milvus with a Flat index.
+        dataset = ds.get(ds.Name.Cohere, ds.Label.SMALL)
+        # milvus crash
+        #  db_case_config = DB.Milvus.case_config_cls(IndexType.HNSW)(
+        #      M=8,
+        #      efConstruction=32,
+        #      ef=8,
+        #  )
+
+        db_case_config = DB.Milvus.case_config_cls(IndexType.Flat)()
+        db_case_config.metric_type = dataset.data.metric_type  # use the metric declared by the dataset
+        c = cases.PerformanceSZero(run_id=1, db_configs=(
+            DB.Milvus.init_cls,
+            DB.Milvus.config().to_dict(),
+            db_case_config,
+        ))
+        c.run()
+
+    @pytest.mark.skip(reason="replace url and api_key by real value")
+    def test_performance_case_small_zero_weaviate(self):  # PerformanceSZero on WeaviateCloud; skipped until real credentials are supplied.
+        dataset = ds.get(ds.Name.Cohere, ds.Label.SMALL)
+        db_case_config = DB.WeaviateCloud.case_config_cls()()
+        db_case_config.metric_type = dataset.data.metric_type
+
+        c = cases.PerformanceSZero(run_id=1, db_configs=(  # ordered tuple like the Milvus tests; a set loses order and cannot hold an unhashable dict
+            DB.WeaviateCloud.init_cls,
+            DB.WeaviateCloud.config(url="", api_key="").to_dict(),
+            db_case_config,
+        ))
+        c.run()
+
+    def test_performance_case_small_low_filter(self):  # PerformanceSLow on Milvus with a Flat index.
+        dataset = ds.get(ds.Name.Cohere, ds.Label.SMALL)
+
+        db_case_config = DB.Milvus.case_config_cls(IndexType.Flat)()
+        db_case_config.metric_type = dataset.data.metric_type
+        c = cases.PerformanceSLow(run_id=2, db_configs=(
+            DB.Milvus.init_cls,
+            DB.Milvus.config().to_dict(),
+            db_case_config,
+        ))
+        c.run()
+
+    def test_performance_case_small_high_filter(self):  # PerformanceSHigh on Milvus with a Flat index.
+        dataset = ds.get(ds.Name.Cohere, ds.Label.SMALL)
+        db_case_config = DB.Milvus.case_config_cls(IndexType.Flat)()
+        db_case_config.metric_type = dataset.data.metric_type
+
+        c = cases.PerformanceSHigh(run_id=3, db_configs=(
+            DB.Milvus.init_cls,
+            DB.Milvus.config().to_dict(),
+            db_case_config,
+        ))
+        c.run()
+
+    def test_load_small_dim(self):  # LoadSDimCase on Milvus, with the metric taken from the small SIFT dataset.
+        dataset = ds.get(ds.Name.SIFT, ds.Label.SMALL)
+        db_case_config = DB.Milvus.case_config_cls(IndexType.Flat)()
+        db_case_config.metric_type = dataset.data.metric_type
+
+        c = cases.LoadSDimCase(run_id=1, db_configs=(
+            DB.Milvus.init_cls,
+            DB.Milvus.config().to_dict(),
+            db_case_config,
+        ))
+        c.run()
+
+    def test_performance_case_medium_zero(self):  # PerformanceMZero on Milvus, with the metric taken from the medium Cohere dataset.
+        dataset = ds.get(ds.Name.Cohere, ds.Label.MEDIUM)
+        db_case_config = DB.Milvus.case_config_cls(IndexType.Flat)()
+        db_case_config.metric_type = dataset.data.metric_type
+        c = cases.PerformanceMZero(run_id=1, db_configs=(
+            DB.Milvus.init_cls,
+            DB.Milvus.config().to_dict(),
+            db_case_config,
+        ))
+
+        #  c.dataset.prepare(False)
+        #  c._insert_train_data()
+        c.run()
+