From 00e4af207ec56f7387c0a0c08e5abaa1f7cdc1c6 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 29 Nov 2024 13:44:18 +0100 Subject: [PATCH 01/13] Added euclidean metric to basic backend --- vicinity/backends/basic.py | 92 +++++++++++++++++++++++++------------- vicinity/vicinity.py | 1 + 2 files changed, 61 insertions(+), 32 deletions(-) diff --git a/vicinity/backends/basic.py b/vicinity/backends/basic.py index 7f5d5e8..937a1be 100644 --- a/vicinity/backends/basic.py +++ b/vicinity/backends/basic.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from pathlib import Path -from typing import Any +from typing import Any, Literal import numpy as np from numpy import typing as npt @@ -13,7 +13,8 @@ @dataclass -class BasicArgs(BaseArgs): ... +class BasicArgs(BaseArgs): + metric: Literal["cosine", "euclidean"] = "cosine" class BasicBackend(AbstractBackend[BasicArgs]): @@ -24,6 +25,8 @@ def __init__(self, vectors: npt.NDArray, arguments: BasicArgs) -> None: super().__init__(arguments) self._vectors = vectors self._norm_vectors: npt.NDArray | None = None + self._squared_norm_vectors: npt.NDArray | None = None + self._update_precomputed_data() def __len__(self) -> int: """Get the number of vectors.""" @@ -37,7 +40,8 @@ def backend_type(self) -> Backend: @classmethod def from_vectors(cls: type[BasicBackend], vectors: npt.NDArray, **kwargs: Any) -> BasicBackend: """Create a new instance from vectors.""" - return cls(vectors, BasicArgs()) + arguments = BasicArgs(**kwargs) + return cls(vectors, arguments) @classmethod def load(cls: type[BasicBackend], folder: Path) -> BasicBackend: @@ -70,9 +74,18 @@ def vectors(self, x: Matrix) -> None: if not np.ndim(matrix) == 2: raise ValueError(f"Your array does not have 2 dimensions: {np.ndim(matrix)}") self._vectors = matrix - # Make sure norm vectors is updated. - if self._norm_vectors is not None: - self._norm_vectors = normalize_or_copy(matrix) + self._update_precomputed_data() + + def squared_norm(self, x: np.ndarray) -> np.ndarray: + """Compute the squared norm of a matrix.""" + return (x**2).sum(1) + + def _update_precomputed_data(self) -> None: + """Update precomputed data based on the metric.""" + if self.arguments.metric == "cosine": + self._norm_vectors = normalize_or_copy(self._vectors) + elif self.arguments.metric == "euclidean": + self._squared_norm_vectors = self.squared_norm(self._vectors) @property def norm_vectors(self) -> npt.NDArray: @@ -85,17 +98,24 @@ def norm_vectors(self) -> npt.NDArray: self._norm_vectors = normalize_or_copy(self.vectors) return self._norm_vectors + @property + def squared_norm_vectors(self) -> npt.NDArray: + """The squared norms of the vectors.""" + if self._squared_norm_vectors is None: + self._squared_norm_vectors = self.squared_norm(self.vectors) + return self._squared_norm_vectors + def threshold( self, vectors: npt.NDArray, threshold: float, ) -> list[npt.NDArray]: - """Batched cosine similarity.""" + """Batched distance thresholding.""" out: list[npt.NDArray] = [] for i in range(0, len(vectors), 1024): batch = vectors[i : i + 1024] - distances = self._dist(batch, self.norm_vectors) - for _, sims in enumerate(distances): + distances = self._dist(batch) + for sims in distances: indices = np.flatnonzero(sims <= threshold) sorted_indices = indices[np.argsort(sims[indices])] out.append(sorted_indices) @@ -107,43 +127,51 @@ def query( vectors: npt.NDArray, k: int, ) -> QueryResult: - """Batched cosine distance.""" + """Batched distance query.""" if k < 1: - raise ValueError("num should be >= 1, is now {num}") + raise ValueError(f"k should be >= 1, is now {k}") out: QueryResult = [] + num_vectors = len(self.vectors) + effective_k = min(k, num_vectors) for index in range(0, len(vectors), 1024): batch = vectors[index : index + 1024] - distances = self._dist(batch, self.norm_vectors) - if k == 1: - sorted_indices = np.argmin(distances, 1, keepdims=True) - elif k >= len(self.vectors): - # If we want more than we have, just sort everything. - sorted_indices = np.stack([np.arange(len(self.vectors))] * len(vectors)) - else: - sorted_indices = np.argpartition(distances, kth=k, axis=1) - sorted_indices = sorted_indices[:, :k] - for lidx, indices in enumerate(sorted_indices): - dists_for_word = distances[lidx, indices] - word_index = np.argsort(dists_for_word) - i = indices[word_index] - d = dists_for_word[word_index] - out.append((i, d)) + distances = self._dist(batch) - return out + # Use argpartition for efficiency + indices = np.argpartition(distances, kth=effective_k - 1, axis=1)[:, :effective_k] + sorted_indices = np.take_along_axis( + indices, np.argsort(np.take_along_axis(distances, indices, axis=1)), axis=1 + ) + sorted_distances = np.take_along_axis(distances, sorted_indices, axis=1) - @classmethod - def _dist(cls, x: npt.NDArray, y: npt.NDArray) -> npt.NDArray: - """Cosine distance function. This assumes y is normalized.""" - sim = normalize(x).dot(y.T) + out.extend(zip(sorted_indices, sorted_distances)) + + return out - return 1 - sim + def _dist(self, x: npt.NDArray) -> npt.NDArray: + """Compute distances between x and self._vectors based on the given metric.""" + if self.arguments.metric == "cosine": + x_norm = normalize(x) + sim = x_norm.dot(self.norm_vectors.T) + return 1 - sim + elif self.arguments.metric == "euclidean": + x_norm = self.squared_norm(x) + dists_squared = (x_norm[:, None] + self.squared_norm_vectors[None, :]) - 2 * (x @ self._vectors.T) + + # Ensure non-negative distances + dists_squared = np.maximum(dists_squared, 1e-12) + return np.sqrt(dists_squared) + else: + raise ValueError(f"Unsupported metric: {self.arguments.metric}") def insert(self, vectors: npt.NDArray) -> None: """Insert vectors into the vector space.""" self._vectors = np.vstack([self._vectors, vectors]) + self._update_precomputed_data() def delete(self, indices: list[int]) -> None: """Deletes specific indices from the vector space.""" self._vectors = np.delete(self._vectors, indices, axis=0) + self._update_precomputed_data() diff --git a/vicinity/vicinity.py b/vicinity/vicinity.py index 6571657..d469325 100644 --- a/vicinity/vicinity.py +++ b/vicinity/vicinity.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +import time from io import open from pathlib import Path from typing import Any, Sequence, Union From 342a3089eecbbffbe028455c10707719f18c5b3b Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 29 Nov 2024 14:40:42 +0100 Subject: [PATCH 02/13] Switched to mixins --- vicinity/backends/basic.py | 164 ++++++++++++++++++++++--------------- 1 file changed, 96 insertions(+), 68 deletions(-) diff --git a/vicinity/backends/basic.py b/vicinity/backends/basic.py index 937a1be..682756f 100644 --- a/vicinity/backends/basic.py +++ b/vicinity/backends/basic.py @@ -1,5 +1,6 @@ from __future__ import annotations +from abc import ABC, abstractmethod from dataclasses import dataclass from pathlib import Path from typing import Any, Literal @@ -17,16 +18,13 @@ class BasicArgs(BaseArgs): metric: Literal["cosine", "euclidean"] = "cosine" -class BasicBackend(AbstractBackend[BasicArgs]): +class BasicBackend(AbstractBackend[BasicArgs], ABC): argument_class = BasicArgs + _vectors: npt.NDArray - def __init__(self, vectors: npt.NDArray, arguments: BasicArgs) -> None: - """Initialize the backend using vectors.""" + def __init__(self, arguments: BasicArgs) -> None: + """Initialize the backend.""" super().__init__(arguments) - self._vectors = vectors - self._norm_vectors: npt.NDArray | None = None - self._squared_norm_vectors: npt.NDArray | None = None - self._update_precomputed_data() def __len__(self) -> int: """Get the number of vectors.""" @@ -37,27 +35,6 @@ def backend_type(self) -> Backend: """The type of the backend.""" return Backend.BASIC - @classmethod - def from_vectors(cls: type[BasicBackend], vectors: npt.NDArray, **kwargs: Any) -> BasicBackend: - """Create a new instance from vectors.""" - arguments = BasicArgs(**kwargs) - return cls(vectors, arguments) - - @classmethod - def load(cls: type[BasicBackend], folder: Path) -> BasicBackend: - """Load the vectors from a path.""" - path = folder / "vectors.npy" - arguments = BasicArgs.load(folder / "arguments.json") - with open(path, "rb") as f: - return cls(np.load(f), arguments) - - def save(self, folder: Path) -> None: - """Save the vectors to a path.""" - path = Path(folder) / "vectors.npy" - self.arguments.dump(folder / "arguments.json") - with open(path, "wb") as f: - np.save(f, self._vectors) - @property def dim(self) -> int: """The size of the space.""" @@ -71,39 +48,52 @@ def vectors(self) -> npt.NDArray: @vectors.setter def vectors(self, x: Matrix) -> None: matrix = np.asarray(x) - if not np.ndim(matrix) == 2: + if np.ndim(matrix) != 2: raise ValueError(f"Your array does not have 2 dimensions: {np.ndim(matrix)}") self._vectors = matrix self._update_precomputed_data() - def squared_norm(self, x: np.ndarray) -> np.ndarray: - """Compute the squared norm of a matrix.""" - return (x**2).sum(1) - + @abstractmethod def _update_precomputed_data(self) -> None: """Update precomputed data based on the metric.""" - if self.arguments.metric == "cosine": - self._norm_vectors = normalize_or_copy(self._vectors) - elif self.arguments.metric == "euclidean": - self._squared_norm_vectors = self.squared_norm(self._vectors) + pass - @property - def norm_vectors(self) -> npt.NDArray: - """ - Vectors, but normalized to unit length. + @abstractmethod + def _dist(self, x: npt.NDArray) -> npt.NDArray: + """Compute distances between x and self._vectors based on the metric.""" + pass - NOTE: when all vectors are unit length, this attribute _is_ vectors. - """ - if self._norm_vectors is None: - self._norm_vectors = normalize_or_copy(self.vectors) - return self._norm_vectors + @classmethod + def from_vectors(cls, vectors: npt.NDArray, **kwargs: Any) -> BasicBackend: + """Create a new instance from vectors.""" + arguments = BasicArgs(**kwargs) + if arguments.metric == "cosine": + return CosineBasicBackend(vectors, arguments) + elif arguments.metric == "euclidean": + return EuclideanBasicBackend(vectors, arguments) + else: + raise ValueError(f"Unsupported metric: {arguments.metric}") - @property - def squared_norm_vectors(self) -> npt.NDArray: - """The squared norms of the vectors.""" - if self._squared_norm_vectors is None: - self._squared_norm_vectors = self.squared_norm(self.vectors) - return self._squared_norm_vectors + @classmethod + def load(cls, folder: Path) -> BasicBackend: + """Load the vectors from a path.""" + path = folder / "vectors.npy" + arguments = BasicArgs.load(folder / "arguments.json") + with open(path, "rb") as f: + vectors = np.load(f) + if arguments.metric == "cosine": + return CosineBasicBackend(vectors, arguments) + elif arguments.metric == "euclidean": + return EuclideanBasicBackend(vectors, arguments) + else: + raise ValueError(f"Unsupported metric: {arguments.metric}") + + def save(self, folder: Path) -> None: + """Save the vectors to a path.""" + path = folder / "vectors.npy" + self.arguments.dump(folder / "arguments.json") + with open(path, "wb") as f: + np.save(f, self._vectors) def threshold( self, @@ -150,22 +140,6 @@ def query( return out - def _dist(self, x: npt.NDArray) -> npt.NDArray: - """Compute distances between x and self._vectors based on the given metric.""" - if self.arguments.metric == "cosine": - x_norm = normalize(x) - sim = x_norm.dot(self.norm_vectors.T) - return 1 - sim - elif self.arguments.metric == "euclidean": - x_norm = self.squared_norm(x) - dists_squared = (x_norm[:, None] + self.squared_norm_vectors[None, :]) - 2 * (x @ self._vectors.T) - - # Ensure non-negative distances - dists_squared = np.maximum(dists_squared, 1e-12) - return np.sqrt(dists_squared) - else: - raise ValueError(f"Unsupported metric: {self.arguments.metric}") - def insert(self, vectors: npt.NDArray) -> None: """Insert vectors into the vector space.""" self._vectors = np.vstack([self._vectors, vectors]) @@ -175,3 +149,57 @@ def delete(self, indices: list[int]) -> None: """Deletes specific indices from the vector space.""" self._vectors = np.delete(self._vectors, indices, axis=0) self._update_precomputed_data() + + +class CosineBasicBackend(BasicBackend): + def __init__(self, vectors: npt.NDArray, arguments: BasicArgs) -> None: + """Initialize the cosine basic backend.""" + super().__init__(arguments) + self._vectors = vectors + self._norm_vectors: npt.NDArray | None = None + self._update_precomputed_data() + + def _update_precomputed_data(self) -> None: + """Update precomputed data for cosine similarity.""" + self._norm_vectors = normalize_or_copy(self._vectors) + + @property + def norm_vectors(self) -> npt.NDArray: + """Return normalized vectors.""" + if self._norm_vectors is None: + self._norm_vectors = normalize_or_copy(self._vectors) + return self._norm_vectors + + def _dist(self, x: npt.NDArray) -> npt.NDArray: + """Compute cosine distance.""" + x_norm = normalize(x) + sim = x_norm.dot(self.norm_vectors.T) + return 1 - sim + + +class EuclideanBasicBackend(BasicBackend): + def __init__(self, vectors: npt.NDArray, arguments: BasicArgs) -> None: + """Initialize the Euclidean basic backend.""" + super().__init__(arguments) + self._vectors = vectors + self._squared_norm_vectors: npt.NDArray | None = None + self._update_precomputed_data() + + def _update_precomputed_data(self) -> None: + """Update precomputed data for Euclidean distance.""" + self._squared_norm_vectors = (self._vectors**2).sum(1) + + @property + def squared_norm_vectors(self) -> npt.NDArray: + """Return squared norms of vectors.""" + if self._squared_norm_vectors is None: + self._squared_norm_vectors = (self._vectors**2).sum(1) + return self._squared_norm_vectors + + def _dist(self, x: npt.NDArray) -> npt.NDArray: + """Compute Euclidean distance.""" + x_norm = (x**2).sum(1) + dists_squared = (x_norm[:, None] + self.squared_norm_vectors[None, :]) - 2 * (x @ self._vectors.T) + # Ensure non-negative distances + dists_squared = np.maximum(dists_squared, 1e-12) + return np.sqrt(dists_squared) From e89d3546d8addf5c7c8c6cc3d6c5b11996dff3fa Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 29 Nov 2024 14:48:49 +0100 Subject: [PATCH 03/13] Updates --- vicinity/backends/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vicinity/backends/basic.py b/vicinity/backends/basic.py index 682756f..e031abb 100644 --- a/vicinity/backends/basic.py +++ b/vicinity/backends/basic.py @@ -201,5 +201,5 @@ def _dist(self, x: npt.NDArray) -> npt.NDArray: x_norm = (x**2).sum(1) dists_squared = (x_norm[:, None] + self.squared_norm_vectors[None, :]) - 2 * (x @ self._vectors.T) # Ensure non-negative distances - dists_squared = np.maximum(dists_squared, 1e-12) + dists_squared = np.clip(dists_squared, 0, None) return np.sqrt(dists_squared) From 8ac5c95a1dbadedda83529918107767932390bee Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 29 Nov 2024 15:04:23 +0100 Subject: [PATCH 04/13] Updates --- vicinity/backends/basic.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/vicinity/backends/basic.py b/vicinity/backends/basic.py index e031abb..28d59b4 100644 --- a/vicinity/backends/basic.py +++ b/vicinity/backends/basic.py @@ -47,6 +47,7 @@ def vectors(self) -> npt.NDArray: @vectors.setter def vectors(self, x: Matrix) -> None: + """Set the vectors.""" matrix = np.asarray(x) if np.ndim(matrix) != 2: raise ValueError(f"Your array does not have 2 dimensions: {np.ndim(matrix)}") @@ -56,12 +57,12 @@ def vectors(self, x: Matrix) -> None: @abstractmethod def _update_precomputed_data(self) -> None: """Update precomputed data based on the metric.""" - pass + raise NotImplementedError() @abstractmethod def _dist(self, x: npt.NDArray) -> npt.NDArray: """Compute distances between x and self._vectors based on the metric.""" - pass + raise NotImplementedError() @classmethod def from_vectors(cls, vectors: npt.NDArray, **kwargs: Any) -> BasicBackend: @@ -100,7 +101,13 @@ def threshold( vectors: npt.NDArray, threshold: float, ) -> list[npt.NDArray]: - """Batched distance thresholding.""" + """ + Batched distance thresholding. + + :param vectors: The vectors to threshold. + :param threshold: The threshold to use. + :return: A list of lists of indices of vectors that are below the threshold + """ out: list[npt.NDArray] = [] for i in range(0, len(vectors), 1024): batch = vectors[i : i + 1024] @@ -117,7 +124,14 @@ def query( vectors: npt.NDArray, k: int, ) -> QueryResult: - """Batched distance query.""" + """ + Batched distance query. + + :param vectors: The vectors to query. + :param k: The number of nearest neighbors to return. + :return: A list of tuples with the indices and distances. + :raises ValueError: If k is less than 1. + """ if k < 1: raise ValueError(f"k should be >= 1, is now {k}") @@ -125,17 +139,19 @@ def query( num_vectors = len(self.vectors) effective_k = min(k, num_vectors) + # Batch the queries for index in range(0, len(vectors), 1024): batch = vectors[index : index + 1024] distances = self._dist(batch) - # Use argpartition for efficiency + # Efficiently get the k smallest distances indices = np.argpartition(distances, kth=effective_k - 1, axis=1)[:, :effective_k] sorted_indices = np.take_along_axis( indices, np.argsort(np.take_along_axis(distances, indices, axis=1)), axis=1 ) sorted_distances = np.take_along_axis(distances, sorted_indices, axis=1) + # Extend the output with tuples of (indices, distances) out.extend(zip(sorted_indices, sorted_distances)) return out From 55f02d0e5bf8788324aaea135105373426a42c27 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 29 Nov 2024 16:56:30 +0100 Subject: [PATCH 05/13] Aligned metrics --- vicinity/backends/annoy.py | 51 +++++++++++--------- vicinity/backends/basic.py | 68 ++++++++++++-------------- vicinity/backends/faiss.py | 83 +++++++++++++------------------- vicinity/backends/hnsw.py | 21 +++++--- vicinity/backends/pynndescent.py | 33 ++++++++----- vicinity/backends/usearch.py | 54 ++++++++++++--------- vicinity/utils.py | 38 +++++++++++++++ 7 files changed, 197 insertions(+), 151 deletions(-) diff --git a/vicinity/backends/annoy.py b/vicinity/backends/annoy.py index 6a86f0b..926bfaa 100644 --- a/vicinity/backends/annoy.py +++ b/vicinity/backends/annoy.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from pathlib import Path -from typing import Any, Literal +from typing import Any, Union import numpy as np from annoy import AnnoyIndex @@ -10,19 +10,20 @@ from vicinity.backends.base import AbstractBackend, BaseArgs from vicinity.datatypes import Backend, QueryResult -from vicinity.utils import normalize +from vicinity.utils import Metric, normalize @dataclass class AnnoyArgs(BaseArgs): dim: int = 0 - metric: Literal["dot", "euclidean", "cosine"] = "cosine" + metric: str = "cosine" trees: int = 100 length: int | None = None class AnnoyBackend(AbstractBackend[AnnoyArgs]): argument_class = AnnoyArgs + supported_metrics = {Metric.COSINE, Metric.EUCLIDEAN, Metric.INNER_PRODUCT} def __init__( self, @@ -40,25 +41,34 @@ def __init__( def from_vectors( cls: type[AnnoyBackend], vectors: npt.NDArray, - metric: Literal["dot", "euclidean", "cosine"], + metric: Union[str, Metric], trees: int, **kwargs: Any, ) -> AnnoyBackend: """Create a new instance from vectors.""" - dim = vectors.shape[1] - actual_metric: Literal["dot", "euclidean"] - if metric == "cosine": - actual_metric = "dot" + metric_enum = Metric.from_string(metric) + + if metric_enum not in cls.supported_metrics: + raise ValueError(f"Metric '{metric_enum.value}' is not supported by AnnoyBackend.") + + # Map Metric to Annoy's metric parameter + if metric_enum == Metric.COSINE: + metric = "dot" vectors = normalize(vectors) + elif metric_enum == Metric.EUCLIDEAN: + metric = "euclidean" + elif metric_enum == Metric.INNER_PRODUCT: + metric = "dot" else: - actual_metric = metric + raise ValueError(f"Unsupported metric for AnnoyBackend: {metric_enum}") - index = AnnoyIndex(f=dim, metric=actual_metric) + dim = vectors.shape[1] + index = AnnoyIndex(f=dim, metric=metric) # type: ignore for i, vector in enumerate(vectors): index.add_item(i, vector) index.build(trees) - arguments = AnnoyArgs(dim=dim, trees=trees, metric=metric, length=len(vectors)) + arguments = AnnoyArgs(dim=dim, metric=metric, trees=trees, length=len(vectors)) # type: ignore return AnnoyBackend(index, arguments=arguments) @property @@ -80,11 +90,7 @@ def load(cls: type[AnnoyBackend], base_path: Path) -> AnnoyBackend: """Load the vectors from a path.""" path = Path(base_path) / "index.bin" arguments = AnnoyArgs.load(base_path / "arguments.json") - - metric = arguments.metric - actual_metric = "dot" if metric == "cosine" else metric - - index = AnnoyIndex(arguments.dim, actual_metric) + index = AnnoyIndex(arguments.dim, arguments.metric) # type: ignore index.load(str(path)) return cls(index, arguments=arguments) @@ -93,7 +99,7 @@ def save(self, base_path: Path) -> None: """Save the vectors to a path.""" path = Path(base_path) / "index.bin" self.index.save(str(path)) - # NOTE: set the length before saving. + # Ensure the length is set before saving self.arguments.length = len(self) self.arguments.dump(base_path / "arguments.json") @@ -101,28 +107,27 @@ def query(self, vectors: npt.NDArray, k: int) -> QueryResult: """Query the backend.""" out = [] for vec in vectors: - if self.arguments.metric == "cosine": + if self.arguments.metric == "dot": vec = normalize(vec) indices, scores = self.index.get_nns_by_vector(vec, k, include_distances=True) scores_array = np.asarray(scores) - if self.arguments.metric == "cosine": - # Turn cosine similarity into cosine distance. + if self.arguments.metric == "dot": + # Convert cosine similarity to cosine distance scores_array = 1 - scores_array out.append((np.asarray(indices), scores_array)) return out def insert(self, vectors: npt.NDArray) -> None: """Insert vectors into the backend.""" - raise NotImplementedError("Insertion is not supported in ANNOY backend.") + raise NotImplementedError("Insertion is not supported in Annoy backend.") def delete(self, indices: list[int]) -> None: """Delete vectors from the backend.""" - raise NotImplementedError("Deletion is not supported in ANNOY backend.") + raise NotImplementedError("Deletion is not supported in Annoy backend.") def threshold(self, vectors: npt.NDArray, threshold: float) -> list[npt.NDArray]: """Threshold the backend.""" out: list[npt.NDArray] = [] for x, y in self.query(vectors, 100): out.append(x[y < threshold]) - return out diff --git a/vicinity/backends/basic.py b/vicinity/backends/basic.py index 28d59b4..e718bc1 100644 --- a/vicinity/backends/basic.py +++ b/vicinity/backends/basic.py @@ -3,24 +3,25 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from pathlib import Path -from typing import Any, Literal +from typing import Any, Union import numpy as np from numpy import typing as npt from vicinity.backends.base import AbstractBackend, BaseArgs from vicinity.datatypes import Backend, Matrix, QueryResult -from vicinity.utils import normalize, normalize_or_copy +from vicinity.utils import Metric, normalize, normalize_or_copy @dataclass class BasicArgs(BaseArgs): - metric: Literal["cosine", "euclidean"] = "cosine" + metric: str = "cosine" class BasicBackend(AbstractBackend[BasicArgs], ABC): argument_class = BasicArgs _vectors: npt.NDArray + supported_metrics = {Metric.COSINE, Metric.EUCLIDEAN} def __init__(self, arguments: BasicArgs) -> None: """Initialize the backend.""" @@ -65,29 +66,38 @@ def _dist(self, x: npt.NDArray) -> npt.NDArray: raise NotImplementedError() @classmethod - def from_vectors(cls, vectors: npt.NDArray, **kwargs: Any) -> BasicBackend: + def from_vectors( + cls: type[BasicBackend], + vectors: npt.NDArray, + metric: Union[str, Metric], + **kwargs: Any, + ) -> BasicBackend: """Create a new instance from vectors.""" - arguments = BasicArgs(**kwargs) - if arguments.metric == "cosine": - return CosineBasicBackend(vectors, arguments) - elif arguments.metric == "euclidean": - return EuclideanBasicBackend(vectors, arguments) + metric_enum = Metric.from_string(metric) + + if metric_enum not in cls.supported_metrics: + raise ValueError(f"Metric '{metric_enum.value}' is not supported by BasicBackend.") + + if metric_enum == Metric.COSINE: + return CosineBasicBackend(vectors, BasicArgs(metric=metric_enum.value)) + elif metric_enum == Metric.EUCLIDEAN: + return EuclideanBasicBackend(vectors, BasicArgs(metric=metric_enum.value)) else: - raise ValueError(f"Unsupported metric: {arguments.metric}") + raise ValueError(f"Unsupported metric: {metric_enum.value}") @classmethod def load(cls, folder: Path) -> BasicBackend: """Load the vectors from a path.""" path = folder / "vectors.npy" arguments = BasicArgs.load(folder / "arguments.json") - with open(path, "rb") as f: - vectors = np.load(f) - if arguments.metric == "cosine": - return CosineBasicBackend(vectors, arguments) - elif arguments.metric == "euclidean": - return EuclideanBasicBackend(vectors, arguments) + metric_enum = Metric.from_string(arguments.metric) + + if metric_enum == Metric.COSINE: + return CosineBasicBackend(np.load(path), arguments) + elif metric_enum == Metric.EUCLIDEAN: + return EuclideanBasicBackend(np.load(path), arguments) else: - raise ValueError(f"Unsupported metric: {arguments.metric}") + raise ValueError(f"Unsupported metric: {metric_enum.value}") def save(self, folder: Path) -> None: """Save the vectors to a path.""" @@ -101,13 +111,7 @@ def threshold( vectors: npt.NDArray, threshold: float, ) -> list[npt.NDArray]: - """ - Batched distance thresholding. - - :param vectors: The vectors to threshold. - :param threshold: The threshold to use. - :return: A list of lists of indices of vectors that are below the threshold - """ + """Batched distance thresholding.""" out: list[npt.NDArray] = [] for i in range(0, len(vectors), 1024): batch = vectors[i : i + 1024] @@ -116,7 +120,6 @@ def threshold( indices = np.flatnonzero(sims <= threshold) sorted_indices = indices[np.argsort(sims[indices])] out.append(sorted_indices) - return out def query( @@ -124,14 +127,7 @@ def query( vectors: npt.NDArray, k: int, ) -> QueryResult: - """ - Batched distance query. - - :param vectors: The vectors to query. - :param k: The number of nearest neighbors to return. - :return: A list of tuples with the indices and distances. - :raises ValueError: If k is less than 1. - """ + """Batched distance query.""" if k < 1: raise ValueError(f"k should be >= 1, is now {k}") @@ -139,19 +135,16 @@ def query( num_vectors = len(self.vectors) effective_k = min(k, num_vectors) - # Batch the queries for index in range(0, len(vectors), 1024): batch = vectors[index : index + 1024] distances = self._dist(batch) - # Efficiently get the k smallest distances indices = np.argpartition(distances, kth=effective_k - 1, axis=1)[:, :effective_k] sorted_indices = np.take_along_axis( indices, np.argsort(np.take_along_axis(distances, indices, axis=1)), axis=1 ) sorted_distances = np.take_along_axis(distances, sorted_indices, axis=1) - # Extend the output with tuples of (indices, distances) out.extend(zip(sorted_indices, sorted_distances)) return out @@ -216,6 +209,5 @@ def _dist(self, x: npt.NDArray) -> npt.NDArray: """Compute Euclidean distance.""" x_norm = (x**2).sum(1) dists_squared = (x_norm[:, None] + self.squared_norm_vectors[None, :]) - 2 * (x @ self._vectors.T) - # Ensure non-negative distances - dists_squared = np.clip(dists_squared, 0, None) + dists_squared = np.clip(dists_squared, 0, None) # Ensure non-negative distances return np.sqrt(dists_squared) diff --git a/vicinity/backends/faiss.py b/vicinity/backends/faiss.py index d499f17..4f5d247 100644 --- a/vicinity/backends/faiss.py +++ b/vicinity/backends/faiss.py @@ -3,7 +3,7 @@ import logging from dataclasses import dataclass from pathlib import Path -from typing import Any, Literal +from typing import Any, Union import faiss import numpy as np @@ -11,12 +11,17 @@ from vicinity.backends.base import AbstractBackend, BaseArgs from vicinity.datatypes import Backend, QueryResult -from vicinity.utils import normalize +from vicinity.utils import Metric, normalize logger = logging.getLogger(__name__) # FAISS indexes that support range_search -RANGE_SEARCH_INDEXES = (faiss.IndexFlat, faiss.IndexIVFFlat, faiss.IndexScalarQuantizer, faiss.IndexIVFScalarQuantizer) +RANGE_SEARCH_INDEXES = ( + faiss.IndexFlat, + faiss.IndexIVFFlat, + faiss.IndexScalarQuantizer, + faiss.IndexIVFScalarQuantizer, +) # FAISS indexes that need to be trained before adding vectors TRAINABLE_INDEXES = ( faiss.IndexIVFFlat, @@ -31,8 +36,8 @@ @dataclass class FaissArgs(BaseArgs): dim: int = 0 - index_type: Literal["flat", "ivf", "hnsw", "lsh", "scalar", "pq", "ivf_scalar", "ivfpq", "ivfpqr"] = "hnsw" - metric: Literal["cosine", "l2"] = "cosine" + index_type: str = "flat" + metric: str = "cosine" nlist: int = 100 m: int = 8 nbits: int = 8 @@ -41,6 +46,7 @@ class FaissArgs(BaseArgs): class FaissBackend(AbstractBackend[FaissArgs]): argument_class = FaissArgs + supported_metrics = {Metric.COSINE, Metric.EUCLIDEAN} def __init__( self, @@ -55,43 +61,34 @@ def __init__( def from_vectors( # noqa: C901 cls: type[FaissBackend], vectors: npt.NDArray, - index_type: Literal["flat", "ivf", "hnsw", "lsh", "scalar", "pq", "ivf_scalar", "ivfpq", "ivfpqr"] = "flat", - metric: Literal["cosine", "l2"] = "cosine", + index_type: str = "flat", + metric: Union[str, Metric] = "cosine", nlist: int = 100, m: int = 8, nbits: int = 8, refine_nbits: int = 8, **kwargs: Any, ) -> FaissBackend: - """ - Create a new instance from vectors. - - :param vectors: The vectors to index. - :param index_type: The type of FAISS index to use. - :param metric: The metric to use for similarity search. - :param nlist: The number of cells for IVF indexes. - :param m: The number of subquantizers for PQ and HNSW indexes. - :param nbits: The number of bits for LSH and PQ indexes. - :param refine_nbits: The number of bits for the refinement stage in IVFPQR indexes. - :param **kwargs: Additional arguments to pass to the backend. - :return: A new FaissBackend instance. - :raises ValueError: If an invalid index type is provided. - """ - dim = vectors.shape[1] + """Create a new instance from vectors.""" + metric_enum = Metric.from_string(metric) - # If using cosine, normalize vectors to unit length - if metric == "cosine": - vectors = normalize(vectors) + if metric_enum not in cls.supported_metrics: + raise ValueError(f"Metric '{metric_enum.value}' is not supported by FaissBackend.") + + # Map Metric enum to FAISS-specific metric + if metric_enum == Metric.COSINE: faiss_metric = faiss.METRIC_INNER_PRODUCT - else: + vectors = normalize(vectors) + elif metric_enum == Metric.EUCLIDEAN: faiss_metric = faiss.METRIC_L2 + else: + raise ValueError(f"Unsupported metric for FaissBackend: {metric_enum}") - if index_type.startswith("ivf"): - # Create a quantizer for IVF indexes - quantizer = faiss.IndexFlatL2(dim) if faiss_metric == faiss.METRIC_L2 else faiss.IndexFlatIP(dim) + dim = vectors.shape[1] + # Handle index creation based on index_type if index_type == "flat": - index = faiss.IndexFlatL2(dim) if faiss_metric == faiss.METRIC_L2 else faiss.IndexFlatIP(dim) + index = faiss.IndexFlat(dim, faiss_metric) elif index_type == "hnsw": index = faiss.IndexHNSWFlat(dim, m) elif index_type == "lsh": @@ -100,13 +97,11 @@ def from_vectors( # noqa: C901 index = faiss.IndexScalarQuantizer(dim, faiss.ScalarQuantizer.QT_8bit) elif index_type == "pq": if not (1 <= nbits <= 16): - # Log a warning and adjust nbits to the maximum supported value for PQ logger.warning(f"Invalid nbits={nbits} for IndexPQ. Setting nbits to 16.") nbits = 16 index = faiss.IndexPQ(dim, m, nbits) elif index_type.startswith("ivf"): - # Create a quantizer for IVF indexes - quantizer = faiss.IndexFlatL2(dim) if faiss_metric == faiss.METRIC_L2 else faiss.IndexFlatIP(dim) + quantizer = faiss.IndexFlat(dim, faiss_metric) if index_type == "ivf": index = faiss.IndexIVFFlat(quantizer, dim, nlist, faiss_metric) elif index_type == "ivf_scalar": @@ -115,6 +110,8 @@ def from_vectors( # noqa: C901 index = faiss.IndexIVFPQ(quantizer, dim, nlist, m, nbits) elif index_type == "ivfpqr": index = faiss.IndexIVFPQR(quantizer, dim, nlist, m, nbits, m, refine_nbits) + else: + raise ValueError(f"Unsupported FAISS index type: {index_type}") else: raise ValueError(f"Unsupported FAISS index type: {index_type}") @@ -127,7 +124,7 @@ def from_vectors( # noqa: C901 arguments = FaissArgs( dim=dim, index_type=index_type, - metric=metric, + metric=metric_enum.value, nlist=nlist, m=m, nbits=nbits, @@ -171,39 +168,25 @@ def delete(self, indices: list[int]) -> None: def threshold(self, vectors: npt.NDArray, threshold: float) -> list[npt.NDArray]: """Query vectors within a distance threshold, using range_search if supported.""" out: list[npt.NDArray] = [] - - # Normalize query vectors if using cosine similarity if self.arguments.metric == "cosine": vectors = normalize(vectors) if isinstance(self.index, RANGE_SEARCH_INDEXES): - # Use range_search for supported indexes radius = threshold lims, D, I = self.index.range_search(vectors, radius) - for i in range(vectors.shape[0]): start, end = lims[i], lims[i + 1] idx = I[start:end] dist = D[start:end] - - # Convert dist for cosine if needed if self.arguments.metric == "cosine": dist = 1 - dist - - # Only include idx within the threshold - within_threshold = idx[dist < threshold] - out.append(within_threshold) + out.append(idx[dist < threshold]) else: - # Fallback to search-based filtering for indexes that do not support range_search distances, indices = self.index.search(vectors, 100) - for dist, idx in zip(distances, indices): - # Convert distances for cosine if needed if self.arguments.metric == "cosine": dist = 1 - dist - # Filter based on the threshold - within_threshold = idx[dist < threshold] - out.append(within_threshold) + out.append(idx[dist < threshold]) return out diff --git a/vicinity/backends/hnsw.py b/vicinity/backends/hnsw.py index 827adab..3a6b543 100644 --- a/vicinity/backends/hnsw.py +++ b/vicinity/backends/hnsw.py @@ -2,25 +2,27 @@ from dataclasses import dataclass from pathlib import Path -from typing import Any, Literal +from typing import Any, Literal, Union from hnswlib import Index as HnswIndex from numpy import typing as npt from vicinity.backends.base import AbstractBackend, BaseArgs from vicinity.datatypes import Backend, QueryResult +from vicinity.utils import Metric @dataclass class HNSWArgs(BaseArgs): dim: int = 0 - space: Literal["cosine", "l2"] = "cosine" + metric: str = "cosine" ef_construction: int = 200 m: int = 16 class HNSWBackend(AbstractBackend[HNSWArgs]): argument_class = HNSWArgs + supported_metrics = {Metric.COSINE, Metric.EUCLIDEAN} def __init__( self, @@ -35,17 +37,24 @@ def __init__( def from_vectors( cls: type[HNSWBackend], vectors: npt.NDArray, - space: Literal["cosine", "l2"], + metric: Union[str, Metric], ef_construction: int, m: int, **kwargs: Any, ) -> HNSWBackend: """Create a new instance from vectors.""" + metric_enum = Metric.from_string(metric) + + if metric_enum not in cls.supported_metrics: + raise ValueError(f"Metric '{metric_enum.value}' is not supported by HNSWBackend.") + + # Map Metric to HNSW's space parameter + metric = "l2" if metric_enum == Metric.EUCLIDEAN else "cosine" dim = vectors.shape[1] - index = HnswIndex(space=space, dim=dim) + index = HnswIndex(space=metric, dim=dim) index.init_index(max_elements=vectors.shape[0], ef_construction=ef_construction, M=m) index.add_items(vectors) - arguments = HNSWArgs(dim=dim, space=space, ef_construction=ef_construction, m=m) + arguments = HNSWArgs(dim=dim, metric=metric, ef_construction=ef_construction, m=m) return HNSWBackend(index, arguments=arguments) @property @@ -67,7 +76,7 @@ def load(cls: type[HNSWBackend], base_path: Path) -> HNSWBackend: """Load the vectors from a path.""" path = Path(base_path) / "index.bin" arguments = HNSWArgs.load(base_path / "arguments.json") - index = HnswIndex(space=arguments.space, dim=arguments.dim) + index = HnswIndex(space=arguments.metric, dim=arguments.dim) index.load_index(str(path)) return cls(index, arguments=arguments) diff --git a/vicinity/backends/pynndescent.py b/vicinity/backends/pynndescent.py index 726eb65..964ab08 100644 --- a/vicinity/backends/pynndescent.py +++ b/vicinity/backends/pynndescent.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from pathlib import Path -from typing import Literal +from typing import Any, Union import numpy as np from numpy import typing as npt @@ -10,21 +10,18 @@ from vicinity.backends.base import AbstractBackend, BaseArgs from vicinity.datatypes import Backend, QueryResult -from vicinity.utils import normalize_or_copy +from vicinity.utils import Metric, normalize_or_copy @dataclass class PyNNDescentArgs(BaseArgs): n_neighbors: int = 15 - metric: Literal[ - "cosine", - "euclidean", - "manhattan", - ] = "cosine" + metric: str = "cosine" class PyNNDescentBackend(AbstractBackend[PyNNDescentArgs]): argument_class = PyNNDescentArgs + supported_metrics = {Metric.COSINE, Metric.EUCLIDEAN, Metric.MANHATTAN} def __init__( self, @@ -40,10 +37,18 @@ def from_vectors( cls: type[PyNNDescentBackend], vectors: npt.NDArray, n_neighbors: int = 15, - metric: Literal["cosine", "euclidean", "manhattan"] = "cosine", + metric: Union[str, Metric] = "cosine", + **kwargs: Any, ) -> PyNNDescentBackend: """Create a new instance from vectors.""" - index = NNDescent(vectors, n_neighbors=n_neighbors, metric=metric) + metric_enum = Metric.from_string(metric) + + if metric_enum not in cls.supported_metrics: + raise ValueError(f"Metric '{metric_enum.value}' is not supported by PyNNDescentBackend.") + + metric = metric_enum.value + + index = NNDescent(vectors, n_neighbors=n_neighbors, metric=metric, **kwargs) arguments = PyNNDescentArgs(n_neighbors=n_neighbors, metric=metric) return cls(index=index, arguments=arguments) @@ -69,11 +74,11 @@ def query(self, vectors: npt.NDArray, k: int) -> QueryResult: def insert(self, vectors: npt.NDArray) -> None: """Insert vectors into the backend.""" - raise NotImplementedError("Insertion is not supported in pynndescent backend.") + raise NotImplementedError("Insertion is not supported in PyNNDescent backend.") def delete(self, indices: list[int]) -> None: """Delete vectors from the backend.""" - raise NotImplementedError("Deletion is not supported in pynndescent backend.") + raise NotImplementedError("Deletion is not supported in PyNNDescent backend.") def threshold(self, vectors: npt.NDArray, threshold: float) -> list[npt.NDArray]: """Find neighbors within a distance threshold.""" @@ -99,7 +104,11 @@ def load(cls: type[PyNNDescentBackend], base_path: Path) -> PyNNDescentBackend: """Load the vectors and configuration from a specified path.""" arguments = PyNNDescentArgs.load(base_path / "arguments.json") vectors = np.load(Path(base_path) / "vectors.npy") - index = NNDescent(vectors, n_neighbors=arguments.n_neighbors, metric=arguments.metric) + + metric_enum = Metric.from_string(arguments.metric) + pynndescent_metric = metric_enum.value + + index = NNDescent(vectors, n_neighbors=arguments.n_neighbors, metric=pynndescent_metric) # Load the neighbor graph if it was saved neighbor_graph_path = base_path / "neighbor_graph.npy" diff --git a/vicinity/backends/usearch.py b/vicinity/backends/usearch.py index 470a335..d98dc8f 100644 --- a/vicinity/backends/usearch.py +++ b/vicinity/backends/usearch.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from pathlib import Path -from typing import Any, Literal +from typing import Any, Union import numpy as np from numpy import typing as npt @@ -10,12 +10,13 @@ from vicinity.backends.base import AbstractBackend, BaseArgs from vicinity.datatypes import Backend, QueryResult +from vicinity.utils import Metric @dataclass class UsearchArgs(BaseArgs): dim: int = 0 - metric: Literal["cos", "ip", "l2sq", "hamming", "tanimoto"] = "cos" + metric: str = "cos" connectivity: int = 16 expansion_add: int = 128 expansion_search: int = 64 @@ -23,6 +24,7 @@ class UsearchArgs(BaseArgs): class UsearchBackend(AbstractBackend[UsearchArgs]): argument_class = UsearchArgs + supported_metrics = {Metric.COSINE, Metric.INNER_PRODUCT, Metric.L2_SQUARED, Metric.HAMMING, Metric.TANIMOTO} def __init__( self, @@ -37,23 +39,32 @@ def __init__( def from_vectors( cls: type[UsearchBackend], vectors: npt.NDArray, - metric: Literal["cos", "ip", "l2sq", "hamming", "tanimoto"], - connectivity: int, - expansion_add: int, - expansion_search: int, + metric: Union[str, Metric] = "cos", + connectivity: int = 16, + expansion_add: int = 128, + expansion_search: int = 64, **kwargs: Any, ) -> UsearchBackend: - """ - Create a new instance from vectors. - - :param vectors: The vectors to index. - :param metric: The metric to use. - :param connectivity: The connectivity parameter. - :param expansion_add: The expansion add parameter. - :param expansion_search: The expansion search parameter. - :param **kwargs: Additional keyword arguments. - :return: A new instance of the backend. - """ + """Create a new instance from vectors.""" + metric_enum = Metric.from_string(metric) + + if metric_enum not in cls.supported_metrics: + raise ValueError(f"Metric '{metric_enum.value}' is not supported by UsearchBackend.") + + # Map Metric enum to Usearch-compatible metric strings + if metric_enum == Metric.COSINE: + metric = "cos" + elif metric_enum == Metric.INNER_PRODUCT: + metric = "ip" + elif metric_enum == Metric.L2_SQUARED: + metric = "l2sq" + elif metric_enum == Metric.HAMMING: + metric = "hamming" + elif metric_enum == Metric.TANIMOTO: + metric = "tanimoto" + else: + raise ValueError(f"Unsupported metric for UsearchBackend: {metric_enum}") + dim = vectors.shape[1] index = UsearchIndex( ndim=dim, @@ -70,9 +81,7 @@ def from_vectors( expansion_add=expansion_add, expansion_search=expansion_search, ) - backend = cls(index, arguments=arguments) - - return backend + return cls(index, arguments) @property def backend_type(self) -> Backend: @@ -93,6 +102,7 @@ def load(cls: type[UsearchBackend], base_path: Path) -> UsearchBackend: """Load the index from a path.""" path = Path(base_path) / "index.usearch" arguments = UsearchArgs.load(base_path / "arguments.json") + index = UsearchIndex( ndim=arguments.dim, metric=arguments.metric, @@ -121,8 +131,8 @@ def insert(self, vectors: npt.NDArray) -> None: self.index.add(None, vectors) # type: ignore def delete(self, indices: list[int]) -> None: - """Delete vectors from the index (not supported by usearch).""" - raise NotImplementedError("Dynamic deletion is not supported by usearch.") + """Delete vectors from the index (not supported by Usearch).""" + raise NotImplementedError("Dynamic deletion is not supported in Usearch.") def threshold(self, vectors: npt.NDArray, threshold: float) -> list[npt.NDArray]: """Threshold the backend and return filtered keys.""" diff --git a/vicinity/utils.py b/vicinity/utils.py index 76aef5e..bce2ef5 100644 --- a/vicinity/utils.py +++ b/vicinity/utils.py @@ -1,3 +1,6 @@ +from __future__ import annotations + +from enum import Enum from typing import Union import numpy as np @@ -51,3 +54,38 @@ def normalize_or_copy(vectors: npt.NDArray) -> npt.NDArray: if all_unit_length: return vectors return normalize(vectors, norms) + + +class Metric(Enum): + COSINE = "cosine" + EUCLIDEAN = "euclidean" + MANHATTAN = "manhattan" + INNER_PRODUCT = "inner_product" + L2_SQUARED = "l2sq" + HAMMING = "hamming" + TANIMOTO = "tanimoto" + + @classmethod + def from_string(cls, metric: Union[str, Metric]) -> Metric: + """Convert a string or Metric enum to a Metric enum member.""" + if isinstance(metric, cls): + return metric + if isinstance(metric, str): + mapping = { + "cos": cls.COSINE, + "cosine": cls.COSINE, + "euclidean": cls.EUCLIDEAN, + "l2": cls.EUCLIDEAN, + "manhattan": cls.MANHATTAN, + "l1": cls.MANHATTAN, + "inner_product": cls.INNER_PRODUCT, + "ip": cls.INNER_PRODUCT, + "l2sq": cls.L2_SQUARED, + "l2_squared": cls.L2_SQUARED, + "hamming": cls.HAMMING, + "tanimoto": cls.TANIMOTO, + } + metric_str = metric.lower() + if metric_str in mapping: + return mapping[metric_str] + raise ValueError(f"Unsupported metric: {metric}") From f4d7056174ed96dd13d13490f68f52e52c51e0df Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 29 Nov 2024 17:02:43 +0100 Subject: [PATCH 06/13] Update --- vicinity/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vicinity/__init__.py b/vicinity/__init__.py index 51507fd..e819e1b 100644 --- a/vicinity/__init__.py +++ b/vicinity/__init__.py @@ -1,8 +1,8 @@ """Small vector store.""" from vicinity.datatypes import Backend -from vicinity.utils import normalize +from vicinity.utils import Metric, normalize from vicinity.version import __version__ from vicinity.vicinity import Vicinity -__all__ = ["Backend", "Vicinity", "normalize", "__version__"] +__all__ = ["Backend", "Metric", "Vicinity", "normalize", "__version__"] From 501cd517fab30dd30be5eea10195e283e8a11a44 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 29 Nov 2024 18:46:07 +0100 Subject: [PATCH 07/13] WIP --- vicinity/vicinity.py | 60 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/vicinity/vicinity.py b/vicinity/vicinity.py index d469325..36766c4 100644 --- a/vicinity/vicinity.py +++ b/vicinity/vicinity.py @@ -83,6 +83,11 @@ def dim(self) -> int: """The dimensionality of the vectors.""" return self.backend.dim + @property + def metric(self) -> str: + """The metric used by the backend.""" + return self.backend.arguments.metric + def query( self, vectors: npt.NDArray, @@ -229,3 +234,58 @@ def delete(self, tokens: Sequence[str]) -> None: # Delete items starting from the highest index for index in sorted(curr_indices, reverse=True): self.items.pop(index) + + def evaluate( + self, + full_vectors: npt.NDArray, + query_vectors: npt.NDArray, + k: int = 10, + epsilon: float = 1e-3, + ) -> Any: + """ + Evaluate the Vicinity instance on the given query vectors. + + Computes recall using `knn_threshold` and measures QPS (Queries Per Second). + + :param full_vectors: The full dataset vectors used to build the index. + :param query_vectors: The query vectors to evaluate. + :param k: The number of nearest neighbors to retrieve. + :param epsilon: The epsilon threshold for recall calculation. + + :return: A tuple of (QPS, recall). + """ + import time + + # Create ground truth Vicinity instance + ground_truth_vicinity = Vicinity.from_vectors_and_items( + vectors=full_vectors, + items=self.items, + backend_type=Backend.BASIC, + metric=self.metric, + ) + + # Compute ground truth results + dataset_distances = [ + [dist for _, dist in neighbors] for neighbors in ground_truth_vicinity.query(query_vectors, k=k) + ] + + # Start timer for approximate query + start_time = time.perf_counter() + run_results = self.query(query_vectors, k=k) + elapsed_time = time.perf_counter() - start_time + + num_queries = len(query_vectors) + qps = num_queries / elapsed_time if elapsed_time > 0 else float("inf") + + # Extract approximate distances + run_distances = [[dist for _, dist in neighbors] for neighbors in run_results] + + # Compute recall using knn_threshold + recalls = [] + for gt_distances, approx_distances in zip(dataset_distances, run_distances): + t = gt_distances[k - 1] + epsilon # knn_threshold + recall = sum(1 for dist in approx_distances if dist <= t) / k + recalls.append(recall) + + mean_recall = np.mean(recalls) + return qps, mean_recall From 6677f514d29d402c2c01982d7407317ae08899c4 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 29 Nov 2024 18:48:04 +0100 Subject: [PATCH 08/13] Update --- vicinity/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vicinity/utils.py b/vicinity/utils.py index bce2ef5..1566a81 100644 --- a/vicinity/utils.py +++ b/vicinity/utils.py @@ -74,6 +74,7 @@ def from_string(cls, metric: Union[str, Metric]) -> Metric: mapping = { "cos": cls.COSINE, "cosine": cls.COSINE, + "dot": cls.COSINE, "euclidean": cls.EUCLIDEAN, "l2": cls.EUCLIDEAN, "manhattan": cls.MANHATTAN, From 141900c94ea80da3cf880bfd96bec6fd7cb3d1f7 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 29 Nov 2024 19:10:49 +0100 Subject: [PATCH 09/13] Update --- vicinity/vicinity.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/vicinity/vicinity.py b/vicinity/vicinity.py index 36766c4..50c702f 100644 --- a/vicinity/vicinity.py +++ b/vicinity/vicinity.py @@ -241,11 +241,11 @@ def evaluate( query_vectors: npt.NDArray, k: int = 10, epsilon: float = 1e-3, - ) -> Any: + ) -> tuple[float, float]: """ Evaluate the Vicinity instance on the given query vectors. - Computes recall using `knn_threshold` and measures QPS (Queries Per Second). + Computes recall and measures QPS (Queries Per Second). :param full_vectors: The full dataset vectors used to build the index. :param query_vectors: The query vectors to evaluate. @@ -254,10 +254,8 @@ def evaluate( :return: A tuple of (QPS, recall). """ - import time - # Create ground truth Vicinity instance - ground_truth_vicinity = Vicinity.from_vectors_and_items( + gt_vicinity = Vicinity.from_vectors_and_items( vectors=full_vectors, items=self.items, backend_type=Backend.BASIC, @@ -265,27 +263,26 @@ def evaluate( ) # Compute ground truth results - dataset_distances = [ - [dist for _, dist in neighbors] for neighbors in ground_truth_vicinity.query(query_vectors, k=k) - ] + gt_distances = [[dist for _, dist in neighbors] for neighbors in gt_vicinity.query(query_vectors, k=k)] # Start timer for approximate query start_time = time.perf_counter() run_results = self.query(query_vectors, k=k) elapsed_time = time.perf_counter() - start_time + # Compute QPS num_queries = len(query_vectors) qps = num_queries / elapsed_time if elapsed_time > 0 else float("inf") # Extract approximate distances - run_distances = [[dist for _, dist in neighbors] for neighbors in run_results] + approx_distances = [[dist for _, dist in neighbors] for neighbors in run_results] # Compute recall using knn_threshold recalls = [] - for gt_distances, approx_distances in zip(dataset_distances, run_distances): - t = gt_distances[k - 1] + epsilon # knn_threshold - recall = sum(1 for dist in approx_distances if dist <= t) / k + for _gt_distances, _approx_distances in zip(gt_distances, approx_distances): + t = _gt_distances[k - 1] + epsilon # knn_threshold + recall = sum(1 for dist in _approx_distances if dist <= t) / k recalls.append(recall) - mean_recall = np.mean(recalls) + mean_recall = float(np.mean(recalls)) return qps, mean_recall From c7c4517c5c80b221929ecf9c85284ae17432894b Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 29 Nov 2024 19:24:34 +0100 Subject: [PATCH 10/13] Added test --- tests/test_vicinity.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/test_vicinity.py b/tests/test_vicinity.py index ea07e5e..1ad1916 100644 --- a/tests/test_vicinity.py +++ b/tests/test_vicinity.py @@ -220,3 +220,18 @@ def test_vicinity_delete_and_query(vicinity_instance: Vicinity, items: list[str] # Check that the queried item is in the results assert "item3" in returned_items + + +def test_vicinity_evaluate(vicinity_instance: Vicinity, vectors: np.ndarray) -> None: + """ + Test the evaluate method of the Vicinity instance. + + :param vicinity_instance: A Vicinity instance. + :param vectors: The full dataset vectors used to build the index. + """ + query_vectors = vectors[:10] + qps, recall = vicinity_instance.evaluate(vectors, query_vectors) + + # Ensure the QPS and recall values are within valid ranges + assert qps > 0 + assert 0 <= recall <= 1 From 4402d68596ae917235a41a9272363f2465836094 Mon Sep 17 00:00:00 2001 From: Pringled Date: Fri, 29 Nov 2024 19:31:58 +0100 Subject: [PATCH 11/13] Updates --- vicinity/vicinity.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vicinity/vicinity.py b/vicinity/vicinity.py index 50c702f..2744cf5 100644 --- a/vicinity/vicinity.py +++ b/vicinity/vicinity.py @@ -3,9 +3,9 @@ from __future__ import annotations import logging -import time from io import open from pathlib import Path +from time import perf_counter from typing import Any, Sequence, Union import numpy as np @@ -246,6 +246,7 @@ def evaluate( Evaluate the Vicinity instance on the given query vectors. Computes recall and measures QPS (Queries Per Second). + For recall calculation, the same methodology is used as in the ann-benchmarks repository. :param full_vectors: The full dataset vectors used to build the index. :param query_vectors: The query vectors to evaluate. @@ -266,9 +267,9 @@ def evaluate( gt_distances = [[dist for _, dist in neighbors] for neighbors in gt_vicinity.query(query_vectors, k=k)] # Start timer for approximate query - start_time = time.perf_counter() + start_time = perf_counter() run_results = self.query(query_vectors, k=k) - elapsed_time = time.perf_counter() - start_time + elapsed_time = perf_counter() - start_time # Compute QPS num_queries = len(query_vectors) @@ -277,10 +278,10 @@ def evaluate( # Extract approximate distances approx_distances = [[dist for _, dist in neighbors] for neighbors in run_results] - # Compute recall using knn_threshold + # Compute recall using the ground truth and approximate distances recalls = [] for _gt_distances, _approx_distances in zip(gt_distances, approx_distances): - t = _gt_distances[k - 1] + epsilon # knn_threshold + t = _gt_distances[k - 1] + epsilon recall = sum(1 for dist in _approx_distances if dist <= t) / k recalls.append(recall) From 234c2cd5e8bd575bba440a228614ac17526d6613 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sun, 1 Dec 2024 12:36:07 +0100 Subject: [PATCH 12/13] Fixed supported metric issue --- tests/test_vicinity.py | 5 +++++ vicinity/vicinity.py | 19 ++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/tests/test_vicinity.py b/tests/test_vicinity.py index 1ad1916..79b7c4d 100644 --- a/tests/test_vicinity.py +++ b/tests/test_vicinity.py @@ -235,3 +235,8 @@ def test_vicinity_evaluate(vicinity_instance: Vicinity, vectors: np.ndarray) -> # Ensure the QPS and recall values are within valid ranges assert qps > 0 assert 0 <= recall <= 1 + + # Test with an unsupported metric + vicinity_instance.backend.arguments.metric = "manhattan" + with pytest.raises(ValueError): + vicinity_instance.evaluate(vectors, query_vectors) diff --git a/vicinity/vicinity.py b/vicinity/vicinity.py index e619847..aa3d210 100644 --- a/vicinity/vicinity.py +++ b/vicinity/vicinity.py @@ -13,7 +13,8 @@ import orjson from numpy import typing as npt -from vicinity.backends import AbstractBackend, get_backend_class +from vicinity import Metric +from vicinity.backends import AbstractBackend, BasicBackend, get_backend_class from vicinity.datatypes import Backend, PathLike logger = logging.getLogger(__name__) @@ -253,15 +254,27 @@ def evaluate( :param query_vectors: The query vectors to evaluate. :param k: The number of nearest neighbors to retrieve. :param epsilon: The epsilon threshold for recall calculation. - :return: A tuple of (QPS, recall). + :raises ValueError: If the metric is not supported by the BasicBackend. """ + try: + # Validate and map the metric using Metric.from_string + metric_enum = Metric.from_string(self.metric) + if metric_enum not in BasicBackend.supported_metrics: + raise ValueError(f"Unsupported metric '{metric_enum.value}' for BasicBackend.") + basic_metric = metric_enum.value + except ValueError as e: + raise ValueError( + f"Unsupported metric '{self.metric}' for evaluation with BasicBackend. " + f"Supported metrics are: {[m.value for m in BasicBackend.supported_metrics]}" + ) from e + # Create ground truth Vicinity instance gt_vicinity = Vicinity.from_vectors_and_items( vectors=full_vectors, items=self.items, backend_type=Backend.BASIC, - metric=self.metric, + metric=basic_metric, ) # Compute ground truth results From 248df9103618ac227d5397c222d2fda8face8889 Mon Sep 17 00:00:00 2001 From: Pringled Date: Sun, 1 Dec 2024 12:41:40 +0100 Subject: [PATCH 13/13] Update --- vicinity/vicinity.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vicinity/vicinity.py b/vicinity/vicinity.py index aa3d210..e6273ef 100644 --- a/vicinity/vicinity.py +++ b/vicinity/vicinity.py @@ -250,6 +250,8 @@ def evaluate( Computes recall and measures QPS (Queries Per Second). For recall calculation, the same methodology is used as in the ann-benchmarks repository. + NOTE: this is only supported for Cosine and Euclidean metric backends. + :param full_vectors: The full dataset vectors used to build the index. :param query_vectors: The query vectors to evaluate. :param k: The number of nearest neighbors to retrieve.