Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add faiss #19

Merged
merged 19 commits into from
Nov 17, 2024
Merged
4 changes: 3 additions & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
name: Run tests and upload coverage

on:
push
pull_request:
types: [opened, synchronize, reopened, ready_for_review]

jobs:
test:
name: Run tests with pytest
if: ${{ !github.event.pull_request.draft }} # Skip draft PRs
runs-on: ${{ matrix.os }}
strategy:
matrix:
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ install: venv
uv run pre-commit install

install-no-pre-commit:
uv pip install ".[dev,hnsw,pynndescent,annoy]"
uv pip install ".[dev,hnsw,pynndescent,annoy,faiss]"

install-base:
uv sync --extra dev
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ Vicinity provides the following features:
The following backends are supported:
- `BASIC`: A simple flat index for vector storage and search.
- `HNSW`: Hierarchical Navigable Small World Graph for approximate nearest neighbor search.
- `FAISS`: All FAISS indexes for approximate nearest neighbor search are supported.
- `ANNOY`: "Approximate Nearest Neighbors Oh Yeah" for approximate nearest neighbor search.
- `PYNNDESCENT`: Approximate nearest neighbor search using PyNNDescent.

## Usage
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ pynndescent = [
"numpy>=1.24.0"
]
annoy = ["annoy"]
faiss = ["faiss-cpu"]

[project.urls]
"Homepage" = "https://github.com/MinishLab"
Expand Down
40 changes: 33 additions & 7 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,32 +8,58 @@

random_gen = np.random.default_rng(42)

_faiss_index_types = ["flat", "ivf", "hnsw", "lsh", "scalar", "pq", "ivf_scalar", "ivfpq", "ivfpqr"]


@pytest.fixture(scope="session")
def items() -> list[str]:
"""Fixture providing a list of item names."""
return [f"item{i}" for i in range(1, 101)]
return [f"item{i}" for i in range(1, 10001)]


@pytest.fixture(scope="session")
def vectors() -> np.ndarray:
"""Fixture providing an array of vectors corresponding to items."""
return random_gen.random((100, 5))
return random_gen.random((10000, 8))


@pytest.fixture(scope="session")
def query_vector() -> np.ndarray:
"""Fixture providing a query vector."""
return random_gen.random(5)
return random_gen.random(8)


BACKEND_PARAMS = [(Backend.FAISS, index_type) for index_type in _faiss_index_types] + [
(Backend.BASIC, None),
(Backend.HNSW, None),
(Backend.ANNOY, None),
(Backend.PYNNDESCENT, None),
]

# Create human-readable ids for each backend type
BACKEND_IDS = [f"{backend.name}-{index_type}" if index_type else backend.name for backend, index_type in BACKEND_PARAMS]

@pytest.fixture(params=list(Backend))

@pytest.fixture(params=BACKEND_PARAMS)
def backend_type(request: pytest.FixtureRequest) -> Backend:
"""Fixture parametrizing over every (backend, index_type) pair in BACKEND_PARAMS; index_type is None for non-FAISS backends."""
return request.param


@pytest.fixture
def vicinity_instance(backend_type: Backend, items: list[str], vectors: np.ndarray) -> Vicinity:
"""Fixture creating a Vicinity instance with the given backend, items, and vectors."""
@pytest.fixture(params=BACKEND_PARAMS, ids=BACKEND_IDS)
Pringled marked this conversation as resolved.
Show resolved Hide resolved
def vicinity_instance(request: pytest.FixtureRequest, items: list[str], vectors: np.ndarray) -> Vicinity:
"""Fixture providing a Vicinity instance for each backend type."""
backend_type, index_type = request.param
# Handle FAISS backend with specific FAISS index types
if backend_type == Backend.FAISS:
if index_type in ("pq", "ivfpq", "ivfpqr"):
# Use smaller values for pq indexes since the dataset is small
return Vicinity.from_vectors_and_items(
vectors, items, backend_type=backend_type, index_type=index_type, m=2, nbits=4
)
else:
return Vicinity.from_vectors_and_items(
vectors, items, backend_type=backend_type, index_type=index_type, nlist=2, nbits=32
)

return Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type)
44 changes: 27 additions & 17 deletions tests/test_vicinity.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,35 +8,39 @@
from vicinity import Vicinity
from vicinity.datatypes import Backend

BackendType = tuple[Backend, str]

def test_vicinity_init(backend_type: Backend, items: list[str], vectors: np.ndarray) -> None:

def test_vicinity_init(backend_type: BackendType, items: list[str], vectors: np.ndarray) -> None:
"""
Test Vicinity.init.

:param backend_type: The backend type to use (BASIC, HNSW or Annoy).
:param backend_type: The backend type to use.
:param items: A list of item names.
:param vectors: An array of vectors.
"""
vicinity = Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type)
backend = backend_type[0]
vicinity = Vicinity.from_vectors_and_items(vectors, items, backend_type=backend)
assert len(vicinity) == len(items)
assert vicinity.items == items
assert vicinity.dim == vectors.shape[1]

vectors = np.random.default_rng(42).random((len(items) - 1, 5))

with pytest.raises(ValueError):
vicinity = Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type)
vicinity = Vicinity.from_vectors_and_items(vectors, items, backend_type=backend)


def test_vicinity_from_vectors_and_items(backend_type: Backend, items: list[str], vectors: np.ndarray) -> None:
def test_vicinity_from_vectors_and_items(backend_type: BackendType, items: list[str], vectors: np.ndarray) -> None:
"""
Test Vicinity.from_vectors_and_items.

:param backend_type: The backend type to use (BASIC, HNSW or Annoy).
:param backend_type: The backend type to use.
:param items: A list of item names.
:param vectors: An array of vectors.
"""
vicinity = Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type)
backend = backend_type[0]
vicinity = Vicinity.from_vectors_and_items(vectors, items, backend_type=backend)

assert len(vicinity) == len(items)
assert vicinity.items == items
Expand Down Expand Up @@ -76,29 +80,35 @@ def test_vicinity_insert(vicinity_instance: Vicinity, query_vector: np.ndarray)
:param query_vector: A query vector.
"""
if vicinity_instance.backend.backend_type in {Backend.HNSW, Backend.ANNOY, Backend.PYNNDESCENT}:
# Don't test insert for HNSW or Annoy backend.
# Skip insert for HNSW, Annoy and PyNNDescent backends.
return
new_item = ["item101"]
new_item = ["item10001"]
new_vector = query_vector
vicinity_instance.insert(new_item, new_vector[None, :])

results = vicinity_instance.query(query_vector, k=10)
returned_item = results[0][0][0]
results = vicinity_instance.query(query_vector, k=50)

assert returned_item == "item101"
returned_items = [item for item, _ in results[0]]
assert "item10001" in returned_items


def test_vicinity_delete(vicinity_instance: Vicinity, items: list[str], vectors: np.ndarray) -> None:
"""
Test Vicinity.delete method by verifying that the vector for a deleted item is not returned in subsequent queries.

:param backend_type: The backend type to use.
:param vicinity_instance: A Vicinity instance.
:param items: List of item names.
:param vectors: Array of vectors corresponding to items.
"""
if vicinity_instance.backend.backend_type in {Backend.ANNOY, Backend.PYNNDESCENT}:
# Don't test delete for Annoy and Pynndescent backend
# Skip delete for Annoy and PyNNDescent backends.
return

elif vicinity_instance.backend.backend_type == Backend.FAISS and vicinity_instance.backend.arguments.index_type in {
"hnsw",
"ivfpqr",
}:
# Skip delete test for FAISS index types that do not support deletion
return

# Get the vector corresponding to "item2"
Expand Down Expand Up @@ -154,7 +164,7 @@ def test_vicinity_delete_nonexistent(vicinity_instance: Vicinity) -> None:
:raises ValueError: If deleting items that do not exist.
"""
with pytest.raises(ValueError):
vicinity_instance.delete(["item102"])
vicinity_instance.delete(["item10002"])


def test_vicinity_insert_mismatched_lengths(vicinity_instance: Vicinity, query_vector: np.ndarray) -> None:
Expand All @@ -164,7 +174,7 @@ def test_vicinity_insert_mismatched_lengths(vicinity_instance: Vicinity, query_v
:param vicinity_instance: A Vicinity instance.
:raises ValueError: If tokens and vectors lengths differ.
"""
new_items = ["item102", "item103"]
new_items = ["item10002", "item10003"]
new_vector = query_vector

with pytest.raises(ValueError):
Expand All @@ -178,7 +188,7 @@ def test_vicinity_insert_wrong_dimension(vicinity_instance: Vicinity) -> None:
:param vicinity_instance: A Vicinity instance.
:raises ValueError: If vectors have wrong dimension.
"""
new_item = ["item102"]
new_item = ["item10002"]
new_vector = np.array([[0.5, 0.5, 0.5]])

with pytest.raises(ValueError):
Expand Down
31 changes: 31 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions vicinity/backends/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,10 @@ def get_backend_class(backend: Backend | str) -> type[AbstractBackend]:

return PyNNDescentBackend

elif backend == Backend.FAISS:
from vicinity.backends.faiss import FaissBackend

return FaissBackend


__all__ = ["get_backend_class", "AbstractBackend"]
Loading