Skip to content

Commit

Permalink
feat: Add faiss (#19)
Browse files Browse the repository at this point in the history
* Initial faiss integration, wip

* Added lsh

* Added ivf

* Added ivf_scalar

* Added scalar

* Added pq

* Added final backends

* Added cosine distance

* Updates

* Updates

* Updated ci

* Used range_search for supported indexes

* Used range_search for supported indexes

* Updates

* Updated CI

* Updated makefile

* Updated makefile

* Resolved comments

* Resolved comments
  • Loading branch information
Pringled authored Nov 17, 2024
1 parent 88cb61e commit 529eb5f
Show file tree
Hide file tree
Showing 10 changed files with 337 additions and 26 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
name: Run tests and upload coverage

on:
push
pull_request:
types: [opened, synchronize, reopened, ready_for_review]

jobs:
test:
name: Run tests with pytest
if: ${{ !github.event.pull_request.draft }} # Skip draft PRs
runs-on: ${{ matrix.os }}
strategy:
matrix:
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ install: venv
uv run pre-commit install

install-no-pre-commit:
uv pip install ".[dev,hnsw,pynndescent,annoy]"
uv pip install ".[dev,hnsw,pynndescent,annoy,faiss]"

install-base:
uv sync --extra dev
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ Vicinity provides the following features:
The following backends are supported:
- `BASIC`: A simple flat index for vector storage and search.
- `HNSW`: Hierarchical Navigable Small World Graph for approximate nearest neighbor search.
- `FAISS`: All FAISS indexes for approximate nearest neighbor search are supported.
- `ANNOY`: "Approximate Nearest Neighbors Oh Yeah" for approximate nearest neighbor search.
- `PYNNDESCENT`: Approximate nearest neighbor search using PyNNDescent.

## Usage
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ pynndescent = [
"numpy>=1.24.0"
]
annoy = ["annoy"]
faiss = ["faiss-cpu"]

[project.urls]
"Homepage" = "https://github.com/MinishLab"
Expand Down
40 changes: 33 additions & 7 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,32 +8,58 @@

random_gen = np.random.default_rng(42)

_faiss_index_types = ["flat", "ivf", "hnsw", "lsh", "scalar", "pq", "ivf_scalar", "ivfpq", "ivfpqr"]


@pytest.fixture(scope="session")
def items() -> list[str]:
"""Fixture providing a list of item names."""
return [f"item{i}" for i in range(1, 101)]
return [f"item{i}" for i in range(1, 10001)]


@pytest.fixture(scope="session")
def vectors() -> np.ndarray:
"""Fixture providing an array of vectors corresponding to items."""
return random_gen.random((100, 5))
return random_gen.random((10000, 8))


@pytest.fixture(scope="session")
def query_vector() -> np.ndarray:
"""Fixture providing a query vector."""
return random_gen.random(5)
return random_gen.random(8)


# (backend, index_type) combinations under test: FAISS once per index type, followed by
# the BASIC, HNSW, ANNOY and PYNNDESCENT backends with no index type.
BACKEND_PARAMS = [
    *((Backend.FAISS, index_type) for index_type in _faiss_index_types),
    (Backend.BASIC, None),
    (Backend.HNSW, None),
    (Backend.ANNOY, None),
    (Backend.PYNNDESCENT, None),
]

# Human-readable pytest ids: "<BACKEND>-<index_type>" when an index type is set, the bare backend name otherwise.
BACKEND_IDS = [
    backend.name if not index_type else f"{backend.name}-{index_type}"
    for backend, index_type in BACKEND_PARAMS
]

@pytest.fixture(params=list(Backend))

@pytest.fixture(params=BACKEND_PARAMS, ids=BACKEND_IDS)
def backend_type(request: pytest.FixtureRequest) -> tuple[Backend, str | None]:
    """
    Fixture parametrizing over every (backend, index_type) pair in BACKEND_PARAMS.

    The yielded value is a tuple, not a bare Backend: index 0 is the Backend member and
    index 1 is the FAISS index type, or None for the non-FAISS backends.

    :param request: The pytest request carrying the current parameter.
    :return: The (backend, index_type) pair for the current parametrization.
    """
    return request.param


@pytest.fixture
def vicinity_instance(backend_type: Backend, items: list[str], vectors: np.ndarray) -> Vicinity:
"""Fixture creating a Vicinity instance with the given backend, items, and vectors."""
@pytest.fixture(params=BACKEND_PARAMS, ids=BACKEND_IDS)
def vicinity_instance(request: pytest.FixtureRequest, items: list[str], vectors: np.ndarray) -> Vicinity:
    """
    Fixture building a Vicinity instance for each (backend, index_type) pair in BACKEND_PARAMS.

    :param request: The pytest request carrying the current (backend, index_type) pair.
    :param items: A list of item names.
    :param vectors: An array of vectors corresponding to items.
    :return: A Vicinity instance built from the vectors and items.
    """
    backend_type, index_type = request.param

    # Non-FAISS backends are built with their default arguments.
    if backend_type != Backend.FAISS:
        return Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type)

    if index_type in ("pq", "ivfpq", "ivfpqr"):
        # Use smaller values for pq indexes since the dataset is small
        index_kwargs = {"m": 2, "nbits": 4}
    else:
        index_kwargs = {"nlist": 2, "nbits": 32}

    return Vicinity.from_vectors_and_items(
        vectors, items, backend_type=backend_type, index_type=index_type, **index_kwargs
    )
44 changes: 27 additions & 17 deletions tests/test_vicinity.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,35 +8,39 @@
from vicinity import Vicinity
from vicinity.datatypes import Backend

BackendType = tuple[Backend, str]

def test_vicinity_init(backend_type: Backend, items: list[str], vectors: np.ndarray) -> None:

def test_vicinity_init(backend_type: BackendType, items: list[str], vectors: np.ndarray) -> None:
    """
    Test Vicinity.init.

    :param backend_type: A (backend, index_type) pair; only the backend is used here.
    :param items: A list of item names.
    :param vectors: An array of vectors.
    """
    backend = backend_type[0]
    vicinity = Vicinity.from_vectors_and_items(vectors, items, backend_type=backend)
    assert len(vicinity) == len(items)
    assert vicinity.items == items
    assert vicinity.dim == vectors.shape[1]

    # One vector fewer than there are items, but with the same dimensionality as the
    # fixture, so that the item/vector count mismatch is the only thing wrong with the input.
    too_few_vectors = np.random.default_rng(42).random((len(items) - 1, vectors.shape[1]))

    with pytest.raises(ValueError):
        Vicinity.from_vectors_and_items(too_few_vectors, items, backend_type=backend)


def test_vicinity_from_vectors_and_items(backend_type: Backend, items: list[str], vectors: np.ndarray) -> None:
def test_vicinity_from_vectors_and_items(backend_type: BackendType, items: list[str], vectors: np.ndarray) -> None:
"""
Test Vicinity.from_vectors_and_items.
:param backend_type: The backend type to use (BASIC, HNSW or Annoy).
:param backend_type: The backend type to use.
:param items: A list of item names.
:param vectors: An array of vectors.
"""
vicinity = Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type)
backend = backend_type[0]
vicinity = Vicinity.from_vectors_and_items(vectors, items, backend_type=backend)

assert len(vicinity) == len(items)
assert vicinity.items == items
Expand Down Expand Up @@ -76,29 +80,35 @@ def test_vicinity_insert(vicinity_instance: Vicinity, query_vector: np.ndarray)
:param query_vector: A query vector.
"""
if vicinity_instance.backend.backend_type in {Backend.HNSW, Backend.ANNOY, Backend.PYNNDESCENT}:
# Don't test insert for HNSW or Annoy backend.
# Skip insert for HNSW, Annoy and PyNNDescent backends.
return
new_item = ["item101"]
new_item = ["item10001"]
new_vector = query_vector
vicinity_instance.insert(new_item, new_vector[None, :])

results = vicinity_instance.query(query_vector, k=10)
returned_item = results[0][0][0]
results = vicinity_instance.query(query_vector, k=50)

assert returned_item == "item101"
returned_items = [item for item, _ in results[0]]
assert "item10001" in returned_items


def test_vicinity_delete(vicinity_instance: Vicinity, items: list[str], vectors: np.ndarray) -> None:
"""
Test Vicinity.delete method by verifying that the vector for a deleted item is not returned in subsequent queries.
:param backend_type: The backend type to use.
:param vicinity_instance: A Vicinity instance.
:param items: List of item names.
:param vectors: Array of vectors corresponding to items.
"""
if vicinity_instance.backend.backend_type in {Backend.ANNOY, Backend.PYNNDESCENT}:
# Don't test delete for Annoy and Pynndescent backend
# Skip delete for Annoy and Pynndescent backend
return

elif vicinity_instance.backend.backend_type == Backend.FAISS and vicinity_instance.backend.arguments.index_type in {
"hnsw",
"ivfpqr",
}:
# Skip delete test for FAISS index types that do not support deletion
return

# Get the vector corresponding to "item2"
Expand Down Expand Up @@ -154,7 +164,7 @@ def test_vicinity_delete_nonexistent(vicinity_instance: Vicinity) -> None:
:raises ValueError: If deleting items that do not exist.
"""
with pytest.raises(ValueError):
vicinity_instance.delete(["item102"])
vicinity_instance.delete(["item10002"])


def test_vicinity_insert_mismatched_lengths(vicinity_instance: Vicinity, query_vector: np.ndarray) -> None:
Expand All @@ -164,7 +174,7 @@ def test_vicinity_insert_mismatched_lengths(vicinity_instance: Vicinity, query_v
:param vicinity_instance: A Vicinity instance.
:raises ValueError: If tokens and vectors lengths differ.
"""
new_items = ["item102", "item103"]
new_items = ["item10002", "item10003"]
new_vector = query_vector

with pytest.raises(ValueError):
Expand All @@ -178,7 +188,7 @@ def test_vicinity_insert_wrong_dimension(vicinity_instance: Vicinity) -> None:
:param vicinity_instance: A Vicinity instance.
:raises ValueError: If vectors have wrong dimension.
"""
new_item = ["item102"]
new_item = ["item10002"]
new_vector = np.array([[0.5, 0.5, 0.5]])

with pytest.raises(ValueError):
Expand Down
31 changes: 31 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions vicinity/backends/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,10 @@ def get_backend_class(backend: Backend | str) -> type[AbstractBackend]:

return PyNNDescentBackend

elif backend == Backend.FAISS:
from vicinity.backends.faiss import FaissBackend

return FaissBackend


__all__ = ["get_backend_class", "AbstractBackend"]
Loading

0 comments on commit 529eb5f

Please sign in to comment.