Add basic sparse float vector support to pymilvus. For now only scipy.sparse.csr_matrix as input is supported.
Signed-off-by: Buqian Zheng <[email protected]>
1 parent d53344c, commit 81baaaa
Showing 21 changed files with 1,062 additions and 803 deletions.
@@ -0,0 +1,153 @@
# hello_sparse.py demonstrates the basic operations of PyMilvus, a Python SDK of Milvus,
# while operating on sparse float vectors.
# 1. connect to Milvus
# 2. create collection
# 3. insert data
# 4. create index
# 5. search, query, and hybrid search on entities
# 6. delete entities by PK
# 7. drop collection
import time

import numpy as np
from scipy.sparse import rand
from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)

fmt = "=== {:30} ==="
search_latency_fmt = "search latency = {:.4f}s"
num_entities, dim, density = 10000, 60000, 0.00005
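# with dim = 60000 and density = 0.00005, each sparse vector has on average
# dim * density = 3 non-zero values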

def log(msg):
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + " " + msg)

# -----------------------------------------------------------------------------
# connect to Milvus
log(fmt.format("start connecting to Milvus"))
connections.connect("default", host="localhost", port="19530")

has = utility.has_collection("hello_sparse")
log(f"Does collection hello_sparse exist in Milvus: {has}")

# -----------------------------------------------------------------------------
# create collection with a sparse float vector column
hello_sparse = None
if not has:
    fields = [
        FieldSchema(name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=True, max_length=100),
        FieldSchema(name="random", dtype=DataType.DOUBLE),
        FieldSchema(name="embeddings", dtype=DataType.SPARSE_FLOAT_VECTOR),
    ]
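    # note: unlike dense vector fields, the SPARSE_FLOAT_VECTOR field above is
    # declared without a dim; the dimensionality is implied by the indices of
    # the non-zero values in each inserted vector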
    schema = CollectionSchema(fields, "hello_sparse is the simplest demo to introduce sparse float vector usage")
    log(fmt.format("Create collection `hello_sparse`"))
    hello_sparse = Collection("hello_sparse", schema, consistency_level="Strong")
else:
    hello_sparse = Collection("hello_sparse")

# -----------------------------------------------------------------------------
# create index
if not hello_sparse.has_index():
    log(fmt.format("Start Creating index SPARSE_INVERTED_INDEX"))
    index = {
        "index_type": "SPARSE_INVERTED_INDEX",
        "metric_type": "IP",
        "params": {
            "drop_ratio_build": 0.2,
        }
    }
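    # drop_ratio_build is the proportion of the smallest vector values dropped
    # while building the index, trading a little accuracy for a smaller and
    # faster index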
    hello_sparse.create_index("embeddings", index)

log(fmt.format("Start loading"))
hello_sparse.load()

log(f"hello_sparse has {hello_sparse.num_entities} entities({hello_sparse.num_entities/1000000}M), indexed {hello_sparse.has_index()}")

# -----------------------------------------------------------------------------
# insert
log(fmt.format("Start inserting entities"))
rng = np.random.default_rng(seed=19530)
matrix_csr = rand(num_entities, dim, density=density, format='csr')
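# scipy.sparse.rand with format='csr' returns a csr_matrix whose rows are the
# sparse vectors; as of this commit a csr_matrix is the only supported sparse
# input, and it is passed directly as the embeddings column below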
entities = [
    rng.random(num_entities).tolist(),
    matrix_csr,
]

insert_result = hello_sparse.insert(entities)

# -----------------------------------------------------------------------------
# search based on vector similarity
log(fmt.format("Start searching based on vector similarity"))
vectors_to_search = entities[-1][-1:]
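# entities[-1] is the csr_matrix; slicing with [-1:] keeps it two-dimensional,
# i.e. a 1 x dim csr_matrix holding only the last inserted vector, so the
# query is still a (row) matrix of sparse vectors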
search_params = {
    "metric_type": "IP",
    "params": {
        "drop_ratio_search": "0.2",
    }
}
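# drop_ratio_search is the proportion of the smallest values in the query
# vector ignored at search time; like drop_ratio_build it trades a bit of
# recall for speed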

start_time = time.time()
result = hello_sparse.search(vectors_to_search, "embeddings", search_params, limit=3, output_fields=["pk", "random", "embeddings"])
end_time = time.time()

for hits in result:
    for hit in hits:
        print(f"hit: {hit}")
log(search_latency_fmt.format(end_time - start_time))

# -----------------------------------------------------------------------------
# query based on scalar filtering(boolean, int, etc.)
print(fmt.format("Start querying with `random > 0.5`"))

start_time = time.time()
result = hello_sparse.query(expr="random > 0.5", output_fields=["random", "embeddings"])
end_time = time.time()

print(f"query result:\n-{result[0]}")
print(search_latency_fmt.format(end_time - start_time))

# -----------------------------------------------------------------------------
# pagination
r1 = hello_sparse.query(expr="random > 0.5", limit=4, output_fields=["random"])
r2 = hello_sparse.query(expr="random > 0.5", offset=1, limit=3, output_fields=["random"])
print(f"query pagination(limit=4):\n\t{r1}")
print(f"query pagination(offset=1, limit=3):\n\t{r2}")


# -----------------------------------------------------------------------------
# hybrid search
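# in this example "hybrid search" means a vector similarity search combined
# with a scalar filter expression (expr="random > 0.5")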
print(fmt.format("Start hybrid searching with `random > 0.5`")) | ||
|
||
start_time = time.time() | ||
result = hello_sparse.search(vectors_to_search, "embeddings", search_params, limit=3, expr="random > 0.5", output_fields=["random"]) | ||
end_time = time.time() | ||
|
||
for hits in result: | ||
for hit in hits: | ||
print(f"hit: {hit}, random field: {hit.entity.get('random')}") | ||
print(search_latency_fmt.format(end_time - start_time)) | ||
|
||
# ----------------------------------------------------------------------------- | ||
# delete entities by PK | ||
# You can delete entities by their PK values using boolean expressions. | ||
ids = insert_result.primary_keys | ||
|
||
expr = f'pk in ["{ids[0]}" , "{ids[1]}"]' | ||
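# pk is an auto_id VARCHAR field, so the generated primary keys are strings
# and must be quoted inside the expression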
print(fmt.format(f"Start deleting with expr `{expr}`")) | ||
|
||
result = hello_sparse.query(expr=expr, output_fields=["random", "embeddings"]) | ||
print(f"query before delete by expr=`{expr}` -> result: \n-{result[0]}\n-{result[1]}\n") | ||
|
||
hello_sparse.delete(expr) | ||
|
||
result = hello_sparse.query(expr=expr, output_fields=["random", "embeddings"]) | ||
print(f"query after delete by expr=`{expr}` -> result: {result}\n") | ||
|
||
|
||
# ----------------------------------------------------------------------------- | ||
# drop collection | ||
print(fmt.format("Drop collection `hello_sparse`")) | ||
utility.drop_collection("hello_sparse") |