Commit

Modified index build data types and expected values to match API contract (#18)

Signed-off-by: Rohan Chitale <[email protected]>
rchitale7 authored Mar 4, 2025
1 parent c989027 commit 1f19156
Showing 6 changed files with 29 additions and 48 deletions.
API.md (7 changes: 5 additions & 2 deletions)
@@ -61,12 +61,15 @@ GET /_status/{job_id}
Request Response:
{
"task_status" : "String", //RUNNING_INDEX_BUILD, FAILED_INDEX_BUILD, COMPLETED_INDEX_BUILD
"index_path" : "String" // Null if not completed
"file_name" : "String"
"error_message": "String"
}
```

Client can expect an error in “error_message” if task_status == `FAILED_INDEX_BUILD`.
* Client can expect an error in `error_message` if `task_status` == `FAILED_INDEX_BUILD`.
* If `task_status` == `COMPLETED_INDEX_BUILD`, then `file_name` is the name of the index file, located
in the same root remote store path as the `vector_path`.
* Otherwise, `file_name` is `null`.
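
For illustration, a minimal sketch of a client that polls this endpoint and applies the rules above; the base URL, the `requests` dependency, and the polling interval are assumptions, not part of the contract:

```python
import time

import requests  # assumed HTTP client; any client would do


def wait_for_index_build(base_url: str, job_id: str, poll_seconds: float = 5.0) -> str:
    """Poll GET /_status/{job_id} until the build completes or fails."""
    while True:
        response = requests.get(f"{base_url}/_status/{job_id}")
        response.raise_for_status()
        body = response.json()

        status = body["task_status"]
        if status == "COMPLETED_INDEX_BUILD":
            # file_name is only populated on completion; it lives in the same
            # root remote store path as the vector_path
            return body["file_name"]
        if status == "FAILED_INDEX_BUILD":
            raise RuntimeError(body.get("error_message") or "index build failed")

        # RUNNING_INDEX_BUILD: wait and poll again
        time.sleep(poll_seconds)
```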


#### Error codes
@@ -18,36 +18,22 @@ class DataType(str, Enum):
"""Supported data types for vector values.
Attributes:
FLOAT32: 32-bit floating point values
FLOAT16: 16-bit floating point values
BYTE: 8-bit integer values
BINARY: Binary data format
FLOAT: 32-bit floating point values
"""

FLOAT32 = "fp32"
FLOAT16 = "fp16"
BYTE = "byte"
BINARY = "binary"
FLOAT = "float"
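
Because the enum subclasses `str`, its values are exactly the strings carried by the API contract. A standalone illustration, re-declaring a one-member copy of the enum so the snippet runs on its own:

```python
from enum import Enum


class DataType(str, Enum):  # one-member mirror of the enum above, for demonstration only
    FLOAT = "float"


assert DataType("float") is DataType.FLOAT  # wire value from the request -> enum member
assert DataType.FLOAT.value == "float"      # enum member -> wire value
```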


class SpaceType(str, Enum):
"""Distance method used for measuring vector similarities.
Attributes:
L2: Euclidean distance
COSINESIMIL: Cosine similarity
L1: Manhattan distance
LINF: Chebyshev distance
INNERPRODUCT: Dot product similarity
HAMMING: Hamming distance for binary vectors
"""

L2 = "l2"
COSINESIMIL = "cosinesimil"
L1 = "l1"
LINF = "linf"
INNERPRODUCT = "innerproduct"
HAMMING = "hamming"


class Algorithm(str, Enum):
@@ -145,7 +131,7 @@ class IndexBuildParameters(BaseModel):
tenant_id: str = ""
dimension: int = Field(gt=0)
doc_count: int = Field(gt=0)
data_type: DataType = DataType.FLOAT32
data_type: DataType = DataType.FLOAT
engine: Engine = Engine.FAISS
index_parameters: IndexParameters = Field(default_factory=IndexParameters)
model_config = ConfigDict(extra="forbid")
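
A minimal sketch of constructing the model after this change; the import path and the field values are illustrative assumptions, with the required fields mirroring the test fixture later in this commit:

```python
from core.common.models import IndexBuildParameters  # import path is an assumption

params = IndexBuildParameters(
    vector_path="my-index/shard0/vectors.knnvec",  # hypothetical object store key
    doc_id_path="my-index/shard0/ids.knndid",      # hypothetical object store key
    dimension=128,
    doc_count=1000,
    repository_type="s3",
    container_name="my-bucket",
)

# the default data type is now DataType.FLOAT, i.e. the wire value "float"
assert params.data_type.value == "float"

# extra="forbid" rejects unknown fields, so a typo in a request field name
# surfaces as a pydantic.ValidationError instead of being silently dropped
```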
@@ -47,14 +47,8 @@ def get_numpy_dtype(dtype: DataType):
Raises:
UnsupportedVectorsDataTypeError: If the provided data type is not supported.
"""
if dtype == DataType.FLOAT32:
if dtype == DataType.FLOAT:
return "<f4"
elif dtype == DataType.FLOAT16:
return "<f2"
elif dtype == DataType.BYTE:
return "<i1"
elif dtype == DataType.BINARY:
return "<i1"
else:
raise UnsupportedVectorsDataTypeError(f"Unsupported data type: {dtype}")
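
Here `"<f4"` is NumPy's little-endian 32-bit float, the only dtype string left after this change. A short sketch of how such a dtype string decodes a raw vector blob; the buffer contents and shape are made up for illustration:

```python
import numpy as np

dtype_str = "<f4"  # little-endian 32-bit float, what DataType.FLOAT maps to

# a fake 2 x 3 vector blob, serialized the way the service would later read it back
raw = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=dtype_str).tobytes()

vectors = np.frombuffer(raw, dtype=dtype_str).reshape(2, 3)
assert vectors.shape == (2, 3)
assert vectors.dtype.kind == "f" and vectors.dtype.itemsize == 4
```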

remote_vector_index_builder/core/tasks.py (11 changes: 8 additions & 3 deletions)
@@ -113,8 +113,10 @@ def upload_index(
Note:
- Creates an object store instance based on the provided configuration
- Uses the vector_path from index_build_params to determine the upload destination
- The upload destination has the same file path as the vector_path
except for the file extension. The file extension is based on the engine
- The index_local_path must exist and be readable
- The function assumes index_build_params has been validated by Pydantic
- The function assumes index_build_params has already been validated by Pydantic
Raises:
BlobError: If there are issues uploading to the object store
@@ -124,7 +126,10 @@
index_build_params, object_store_config
)

# vector_path is unique for each index build request, so we can simply append the local path
index_remote_path = index_build_params.vector_path + index_local_path
# vector path has already been validated that it ends with '.knnvec' by pydantic regex
vector_root_path = ".".join(index_build_params.vector_path.split(".")[0:-1])

# the index path is in the same root location as the vector path
index_remote_path = vector_root_path + "." + index_build_params.engine

object_store.write_blob(index_local_path, index_remote_path)
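
The path derivation above, shown in isolation with hypothetical values: the `.knnvec` suffix is stripped and the engine name becomes the new extension, so the index file lands next to the vectors in the remote store:

```python
# hypothetical values; the real code reads them from index_build_params
vector_path = "my-index/shard0/vectors.knnvec"
engine = "faiss"

vector_root_path = ".".join(vector_path.split(".")[0:-1])  # drop the ".knnvec" extension
index_remote_path = vector_root_path + "." + engine        # reuse the root, swap the extension

assert index_remote_path == "my-index/shard0/vectors.faiss"
```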
@@ -47,10 +47,7 @@ def test_free_vectors_space(vectors_dataset):
@pytest.mark.parametrize(
"dtype, expected",
[
(DataType.FLOAT32, "<f4"),
(DataType.FLOAT16, "<f2"),
(DataType.BYTE, "<i1"),
(DataType.BINARY, "<i1"),
(DataType.FLOAT, "<f4"),
],
)
def test_get_numpy_dtype_valid(dtype, expected):
@@ -73,19 +70,13 @@ def test_check_dimensions_invalid():
VectorsDataset.check_dimensions(vectors, 10)


@pytest.mark.parametrize(
"vector_dtype", [DataType.FLOAT32, DataType.FLOAT16, DataType.BYTE, DataType.BINARY]
)
@pytest.mark.parametrize("vector_dtype", [DataType.FLOAT])
def test_parse_valid_data(vector_dtype):
# Prepare test data
dimension = 3
doc_count = 2

arr = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
if vector_dtype == DataType.BYTE:
arr = [[1, 2, 3], [4, 5, 6]]
elif vector_dtype == DataType.BINARY:
arr = [[0, 0, 0], [1, 1, 1]]

test_vectors = np.array(arr, dtype=VectorsDataset.get_numpy_dtype(vector_dtype))
test_doc_ids = np.array([1, 2], dtype="<i4")
@@ -125,7 +116,7 @@ def test_parse_invalid_doc_count():
doc_ids=doc_ids,
dimension=2,
doc_count=2,
vector_dtype=DataType.FLOAT32,
vector_dtype=DataType.FLOAT,
)
dataset.free_vectors_space()
vectors.close()
@@ -141,7 +132,7 @@ def test_parse_invalid_vector_dimensions():
doc_ids=doc_ids,
dimension=3, # Expecting 6 values (2*3), but only provided 4
doc_count=2,
vector_dtype=DataType.FLOAT32,
vector_dtype=DataType.FLOAT,
)
dataset.free_vectors_space()
vectors.close()
@@ -160,7 +151,7 @@ def test_parse_invalid_data():
doc_ids=doc_ids,
dimension=3,
doc_count=2,
vector_dtype=DataType.FLOAT32,
vector_dtype=DataType.FLOAT,
)
dataset.free_vectors_space()
vectors.close()
test_remote_vector_index_builder/test_core/test_tasks.py (12 changes: 7 additions & 5 deletions)
@@ -15,6 +15,8 @@
from core.object_store.object_store import ObjectStore
from core.tasks import create_vectors_dataset, upload_index

DEFAULT_VECTOR_NAME = "vec"


@pytest.fixture
def mock_object_store():
@@ -41,11 +43,11 @@ def mock_vectors_dataset_parse():
@pytest.fixture
def index_build_params():
return IndexBuildParameters(
vector_path="vec.knnvec",
vector_path=DEFAULT_VECTOR_NAME + ".knnvec",
doc_id_path="doc.knndid",
dimension=128,
doc_count=1000,
data_type="fp32",
data_type="float",
repository_type="s3",
container_name="test-bucket",
)
@@ -128,9 +130,9 @@ def test_successful_upload(
mock_object_store_factory.assert_called_once_with(
index_build_params, object_store_config
)
mock_object_store.write_blob.assert_called_once_with(
local_path, index_build_params.vector_path + local_path
)

remote_path = DEFAULT_VECTOR_NAME + "." + index_build_params.engine
mock_object_store.write_blob.assert_called_once_with(local_path, remote_path)


def test_upload_blob_error_handling(