Skip to content

Commit

Permalink
feat: Enable more VECTOR_INT8 unittest (milvus-io#39569)
Browse files Browse the repository at this point in the history
Issue: milvus-io#38666

Signed-off-by: Cai Yudong <[email protected]>
  • Loading branch information
cydrain authored Jan 24, 2025
1 parent c84a074 commit 5730b69
Show file tree
Hide file tree
Showing 12 changed files with 120 additions and 9 deletions.
2 changes: 1 addition & 1 deletion internal/core/src/segcore/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ CreateVectorDataArray(int64_t count, const FieldMeta& field_meta) {
case DataType::VECTOR_INT8: {
auto length = count * dim;
auto obj = vector_array->mutable_int8_vector();
obj->resize(length * sizeof(int8));
obj->resize(length);
break;
}
default: {
Expand Down
32 changes: 24 additions & 8 deletions internal/core/unittest/test_chunk_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
// or implied. See the License for the specific language governing permissions and limitations under the License

#include <gtest/gtest.h>
#include <stdio.h>

#include "common/Types.h"
#include "knowhere/comp/index_param.h"
Expand Down Expand Up @@ -71,6 +72,8 @@ TEST_F(ChunkVectorTest, FillDataWithMmap) {
"bf16_vec", DataType::VECTOR_BFLOAT16, 128, metric_type);
auto sparse_vec = schema->AddDebugField(
"sparse_vec", DataType::VECTOR_SPARSE_FLOAT, 128, metric_type);
auto int8_vec = schema->AddDebugField(
"int8_vec", DataType::VECTOR_INT8, 128, metric_type);
schema->set_primary_field_id(int64_field);

std::map<std::string, std::string> index_params = {
Expand Down Expand Up @@ -136,6 +139,8 @@ TEST_F(ChunkVectorTest, FillDataWithMmap) {
segment->bulk_subscript(bf16_vec, ids_ds->GetIds(), num_inserted);
auto sparse_vec_result =
segment->bulk_subscript(sparse_vec, ids_ds->GetIds(), num_inserted);
auto int8_vec_result =
segment->bulk_subscript(int8_vec, ids_ds->GetIds(), num_inserted);

EXPECT_EQ(bool_result->scalars().bool_data().data_size(), num_inserted);
EXPECT_EQ(int8_result->scalars().int_data().data_size(), num_inserted);
Expand All @@ -159,6 +164,8 @@ TEST_F(ChunkVectorTest, FillDataWithMmap) {
EXPECT_EQ(
sparse_vec_result->vectors().sparse_float_vector().contents_size(),
num_inserted);
EXPECT_EQ(int8_vec_result->vectors().int8_vector().size(),
num_inserted * dim);
EXPECT_EQ(int_array_result->scalars().array_data().data_size(),
num_inserted);
EXPECT_EQ(long_array_result->scalars().array_data().data_size(),
Expand All @@ -184,24 +191,33 @@ TEST_F(ChunkVectorTest, FillDataWithMmap) {
.data();
auto sparse_vec_res = SparseBytesToRows(
sparse_vec_result->vectors().sparse_float_vector().contents());
auto int8_vec_res = (int8*)int8_vec_result.get()
->mutable_vectors()
->int8_vector()
.data();
EXPECT_TRUE(fp32_vec_res.size() == num_inserted * dim);
auto fp32_vec_gt = dataset.get_col<float>(fp32_vec);
auto fp16_vec_gt = dataset.get_col<float16>(fp16_vec);
auto bf16_vec_gt = dataset.get_col<bfloat16>(bf16_vec);
auto sparse_vec_gt =
dataset.get_col<knowhere::sparse::SparseRow<float>>(sparse_vec);
auto int8_vec_gt = dataset.get_col<int8>(int8_vec);

for (size_t i = 0; i < num_inserted; ++i) {
auto id = ids_ds->GetIds()[i];
// check dense vector
for (size_t j = 0; j < 128; ++j) {
EXPECT_TRUE(fp32_vec_res[i * dim + j] ==
fp32_vec_gt[(id % per_batch) * dim + j]);
EXPECT_TRUE(fp16_vec_res[i * dim + j] ==
fp16_vec_gt[(id % per_batch) * dim + j]);
EXPECT_TRUE(bf16_vec_res[i * dim + j] ==
bf16_vec_gt[(id % per_batch) * dim + j]);
}
EXPECT_TRUE(memcmp((void*)(&fp32_vec_res[i * dim]),
(void*)(&fp32_vec_gt[(id % per_batch) * dim]),
sizeof(float) * dim) == 0);
EXPECT_TRUE(memcmp((void*)(&fp16_vec_res[i * dim]),
(void*)(&fp16_vec_gt[(id % per_batch) * dim]),
sizeof(float16) * dim) == 0);
EXPECT_TRUE(memcmp((void*)(&bf16_vec_res[i * dim]),
(void*)(&bf16_vec_gt[(id % per_batch) * dim]),
sizeof(bfloat16) * dim) == 0);
EXPECT_TRUE(memcmp((void*)(&int8_vec_res[i * dim]),
(void*)(&int8_vec_gt[(id % per_batch) * dim]),
sizeof(int8) * dim) == 0);
//check sparse vector
auto actual_row = sparse_vec_res[i];
auto expected_row = sparse_vec_gt[(id % per_batch)];
Expand Down
18 changes: 18 additions & 0 deletions internal/core/unittest/test_loading.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ class IndexLoadTest : public ::testing::TestWithParam<Param> {
data_type = milvus::DataType::VECTOR_BINARY;
} else if (field_type == "vector_sparse_float") {
data_type = milvus::DataType::VECTOR_SPARSE_FLOAT;
} else if (field_type == "vector_int8") {
data_type = milvus::DataType::VECTOR_INT8;
} else if (field_type == "array") {
data_type = milvus::DataType::ARRAY;
} else {
Expand Down Expand Up @@ -106,6 +108,22 @@ INSTANTIATE_TEST_SUITE_P(
{"mmap", "true"},
{"field_type", "vector_fp16"}},
{0.125f, 1.0f, 0.0f, 1.0f, true}),
std::pair<std::map<std::string, std::string>, LoadResourceRequest>(
{{"index_type", "HNSW"},
{"metric_type", "L2"},
{"efConstrcution", "300"},
{"M", "30"},
{"mmap", "false"},
{"field_type", "vector_int8"}},
{2.0f, 0.0f, 1.0f, 0.0f, true}),
std::pair<std::map<std::string, std::string>, LoadResourceRequest>(
{{"index_type", "HNSW"},
{"metric_type", "L2"},
{"efConstrcution", "300"},
{"M", "30"},
{"mmap", "true"},
{"field_type", "vector_int8"}},
{0.125f, 1.0f, 0.0f, 1.0f, true}),
std::pair<std::map<std::string, std::string>, LoadResourceRequest>(
{{"index_type", "IVFFLAT"},
{"metric_type", "L2"},
Expand Down
7 changes: 7 additions & 0 deletions internal/core/unittest/test_sealed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2197,6 +2197,8 @@ TEST(Sealed, QueryAllFields) {
"float16_vec", DataType::VECTOR_FLOAT16, 128, metric_type);
auto bfloat16_vec = schema->AddDebugField(
"bfloat16_vec", DataType::VECTOR_BFLOAT16, 128, metric_type);
auto int8_vec = schema->AddDebugField(
"int8_vec", DataType::VECTOR_INT8, 128, metric_type);
schema->set_primary_field_id(int64_field);

std::map<std::string, std::string> index_params = {
Expand Down Expand Up @@ -2235,6 +2237,7 @@ TEST(Sealed, QueryAllFields) {
auto vector_values = dataset.get_col<float>(vec);
auto float16_vector_values = dataset.get_col<uint8_t>(float16_vec);
auto bfloat16_vector_values = dataset.get_col<uint8_t>(bfloat16_vec);
auto int8_vector_values = dataset.get_col<int8>(int8_vec);

auto ids_ds = GenRandomIds(dataset_size);
auto bool_result =
Expand Down Expand Up @@ -2273,6 +2276,8 @@ TEST(Sealed, QueryAllFields) {
segment->bulk_subscript(float16_vec, ids_ds->GetIds(), dataset_size);
auto bfloat16_vec_result =
segment->bulk_subscript(bfloat16_vec, ids_ds->GetIds(), dataset_size);
auto int8_vec_result =
segment->bulk_subscript(int8_vec, ids_ds->GetIds(), dataset_size);

EXPECT_EQ(bool_result->scalars().bool_data().data_size(), dataset_size);
EXPECT_EQ(int8_result->scalars().int_data().data_size(), dataset_size);
Expand All @@ -2290,6 +2295,8 @@ TEST(Sealed, QueryAllFields) {
dataset_size * dim * 2);
EXPECT_EQ(bfloat16_vec_result->vectors().bfloat16_vector().size(),
dataset_size * dim * 2);
EXPECT_EQ(int8_vec_result->vectors().int8_vector().size(),
dataset_size * dim);
EXPECT_EQ(int_array_result->scalars().array_data().data_size(),
dataset_size);
EXPECT_EQ(long_array_result->scalars().array_data().data_size(),
Expand Down
1 change: 1 addition & 0 deletions internal/core/unittest/test_utils/DataGen.h
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,7 @@ inline GeneratedData DataGen(SchemaPtr schema,
case DataType::VECTOR_INT8: {
auto dim = field_meta.get_dim();
vector<int8> final(dim * N);
srand(seed);
for (auto& x : final) {
x = int8_t(rand() % 256 - 128);
}
Expand Down
2 changes: 2 additions & 0 deletions internal/parser/planparserv2/plan_parser_v2.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,8 @@ func CreateSearchPlan(schema *typeutil.SchemaHelper, exprStr string, vectorField
vectorType = planpb.VectorType_BFloat16Vector
case schemapb.DataType_SparseFloatVector:
vectorType = planpb.VectorType_SparseFloatVector
case schemapb.DataType_Int8Vector:
vectorType = planpb.VectorType_Int8Vector
default:
log.Error("Invalid dataType", zap.Any("dataType", dataType))
return nil, err
Expand Down
7 changes: 7 additions & 0 deletions internal/storage/serde.go
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,13 @@ var serdeMap = func() map[schemapb.DataType]serdeEntry {
fixedSizeDeserializer,
fixedSizeSerializer,
}
m[schemapb.DataType_Int8Vector] = serdeEntry{
func(i int) arrow.DataType {
return &arrow.FixedSizeBinaryType{ByteWidth: i}
},
fixedSizeDeserializer,
fixedSizeSerializer,
}
m[schemapb.DataType_FloatVector] = serdeEntry{
func(i int) arrow.DataType {
return &arrow.FixedSizeBinaryType{ByteWidth: i * 4}
Expand Down
32 changes: 32 additions & 0 deletions tests/integration/getvector/get_vector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ func (s *TestGetVectorSuite) run() {
vecFieldData = integration.NewBFloat16VectorFieldData(vecFieldName, NB, dim)
} else if typeutil.IsSparseFloatVectorType(s.vecType) {
vecFieldData = integration.NewSparseFloatVectorFieldData(vecFieldName, NB)
} else if s.vecType == schemapb.DataType_Int8Vector {
vecFieldData = integration.NewInt8VectorFieldData(vecFieldName, NB, dim)
} else {
vecFieldData = integration.NewBinaryVectorFieldData(vecFieldName, NB, dim)
}
Expand Down Expand Up @@ -294,6 +296,26 @@ func (s *TestGetVectorSuite) run() {
s.Require().Equal(rawData[id], resData[i])
}
}
} else if s.vecType == schemapb.DataType_Int8Vector {
s.Require().Len(result.GetFieldsData()[vecFieldIndex].GetVectors().GetInt8Vector(), nq*topk*dim)
rawData := vecFieldData.GetVectors().GetInt8Vector()
resData := result.GetFieldsData()[vecFieldIndex].GetVectors().GetInt8Vector()
rowBytes := dim
if s.pkType == schemapb.DataType_Int64 {
for i, id := range result.GetIds().GetIntId().GetData() {
expect := rawData[int(id)*rowBytes : (int(id)+1)*rowBytes]
actual := resData[i*rowBytes : (i+1)*rowBytes]
s.Require().ElementsMatch(expect, actual)
}
} else {
for i, idStr := range result.GetIds().GetStrId().GetData() {
id, err := strconv.Atoi(idStr)
s.Require().NoError(err)
expect := rawData[id*rowBytes : (id+1)*rowBytes]
actual := resData[i*rowBytes : (i+1)*rowBytes]
s.Require().ElementsMatch(expect, actual)
}
}
} else {
s.Require().Len(result.GetFieldsData()[vecFieldIndex].GetVectors().GetBinaryVector(), nq*topk*dim/8)
rawData := vecFieldData.GetVectors().GetBinaryVector()
Expand Down Expand Up @@ -448,6 +470,16 @@ func (s *TestGetVectorSuite) TestGetVector_BFloat16Vector() {
s.run()
}

// TestGetVector_Int8Vector exercises vector retrieval for an Int8Vector
// field built with an HNSW index under the L2 metric, using an Int64
// primary key and the suite's default nq/topK of 10.
func (s *TestGetVectorSuite) TestGetVector_Int8Vector() {
	s.pkType = schemapb.DataType_Int64
	s.vecType = schemapb.DataType_Int8Vector
	s.indexType = integration.IndexHNSW
	s.metricType = metric.L2
	s.nq = 10
	s.topK = 10
	s.run()
}

func (s *TestGetVectorSuite) TestGetVector_Big_NQ_TOPK() {
s.T().Skip("skip big NQ Top due to timeout")
s.nq = 10000
Expand Down
5 changes: 5 additions & 0 deletions tests/integration/import/import_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,11 @@ func (s *BulkInsertSuite) TestMultiFileTypes() {
s.metricType = metric.L2
s.run()

s.vecType = schemapb.DataType_Int8Vector
s.indexType = "HNSW"
s.metricType = metric.L2
s.run()

// TODO: not support numpy for SparseFloatVector by now
if fileType != importutilv2.Numpy {
s.vecType = schemapb.DataType_SparseFloatVector
Expand Down
11 changes: 11 additions & 0 deletions tests/integration/import/util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,17 @@ func GenerateNumpyFiles(cm storage.ChunkManager, schema *schemapb.CollectionSche
data = chunkedRows
case schemapb.DataType_SparseFloatVector:
data = insertData.Data[fieldID].(*storage.SparseFloatVectorFieldData).GetContents()
case schemapb.DataType_Int8Vector:
rows := insertData.Data[fieldID].GetDataRows().([]int8)
if dim != fieldData.(*storage.Int8VectorFieldData).Dim {
panic(fmt.Sprintf("dim mis-match: %d, %d", dim, fieldData.(*storage.Int8VectorFieldData).Dim))
}
chunked := lo.Chunk(rows, dim)
chunkedRows := make([][dim]int8, len(chunked))
for i, innerSlice := range chunked {
copy(chunkedRows[i][:], innerSlice)
}
data = chunkedRows
default:
data = insertData.Data[fieldID].GetDataRows()
}
Expand Down
4 changes: 4 additions & 0 deletions tests/integration/util_insert.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,10 @@ func NewSparseFloatVectorFieldData(fieldName string, numRows int) *schemapb.Fiel
return testutils.NewSparseFloatVectorFieldData(fieldName, numRows)
}

// NewInt8VectorFieldData builds a FieldData payload holding numRows
// int8 vectors of the given dimension, delegating generation to the
// shared testutils helper so all integration tests use the same data
// shape. Mirrors the sibling New*VectorFieldData wrappers above.
func NewInt8VectorFieldData(fieldName string, numRows, dim int) *schemapb.FieldData {
return testutils.NewInt8VectorFieldData(fieldName, numRows, dim)
}

func GenerateInt64Array(numRows int, start int64) []int64 {
ret := make([]int64, numRows)
for i := 0; i < numRows; i++ {
Expand Down
8 changes: 8 additions & 0 deletions tests/integration/util_query.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/milvus-io/milvus/pkg/util/merr"
"github.com/milvus-io/milvus/pkg/util/metricsinfo"
"github.com/milvus-io/milvus/pkg/util/testutils"
"github.com/milvus-io/milvus/pkg/util/typeutil"
)

const (
Expand Down Expand Up @@ -328,6 +329,13 @@ func constructPlaceholderGroup(nq, dim int, vectorType schemapb.DataType) *commo
placeholderType = commonpb.PlaceholderType_SparseFloatVector
sparseVecs := GenerateSparseFloatArray(nq)
values = append(values, sparseVecs.Contents...)
case schemapb.DataType_Int8Vector:
placeholderType = commonpb.PlaceholderType_Int8Vector
data := testutils.GenerateInt8Vectors(nq, dim)
for i := 0; i < nq; i++ {
rowBytes := dim
values = append(values, typeutil.Int8ArrayToBytes(data[rowBytes*i:rowBytes*(i+1)]))
}
default:
panic("invalid vector data type")
}
Expand Down

0 comments on commit 5730b69

Please sign in to comment.