diff --git a/Makefile b/Makefile index ad028ec3d3e6a..3e7d75abbd649 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ INSTALL_PATH := $(PWD)/bin LIBRARY_PATH := $(PWD)/lib PGO_PATH := $(PWD)/configs/pgo OS := $(shell uname -s) -mode = Release +mode = Debug use_disk_index = OFF ifdef disk_index diff --git a/internal/core/src/common/Common.cpp b/internal/core/src/common/Common.cpp index b51c86e374bc4..c4eaf2a0b533f 100644 --- a/internal/core/src/common/Common.cpp +++ b/internal/core/src/common/Common.cpp @@ -29,6 +29,8 @@ int64_t LOW_PRIORITY_THREAD_CORE_COEFFICIENT = int CPU_NUM = DEFAULT_CPU_NUM; int64_t EXEC_EVAL_EXPR_BATCH_SIZE = DEFAULT_EXEC_EVAL_EXPR_BATCH_SIZE; +int64_t COPY_STR_D = 0; + void SetIndexSliceSize(const int64_t size) { FILE_SLICE_SIZE = size << 20; diff --git a/internal/core/src/common/Common.h b/internal/core/src/common/Common.h index 49fcbcb7c8592..13e3ff9a11691 100644 --- a/internal/core/src/common/Common.h +++ b/internal/core/src/common/Common.h @@ -30,6 +30,8 @@ extern int64_t LOW_PRIORITY_THREAD_CORE_COEFFICIENT; extern int CPU_NUM; extern int64_t EXEC_EVAL_EXPR_BATCH_SIZE; +extern int64_t COPY_STR_D; + void SetIndexSliceSize(const int64_t size); diff --git a/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp b/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp index ce45f09333132..b551c35e4ea45 100644 --- a/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp +++ b/internal/core/src/segcore/ChunkedSegmentSealedImpl.cpp @@ -1440,7 +1440,16 @@ ChunkedSegmentSealedImpl::bulk_subscript_ptr_impl( auto field = reinterpret_cast*>(column); for (int64_t i = 0; i < count; ++i) { auto offset = seg_offsets[i]; - dst->at(i) = std::move(T(field->RawAt(offset))); + std::string_view t = field->RawAt(offset); + std::chrono::high_resolution_clock::time_point start = + std::chrono::high_resolution_clock::now(); + dst->at(i) = std::move(T(t)); + + std::chrono::high_resolution_clock::time_point end = + std::chrono::high_resolution_clock::now(); + COPY_STR_D += + std::chrono::duration_cast(end - start) + .count(); } } diff --git a/internal/core/src/segcore/ChunkedSegmentSealedImpl.h b/internal/core/src/segcore/ChunkedSegmentSealedImpl.h index 60314f019ec26..a6518d129f9e6 100644 --- a/internal/core/src/segcore/ChunkedSegmentSealedImpl.h +++ b/internal/core/src/segcore/ChunkedSegmentSealedImpl.h @@ -234,7 +234,7 @@ class ChunkedSegmentSealedImpl : public SegmentSealed { return insert_record_.timestamps_; } - private: + public: template static void bulk_subscript_impl(const void* src_raw, diff --git a/internal/core/src/segcore/SegmentSealedImpl.cpp b/internal/core/src/segcore/SegmentSealedImpl.cpp index bfd847df1f753..139dfbc455e3d 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.cpp +++ b/internal/core/src/segcore/SegmentSealedImpl.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -29,6 +30,7 @@ #include "Utils.h" #include "Types.h" #include "common/Array.h" +#include "common/Common.h" #include "common/Consts.h" #include "common/EasyAssert.h" #include "common/FieldData.h" @@ -1345,7 +1347,15 @@ SegmentSealedImpl::bulk_subscript_ptr_impl( auto field = reinterpret_cast*>(column); for (int64_t i = 0; i < count; ++i) { auto offset = seg_offsets[i]; - dst->at(i) = std::move(T(field->RawAt(offset))); + std::string_view t = field->RawAt(offset); + std::chrono::high_resolution_clock::time_point start = + std::chrono::high_resolution_clock::now(); + dst->at(i) = std::move(T(t)); + std::chrono::high_resolution_clock::time_point end = + std::chrono::high_resolution_clock::now(); + COPY_STR_D += + std::chrono::duration_cast(end - start) + .count(); } } diff --git a/internal/core/src/segcore/SegmentSealedImpl.h b/internal/core/src/segcore/SegmentSealedImpl.h index cc16f5568a831..37524d22e7ec7 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.h +++ b/internal/core/src/segcore/SegmentSealedImpl.h @@ -240,7 +240,7 @@ class SegmentSealedImpl : public SegmentSealed { return insert_record_.timestamps_; } - private: + public: template static void bulk_subscript_impl(const void* src_raw, diff --git a/internal/core/unittest/CMakeLists.txt b/internal/core/unittest/CMakeLists.txt index 56123c7ef06e6..8b21bbdb1cb47 100644 --- a/internal/core/unittest/CMakeLists.txt +++ b/internal/core/unittest/CMakeLists.txt @@ -24,69 +24,69 @@ add_definitions(-DMILVUS_TEST_SEGCORE_YAML_PATH="${CMAKE_SOURCE_DIR}/unittest/te set(MILVUS_TEST_FILES init_gtest.cpp - test_always_true_expr.cpp - test_array_bitmap_index.cpp - test_array_inverted_index.cpp - test_bf.cpp - test_bf_sparse.cpp - test_binary.cpp - test_binlog_index.cpp - test_bitmap_index.cpp - test_bool_index.cpp - test_c_api.cpp - test_chunk_cache.cpp - test_chunk.cpp - test_chunk_vector.cpp - test_common.cpp - test_concurrent_vector.cpp - test_c_stream_reduce.cpp - test_c_tokenizer.cpp - test_loading.cpp - test_data_codec.cpp - test_disk_file_manager_test.cpp - test_exec.cpp - test_expr.cpp - test_expr_materialized_view.cpp - test_float16.cpp - test_function.cpp - test_futures.cpp - test_group_by.cpp - test_growing.cpp - test_growing_index.cpp - test_hybrid_index.cpp - test_index_c_api.cpp - test_indexing.cpp - test_index_wrapper.cpp - test_init.cpp - test_integer_overflow.cpp - test_inverted_index.cpp - test_local_chunk_manager.cpp - test_mmap_chunk_manager.cpp - test_monitor.cpp - test_offset_ordered_array.cpp - test_offset_ordered_map.cpp - test_plan_proto.cpp - test_query.cpp - test_range_search_sort.cpp - test_reduce_c.cpp - test_reduce.cpp - test_regex_query.cpp - test_regex_query_util.cpp - test_relational.cpp - test_retrieve.cpp - test_scalar_index.cpp - test_sealed.cpp - test_segcore.cpp - test_similarity_corelation.cpp - test_span.cpp - test_storage.cpp - test_string_expr.cpp - test_text_match.cpp - test_timestamp_index.cpp - test_tracer.cpp - test_utils.cpp + # test_always_true_expr.cpp + # test_array_bitmap_index.cpp + # test_array_inverted_index.cpp + # test_bf.cpp + # test_bf_sparse.cpp + # test_binary.cpp + # test_binlog_index.cpp + # test_bitmap_index.cpp + # test_bool_index.cpp + # test_c_api.cpp + # test_chunk_cache.cpp + # test_chunk.cpp + # test_chunk_vector.cpp + # test_common.cpp + # test_concurrent_vector.cpp + # test_c_stream_reduce.cpp + # test_c_tokenizer.cpp + # test_loading.cpp + # test_data_codec.cpp + # test_disk_file_manager_test.cpp + # test_exec.cpp + # test_expr.cpp + # test_expr_materialized_view.cpp + # test_float16.cpp + # test_function.cpp + # test_futures.cpp + # test_group_by.cpp + # test_growing.cpp + # test_growing_index.cpp + # test_hybrid_index.cpp + # test_index_c_api.cpp + # test_indexing.cpp + # test_index_wrapper.cpp + # test_init.cpp + # test_integer_overflow.cpp + # test_inverted_index.cpp + # test_local_chunk_manager.cpp + # test_mmap_chunk_manager.cpp + # test_monitor.cpp + # test_offset_ordered_array.cpp + # test_offset_ordered_map.cpp + # test_plan_proto.cpp + # test_query.cpp + # test_range_search_sort.cpp + # test_reduce_c.cpp + # test_reduce.cpp + # test_regex_query.cpp + # test_regex_query_util.cpp + # test_relational.cpp + # test_retrieve.cpp + # test_scalar_index.cpp + # test_sealed.cpp + # test_segcore.cpp + # test_similarity_corelation.cpp + # test_span.cpp + # test_storage.cpp + # test_string_expr.cpp + # test_text_match.cpp + # test_timestamp_index.cpp + # test_tracer.cpp + # test_utils.cpp test_chunked_segment.cpp - test_chunked_column.cpp + # test_chunked_column.cpp ) if ( INDEX_ENGINE STREQUAL "cardinal" ) @@ -103,23 +103,23 @@ if ( BUILD_DISK_ANN STREQUAL "ON" ) ) endif() -if (LINUX OR APPLE) - set(MILVUS_TEST_FILES - ${MILVUS_TEST_FILES} - test_scalar_index_creator.cpp - test_string_index.cpp - test_array.cpp test_array_expr.cpp) -endif() +# if (LINUX OR APPLE) +# set(MILVUS_TEST_FILES +# ${MILVUS_TEST_FILES} +# test_scalar_index_creator.cpp +# test_string_index.cpp +# test_array.cpp test_array_expr.cpp) +# endif() -if (DEFINED AZURE_BUILD_DIR) - set(MILVUS_TEST_FILES - ${MILVUS_TEST_FILES} - test_azure_chunk_manager.cpp - #need update aws-sdk-cpp, see more from https://github.com/aws/aws-sdk-cpp/issues/2119 - #test_remote_chunk_manager.cpp - ) - include_directories("${AZURE_BUILD_DIR}/vcpkg_installed/${VCPKG_TARGET_TRIPLET}/include") -endif() +# if (DEFINED AZURE_BUILD_DIR) +# set(MILVUS_TEST_FILES +# ${MILVUS_TEST_FILES} +# test_azure_chunk_manager.cpp +# #need update aws-sdk-cpp, see more from https://github.com/aws/aws-sdk-cpp/issues/2119 +# #test_remote_chunk_manager.cpp +# ) +# include_directories("${AZURE_BUILD_DIR}/vcpkg_installed/${VCPKG_TARGET_TRIPLET}/include") +# endif() if (ENABLE_GCP_NATIVE) add_definitions(-DENABLE_GCP_NATIVE) @@ -129,33 +129,33 @@ if (ENABLE_GCP_NATIVE) ) endif() -if (LINUX) - message( STATUS "Building Milvus Unit Test on Linux") - option(USE_ASAN "Whether to use AddressSanitizer" OFF) - if ( USE_ASAN ) - message( STATUS "Building Milvus using AddressSanitizer") - add_compile_options(-fno-stack-protector -fno-omit-frame-pointer -fno-var-tracking -fsanitize=address) - add_link_options(-fno-stack-protector -fno-omit-frame-pointer -fno-var-tracking -fsanitize=address) - endif() - - # check if memory leak exists in index builder - set(INDEX_BUILDER_TEST_FILES - test_index_wrapper.cpp - test_scalar_index_creator.cpp - test_index_c_api.cpp - ) - - add_executable(index_builder_test - ${INDEX_BUILDER_TEST_FILES} - ) - - target_link_libraries(index_builder_test - gtest - milvus_core - knowhere - ) - install(TARGETS index_builder_test DESTINATION unittest) -endif() +# if (LINUX) +# message( STATUS "Building Milvus Unit Test on Linux") +# option(USE_ASAN "Whether to use AddressSanitizer" OFF) +# if ( USE_ASAN ) +# message( STATUS "Building Milvus using AddressSanitizer") +# add_compile_options(-fno-stack-protector -fno-omit-frame-pointer -fno-var-tracking -fsanitize=address) +# add_link_options(-fno-stack-protector -fno-omit-frame-pointer -fno-var-tracking -fsanitize=address) +# endif() + +# # check if memory leak exists in index builder +# set(INDEX_BUILDER_TEST_FILES +# test_index_wrapper.cpp +# test_scalar_index_creator.cpp +# test_index_c_api.cpp +# ) + +# add_executable(index_builder_test +# ${INDEX_BUILDER_TEST_FILES} +# ) + +# target_link_libraries(index_builder_test +# gtest +# milvus_core +# knowhere +# ) +# install(TARGETS index_builder_test DESTINATION unittest) +# endif() add_executable(all_tests ${MILVUS_TEST_FILES} @@ -169,9 +169,9 @@ target_link_libraries(all_tests install(TARGETS all_tests DESTINATION unittest) -if (LINUX) - add_subdirectory(bench) -endif () +# if (LINUX) +# add_subdirectory(bench) +# endif () # if (USE_DYNAMIC_SIMD) # add_executable(dynamic_simd_test diff --git a/internal/core/unittest/test_chunked_segment.cpp b/internal/core/unittest/test_chunked_segment.cpp index 84987ccdbacf7..69aef7f0648ea 100644 --- a/internal/core/unittest/test_chunked_segment.cpp +++ b/internal/core/unittest/test_chunked_segment.cpp @@ -11,26 +11,36 @@ #include #include +#include +#include #include #include "arrow/table_builder.h" #include "arrow/type_fwd.h" #include "common/BitsetView.h" +#include "common/ChunkWriter.h" +#include "common/Common.h" #include "common/Consts.h" #include "common/FieldDataInterface.h" +#include "common/FieldMeta.h" +#include "common/File.h" #include "common/QueryInfo.h" #include "common/Schema.h" #include "common/Types.h" #include "expr/ITypeExpr.h" +#include "gtest/gtest.h" #include "index/IndexFactory.h" #include "index/IndexInfo.h" #include "index/Meta.h" #include "knowhere/comp/index_param.h" #include "mmap/ChunkedColumn.h" +#include "mmap/Column.h" #include "mmap/Types.h" +#include "mmap/Utils.h" #include "pb/plan.pb.h" #include "pb/schema.pb.h" #include "query/ExecPlanNodeVisitor.h" #include "query/SearchOnSealed.h" +#include "segcore/ChunkedSegmentSealedImpl.h" #include "segcore/SegcoreConfig.h" #include "segcore/SegmentSealed.h" #include "segcore/SegmentSealedImpl.h" @@ -39,7 +49,9 @@ #include #include #include +#include #include +#include #include struct DeferRelease { @@ -369,3 +381,178 @@ TEST_F(TestChunkSegment, TestCompareExpr) { plan, segment.get(), chunk_num * test_data_count, MAX_TIMESTAMP); ASSERT_EQ(chunk_num * test_data_count, final.count()); } + +class TestRetrievePerf1 + : public ::testing::TestWithParam> {}; +INSTANTIATE_TEST_SUITE_P(PerfParam1, + TestRetrievePerf1, + testing::Combine(testing::Values(20, 100, 1000), + testing::Values(50, 200, 2000, 65536), + testing::Values(true, false))); +TEST_P(TestRetrievePerf1, StringPerf) { + int chunk_num = std::get<0>(GetParam()); + int test_data_count = 100000 / chunk_num; + auto str_builder = std::make_shared(); + int str_size = std::get<1>(GetParam()); + for (int i = 0; i < test_data_count; i++) { + auto status = str_builder->Append(std::string(str_size, 'a')); + ASSERT_TRUE(status.ok()); + } + std::shared_ptr arrow_str; + auto status = str_builder->Finish(&arrow_str); + ASSERT_TRUE(status.ok()); + + auto arrow_str_field = arrow::field("string1", arrow::int64()); + auto arrow_schema = + std::make_shared(arrow::FieldVector(1, arrow_str_field)); + + auto schema = std::make_shared(); + auto pk_fid = schema->AddDebugField("pk", DataType::INT64, true); + auto str_fid = schema->AddDebugField("string1", DataType::VARCHAR, true); + schema->AddField(FieldName("ts"), TimestampFieldID, DataType::INT64, true); + schema->set_primary_field_id(pk_fid); + auto segment = + segcore::CreateSealedSegment(schema, + nullptr, + -1, + segcore::SegcoreConfig::default_config(), + false, + true, + true); + FieldDataInfo field_info; + field_info.field_id = str_fid.get(); + field_info.row_count = test_data_count * chunk_num; + for (int i = 0; i < chunk_num; i++) { + auto record_batch = arrow::RecordBatch::Make( + arrow_schema, arrow_str->length(), {arrow_str}); + auto reader = + arrow::RecordBatchReader::Make({record_batch}).ValueOrDie(); + + field_info.arrow_reader_channel->push( + std::make_shared(reader, nullptr, nullptr)); + } + field_info.arrow_reader_channel->close(); + + bool is_mmap = std::get<2>(GetParam()); + if (is_mmap) { + auto temp_dir = boost::filesystem::temp_directory_path() / + boost::filesystem::unique_path() / + fmt::format("{}-{}", str_size, test_data_count); + boost::filesystem::create_directories(temp_dir); + field_info.mmap_dir_path = temp_dir.native(); + segment->MapFieldData(str_fid, field_info); + } else { + segment->LoadFieldData(str_fid, field_info); + } + + std::vector segment_offsets; + for (int i = 0; i < 100; i++) { + segment_offsets.push_back(i * 256 + 128); + } + + std::chrono::high_resolution_clock::time_point start = + std::chrono::high_resolution_clock::now(); + segment->bulk_subscript( + str_fid, segment_offsets.data(), segment_offsets.size()); + std::chrono::high_resolution_clock::time_point end = + std::chrono::high_resolution_clock::now(); + std::cout << fmt::format( + "chunk num: {}, str size: {}, mmap: {}, time cost: {}, " + "segment off: {}, copy_duration: {}", + chunk_num, + str_size, + is_mmap, + std::chrono::duration_cast( + end - start) + .count(), + segment_offsets.size(), + COPY_STR_D) + << std::endl; + COPY_STR_D = 0; +} + +class TestRetrievePerf2 + : public ::testing::TestWithParam> {}; +INSTANTIATE_TEST_SUITE_P(PerfParam2, + TestRetrievePerf2, + testing::Combine(testing::Values(50, 200, 2000, 65536), + testing::Values(true, false))); +TEST_P(TestRetrievePerf2, StringPerf) { + int test_data_count = 100000; + auto str_builder = std::make_shared(); + int str_size = std::get<0>(GetParam()); + for (int i = 0; i < test_data_count / 10; i++) { + auto status = str_builder->Append(std::string(str_size, 'a')); + if (!status.ok()) { + std::cout << status.message() << std::endl; + } + ASSERT_TRUE(status.ok()); + } + std::shared_ptr arrow_str; + auto status = str_builder->Finish(&arrow_str); + ASSERT_TRUE(status.ok()); + + auto schema = std::make_shared(); + auto pk_fid = schema->AddDebugField("pk", DataType::INT64, true); + auto str_fid = schema->AddDebugField("string1", DataType::VARCHAR, true); + schema->AddField(FieldName("ts"), TimestampFieldID, DataType::INT64, true); + schema->set_primary_field_id(pk_fid); + auto segment = + segcore::CreateSealedSegment(schema, + nullptr, + -1, + segcore::SegcoreConfig::default_config(), + false, + true, + false); + + FieldDataInfo field_info; + field_info.field_id = str_fid.get(); + field_info.row_count = test_data_count; + + for (int i = 0; i < 10; ++i) { + auto fdata = + std::make_shared>(DataType::VARCHAR, true); + fdata->FillFieldData( + std::dynamic_pointer_cast(arrow_str)); + field_info.channel->push(fdata); + } + field_info.channel->close(); + + bool is_mmap = std::get<1>(GetParam()); + if (is_mmap) { + auto temp_dir = boost::filesystem::temp_directory_path() / + boost::filesystem::unique_path() / + fmt::format("{}", str_size); + boost::filesystem::create_directories(temp_dir); + field_info.mmap_dir_path = temp_dir.native(); + segment->MapFieldData(str_fid, field_info); + } else { + segment->LoadFieldData(str_fid, field_info); + } + + std::vector segment_offsets; + for (int i = 0; i < 100; i++) { + segment_offsets.push_back(is_mmap ? i * 256 + 128 : i * 32 + 16); + } + + std::chrono::high_resolution_clock::time_point start = + std::chrono::high_resolution_clock::now(); + segment->bulk_subscript( + str_fid, segment_offsets.data(), segment_offsets.size()); + std::chrono::high_resolution_clock::time_point end = + std::chrono::high_resolution_clock::now(); + std::cout << fmt::format( + "str size: {}, time cost: {}, is mmap: {}, segment off: " + "{}, copy " + "duration: {}", + str_size, + std::chrono::duration_cast( + end - start) + .count(), + is_mmap, + segment_offsets.size(), + COPY_STR_D) + << std::endl; + COPY_STR_D = 0; +} \ No newline at end of file