diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 9953c9aecf0ae..0335ea16871ec 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -307,7 +307,8 @@ ${CMAKE_EXTRA_ARGS} \ -DUSE_DYNAMIC_SIMD=${USE_DYNAMIC_SIMD} \ -DCPU_ARCH=${CPU_ARCH} \ -DINDEX_ENGINE=${INDEX_ENGINE} \ - -DENABLE_GCP_NATIVE=${ENABLE_GCP_NATIVE} " + -DENABLE_GCP_NATIVE=${ENABLE_GCP_NATIVE} \ + -DENABLE_AZURE_FS=${ENABLE_AZURE_FS} " if [ -z "$BUILD_WITHOUT_AZURE" ]; then CMAKE_CMD=${CMAKE_CMD}"-DAZURE_BUILD_DIR=${AZURE_BUILD_DIR} \ -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} " diff --git a/go.mod b/go.mod index 00fb58580d2b5..cb272a6141a9d 100644 --- a/go.mod +++ b/go.mod @@ -18,7 +18,7 @@ require ( github.com/gin-gonic/gin v1.9.1 github.com/go-playground/validator/v10 v10.14.0 github.com/gofrs/flock v0.8.1 - github.com/golang/protobuf v1.5.4 // indirect + github.com/golang/protobuf v1.5.4 github.com/google/btree v1.1.2 github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 github.com/klauspost/compress v1.17.9 @@ -97,9 +97,9 @@ require ( github.com/Knetic/govaluate v3.0.1-0.20171022003610-9aa49832a739+incompatible // indirect github.com/alibabacloud-go/debug v0.0.0-20190504072949-9472017b5c68 // indirect github.com/alibabacloud-go/tea v1.1.8 // indirect - github.com/andybalholm/brotli v1.0.4 // indirect + github.com/andybalholm/brotli v1.1.0 // indirect github.com/apache/pulsar-client-go v0.6.1-0.20210728062540-29414db801a7 // indirect - github.com/apache/thrift v0.18.1 // indirect + github.com/apache/thrift v0.19.0 // indirect github.com/ardielle/ardielle-go v1.5.2 // indirect github.com/benesch/cgosymbolizer v0.0.0-20190515212042-bec6fe6e597b // indirect github.com/beorn7/perks v1.0.1 // indirect @@ -143,7 +143,7 @@ require ( github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/golang/mock v1.6.0 // indirect github.com/golang/snappy v0.0.4 // indirect - github.com/google/flatbuffers v2.0.8+incompatible // indirect + github.com/google/flatbuffers 
v24.3.25+incompatible // indirect github.com/google/s2a-go v0.1.7 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect github.com/googleapis/gax-go/v2 v2.12.5 // indirect @@ -190,7 +190,7 @@ require ( github.com/pelletier/go-toml/v2 v2.0.8 // indirect github.com/petermattis/goid v0.0.0-20180202154549-b0b1615b78e5 // indirect github.com/pierrec/lz4 v2.5.2+incompatible // indirect - github.com/pierrec/lz4/v4 v4.1.18 // indirect + github.com/pierrec/lz4/v4 v4.1.21 // indirect github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c // indirect github.com/pingcap/failpoint v0.0.0-20210918120811-547c13e3eb00 // indirect github.com/pingcap/goleveldb v0.0.0-20191226122134-f82aafb29989 // indirect @@ -245,13 +245,13 @@ require ( go.opentelemetry.io/proto/otlp v1.0.0 // indirect go.uber.org/automaxprocs v1.5.3 // indirect golang.org/x/arch v0.3.0 // indirect - golang.org/x/mod v0.17.0 // indirect + golang.org/x/mod v0.18.0 // indirect golang.org/x/sys v0.28.0 // indirect golang.org/x/term v0.27.0 // indirect golang.org/x/time v0.5.0 // indirect - golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d // indirect - golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect - gonum.org/v1/gonum v0.11.0 // indirect + golang.org/x/tools v0.22.0 // indirect + golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 // indirect + gonum.org/v1/gonum v0.14.0 // indirect google.golang.org/genproto v0.0.0-20240624140628-dc46fd24d27d // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240617180043-68d350f18fd4 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240730163845-b1a4ccb954bf // indirect diff --git a/go.sum b/go.sum index fc960fb1a7152..7aa48e29614df 100644 --- a/go.sum +++ b/go.sum @@ -104,14 +104,14 @@ github.com/alibabacloud-go/tea v1.1.8 h1:vFF0707fqjGiQTxrtMnIXRjOCvQXf49CuDVRtTo github.com/alibabacloud-go/tea v1.1.8/go.mod h1:/tmnEaQMyb4Ky1/5D+SE1BAsa5zj/KeGOFfwYm3N/p4= github.com/aliyun/credentials-go 
v1.2.7 h1:gLtFylxLZ1TWi1pStIt1O6a53GFU1zkNwjtJir2B4ow= github.com/aliyun/credentials-go v1.2.7/go.mod h1:/KowD1cfGSLrLsH28Jr8W+xwoId0ywIy5lNzDz6O1vw= -github.com/andybalholm/brotli v1.0.4 h1:V7DdXeJtZscaqfNuAdSRuRFzuiKlHSC/Zh3zl9qY3JY= -github.com/andybalholm/brotli v1.0.4/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig= +github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= +github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= github.com/antihax/optional v0.0.0-20180407024304-ca021399b1a6/go.mod h1:V8iCPQYkqmusNa815XgQio277wI47sdRh1dUOLdyC6Q= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYWrPrQ= github.com/antlr4-go/antlr/v4 v4.13.1/go.mod h1:GKmUxMtwp6ZgGwZSva4eWPC5mS6vUAmOABFgjdkM7Nw= -github.com/apache/thrift v0.18.1 h1:lNhK/1nqjbwbiOPDBPFJVKxgDEGSepKuTh6OLiXW8kg= -github.com/apache/thrift v0.18.1/go.mod h1:rdQn/dCcDKEWjjylUeueum4vQEjG2v8v2PqriUnbr+I= +github.com/apache/thrift v0.19.0 h1:sOqkWPzMj7w6XaYbJQG7m4sGqVolaW/0D28Ln7yPzMk= +github.com/apache/thrift v0.19.0/go.mod h1:SUALL216IiaOw2Oy+5Vs9lboJ/t9g40C+G07Dc0QC1I= github.com/ardielle/ardielle-go v1.5.2 h1:TilHTpHIQJ27R1Tl/iITBzMwiUGSlVfiVhwDNGM3Zj4= github.com/ardielle/ardielle-go v1.5.2/go.mod h1:I4hy1n795cUhaVt/ojz83SNVCYIGsAFAONtv2Dr7HUI= github.com/ardielle/ardielle-tools v1.5.4/go.mod h1:oZN+JRMnqGiIhrzkRN9l26Cej9dEx4jeNG6A+AdkShk= @@ -397,8 +397,8 @@ github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ github.com/google/btree v1.0.1/go.mod h1:xXMiIv4Fb/0kKde4SpL7qlzvu5cMJDRkFDxJfI9uaxA= github.com/google/btree v1.1.2 h1:xf4v41cLI2Z6FxbKm+8Bu+m8ifhj15JuZ9sa0jZCMUU= github.com/google/btree v1.1.2/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= -github.com/google/flatbuffers v2.0.8+incompatible h1:ivUb1cGomAB101ZM1T0nOiWz9pSrTMoa9+EiY7igmkM= -github.com/google/flatbuffers 
v2.0.8+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= +github.com/google/flatbuffers v24.3.25+incompatible h1:CX395cjN9Kke9mmalRoL3d81AtFUxJM+yDthflgJGkI= +github.com/google/flatbuffers v24.3.25+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.2.1-0.20190312032427-6f77996f0c42/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= @@ -712,8 +712,8 @@ github.com/phpdave11/gofpdf v1.4.2/go.mod h1:zpO6xFn9yxo3YLyMvW8HcKWVdbNqgIfOOp2 github.com/phpdave11/gofpdi v1.0.12/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk/7bXwjDoI= github.com/pierrec/lz4 v2.5.2+incompatible h1:WCjObylUIOlKy/+7Abdn34TLIkXiA4UWUMhxq9m9ZXI= github.com/pierrec/lz4 v2.5.2+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY= -github.com/pierrec/lz4/v4 v4.1.18 h1:xaKrnTkyoqfh1YItXl56+6KJNVYWlEEPuAQW9xsplYQ= -github.com/pierrec/lz4/v4 v4.1.18/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= +github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= +github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= github.com/pingcap/errors v0.11.0/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8= github.com/pingcap/errors v0.11.4/go.mod h1:Oi8TUi2kEtXXLMJk9l1cGmz20kV3TaQ0usTwv5KuLY8= github.com/pingcap/errors v0.11.5-0.20211224045212-9687c2b0f87c h1:xpW9bvK+HuuTmyFqUwr+jcCvpVkK7sumiz+ko5H9eq4= @@ -1092,8 +1092,8 @@ golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.1/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.17.0 
h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA= -golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.18.0 h1:5+9lSbEzPSdWkH32vYPBwEpX8KwDbM52Ud9xBUvNlb0= +golang.org/x/mod v0.18.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -1337,19 +1337,19 @@ golang.org/x/tools v0.1.1/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.2/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.3/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= -golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/tools v0.22.0 h1:gqSGLZqv+AI9lIQzniJ0nZDRG5GBPsSi+DRNHWNz6yA= +golang.org/x/tools v0.22.0/go.mod h1:aCwcsjqvq7Yqt6TNyX7QMU2enbQ/Gt0bo6krSeEri+c= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 h1:H2TDz8ibqkAF6YGhCdN3jS9O0/s90v0rJh3X/OLHEUk= -golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8= 
+golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028 h1:+cNy6SZtPcJQH3LJVLOSmiC7MMxXNOb3PU/VUEz+EhU= +golang.org/x/xerrors v0.0.0-20231012003039-104605ab7028/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= gonum.org/v1/gonum v0.9.3/go.mod h1:TZumC3NeyVQskjXqmyWt4S3bINhy7B4eYwW69EbyX+0= -gonum.org/v1/gonum v0.11.0 h1:f1IJhK4Km5tBJmaiJXtk/PkL4cdVX6J+tGiM187uT5E= -gonum.org/v1/gonum v0.11.0/go.mod h1:fSG4YDCxxUZQJ7rKsQrj0gMOg00Il0Z96/qMA4bVQhA= +gonum.org/v1/gonum v0.14.0 h1:2NiG67LD1tEH0D7kM+ps2V+fXmsAnpUeec7n8tcr4S0= +gonum.org/v1/gonum v0.14.0/go.mod h1:AoWeoz0becf9QMWtE8iWXNXc27fK4fNeHNf/oMejGfU= gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= gonum.org/v1/plot v0.9.0/go.mod h1:3Pcqqmp6RHvJI72kgb8fThyUnav364FOsdDo2aGW5lY= diff --git a/internal/core/conanfile.py b/internal/core/conanfile.py index 9978fe53f0c32..89d7c0592151e 100644 --- a/internal/core/conanfile.py +++ b/internal/core/conanfile.py @@ -13,7 +13,7 @@ class MilvusConan(ConanFile): "lz4/1.9.4#c5afb86edd69ac0df30e3a9e192e43db", "snappy/1.1.9#0519333fef284acd04806243de7d3070", "lzo/2.10#9517fc1bcc4d4cc229a79806003a1baa", - "arrow/15.0.0#0456d916ff25d509e0724c5b219b4c45", + "arrow/17.0.0#8cea917a6e06ca17c28411966d6fcdd7", "openssl/3.1.2#02594c4c0a6e2b4feb3cd15119993597", "aws-sdk-cpp/1.9.234#28d6d2c175975900ce292bafe8022c88", "googleapis/cci.20221108#65604e1b3b9a6b363044da625b201a2a", diff --git a/internal/core/src/CMakeLists.txt b/internal/core/src/CMakeLists.txt index 0c17d074bd224..1b1baa28b235b 100644 --- a/internal/core/src/CMakeLists.txt +++ b/internal/core/src/CMakeLists.txt @@ -32,6 +32,7 @@ include_directories( 
${SIMDJSON_INCLUDE_DIR} ${TANTIVY_INCLUDE_DIR} ${CONAN_INCLUDE_DIRS} + ${MILVUS_STORAGE_INCLUDE_DIR} ) add_subdirectory( pb ) @@ -73,6 +74,7 @@ set(LINK_TARGETS simdjson tantivy_binding knowhere + milvus-storage ${OpenMP_CXX_FLAGS} ${CONAN_LIBS}) diff --git a/internal/core/src/segcore/arrow/c/abi.h b/internal/core/src/segcore/arrow/c/abi.h new file mode 100644 index 0000000000000..dc488ac4ecb5c --- /dev/null +++ b/internal/core/src/segcore/arrow/c/abi.h @@ -0,0 +1,204 @@ + +#pragma once + +#include <stdint.h> + +// Spec and documentation: https://arrow.apache.org/docs/format/CDataInterface.html + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef ARROW_C_DATA_INTERFACE +#define ARROW_C_DATA_INTERFACE + +#define ARROW_FLAG_DICTIONARY_ORDERED 1 +#define ARROW_FLAG_NULLABLE 2 +#define ARROW_FLAG_MAP_KEYS_SORTED 4 + +struct ArrowSchema { + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; +}; + +struct ArrowArray { + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_DATA_INTERFACE + +#ifndef ARROW_C_DEVICE_DATA_INTERFACE +#define ARROW_C_DEVICE_DATA_INTERFACE + +// Spec and Documentation: https://arrow.apache.org/docs/format/CDeviceDataInterface.html + +// DeviceType for the allocated memory +typedef int32_t ArrowDeviceType; + +// CPU device, same as using ArrowArray directly +#define ARROW_DEVICE_CPU 1 +// CUDA GPU Device +#define ARROW_DEVICE_CUDA 2 +// Pinned CUDA CPU memory by cudaMallocHost 
+#define ARROW_DEVICE_CUDA_HOST 3 +// OpenCL Device +#define ARROW_DEVICE_OPENCL 4 +// Vulkan buffer for next-gen graphics +#define ARROW_DEVICE_VULKAN 7 +// Metal for Apple GPU +#define ARROW_DEVICE_METAL 8 +// Verilog simulator buffer +#define ARROW_DEVICE_VPI 9 +// ROCm GPUs for AMD GPUs +#define ARROW_DEVICE_ROCM 10 +// Pinned ROCm CPU memory allocated by hipMallocHost +#define ARROW_DEVICE_ROCM_HOST 11 +// Reserved for extension +#define ARROW_DEVICE_EXT_DEV 12 +// CUDA managed/unified memory allocated by cudaMallocManaged +#define ARROW_DEVICE_CUDA_MANAGED 13 +// unified shared memory allocated on a oneAPI non-partitioned device. +#define ARROW_DEVICE_ONEAPI 14 +// GPU support for next-gen WebGPU standard +#define ARROW_DEVICE_WEBGPU 15 +// Qualcomm Hexagon DSP +#define ARROW_DEVICE_HEXAGON 16 + +struct ArrowDeviceArray { + // the Allocated Array + // + // the buffers in the array (along with the buffers of any + // children) are what is allocated on the device. + struct ArrowArray array; + // The device id to identify a specific device + int64_t device_id; + // The type of device which can access this memory. + ArrowDeviceType device_type; + // An event-like object to synchronize on if needed. + void* sync_event; + // Reserved bytes for future expansion. + int64_t reserved[3]; +}; + +#endif // ARROW_C_DEVICE_DATA_INTERFACE + +#ifndef ARROW_C_STREAM_INTERFACE +#define ARROW_C_STREAM_INTERFACE + +struct ArrowArrayStream { + // Callback to get the stream type + // (will be the same for all arrays in the stream). + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowSchema must be released independently from the stream. + int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out); + + // Callback to get the next array + // (if no error and the array is released, the stream has ended) + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. 
+ // + // If successful, the ArrowArray must be released independently from the stream. + int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out); + + // Callback to get optional detailed error information. + // This must only be called if the last stream operation failed + // with a non-0 return code. + // + // Return value: pointer to a null-terminated character array describing + // the last error, or NULL if no description is available. + // + // The returned pointer is only valid until the next operation on this stream + // (including release). + const char* (*get_last_error)(struct ArrowArrayStream*); + + // Release callback: release the stream's own resources. + // Note that arrays returned by `get_next` must be individually released. + void (*release)(struct ArrowArrayStream*); + + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_STREAM_INTERFACE + +#ifndef ARROW_C_DEVICE_STREAM_INTERFACE +#define ARROW_C_DEVICE_STREAM_INTERFACE + +// Equivalent to ArrowArrayStream, but for ArrowDeviceArrays. +// +// This stream is intended to provide a stream of data on a single +// device, if a producer wants data to be produced on multiple devices +// then multiple streams should be provided. One per device. +struct ArrowDeviceArrayStream { + // The device that this stream produces data on. + ArrowDeviceType device_type; + + // Callback to get the stream schema + // (will be the same for all arrays in the stream). + // + // Return value 0 if successful, an `errno`-compatible error code otherwise. + // + // If successful, the ArrowSchema must be released independently from the stream. + // The schema should be accessible via CPU memory. + int (*get_schema)(struct ArrowDeviceArrayStream* self, + struct ArrowSchema* out); + + // Callback to get the next array + // (if no error and the array is released, the stream has ended) + // + // Return value: 0 if successful, an `errno`-compatible error code otherwise. 
+ // + // If successful, the ArrowDeviceArray must be released independently from the stream. + int (*get_next)(struct ArrowDeviceArrayStream* self, + struct ArrowDeviceArray* out); + + // Callback to get optional detailed error information. + // This must only be called if the last stream operation failed + // with a non-0 return code. + // + // Return value: pointer to a null-terminated character array describing + // the last error, or NULL if no description is available. + // + // The returned pointer is only valid until the next operation on this stream + // (including release). + const char* (*get_last_error)(struct ArrowDeviceArrayStream* self); + + // Release callback: release the stream's own resources. + // Note that arrays returned by `get_next` must be individually released. + void (*release)(struct ArrowDeviceArrayStream* self); + + // Opaque producer-specific data + void* private_data; +}; + +#endif // ARROW_C_DEVICE_STREAM_INTERFACE + +#ifdef __cplusplus +} +#endif diff --git a/internal/core/src/segcore/arrow/c/helpers.h b/internal/core/src/segcore/arrow/c/helpers.h new file mode 100644 index 0000000000000..a1c50f5216424 --- /dev/null +++ b/internal/core/src/segcore/arrow/c/helpers.h @@ -0,0 +1,187 @@ + +#pragma once + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "arrow/c/abi.h" + +#define ARROW_C_ASSERT(condition, msg) \ + do { \ + if (!(condition)) { \ + fprintf(stderr, "%s:%d:: %s", __FILE__, __LINE__, (msg)); \ + abort(); \ + } \ + } while (0) + +#ifdef __cplusplus +extern "C" { +#endif + +/// Query whether the C schema is released +inline int +ArrowSchemaIsReleased(const struct ArrowSchema* schema) { + return schema->release == NULL; +} + +/// Mark the C schema released (for use in release callbacks) +inline void +ArrowSchemaMarkReleased(struct ArrowSchema* schema) { + schema->release = NULL; +} + +/// Move the C schema from `src` to `dest` +/// +/// Note `dest` must *not* point to a valid schema 
already, otherwise there +/// will be a memory leak. +inline void +ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dest) { + assert(dest != src); + assert(!ArrowSchemaIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowSchema)); + ArrowSchemaMarkReleased(src); +} + +/// Release the C schema, if necessary, by calling its release callback +inline void +ArrowSchemaRelease(struct ArrowSchema* schema) { + if (!ArrowSchemaIsReleased(schema)) { + schema->release(schema); + ARROW_C_ASSERT(ArrowSchemaIsReleased(schema), + "ArrowSchemaRelease did not cleanup release callback"); + } +} + +/// Query whether the C array is released +inline int +ArrowArrayIsReleased(const struct ArrowArray* array) { + return array->release == NULL; +} + +inline int +ArrowDeviceArrayIsReleased(const struct ArrowDeviceArray* array) { + return ArrowArrayIsReleased(&array->array); +} + +/// Mark the C array released (for use in release callbacks) +inline void +ArrowArrayMarkReleased(struct ArrowArray* array) { + array->release = NULL; +} + +inline void +ArrowDeviceArrayMarkReleased(struct ArrowDeviceArray* array) { + ArrowArrayMarkReleased(&array->array); +} + +/// Move the C array from `src` to `dest` +/// +/// Note `dest` must *not* point to a valid array already, otherwise there +/// will be a memory leak. 
+inline void +ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dest) { + assert(dest != src); + assert(!ArrowArrayIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowArray)); + ArrowArrayMarkReleased(src); +} + +inline void +ArrowDeviceArrayMove(struct ArrowDeviceArray* src, + struct ArrowDeviceArray* dest) { + assert(dest != src); + assert(!ArrowDeviceArrayIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowDeviceArray)); + ArrowDeviceArrayMarkReleased(src); +} + +/// Release the C array, if necessary, by calling its release callback +inline void +ArrowArrayRelease(struct ArrowArray* array) { + if (!ArrowArrayIsReleased(array)) { + array->release(array); + ARROW_C_ASSERT(ArrowArrayIsReleased(array), + "ArrowArrayRelease did not cleanup release callback"); + } +} + +inline void +ArrowDeviceArrayRelease(struct ArrowDeviceArray* array) { + if (!ArrowDeviceArrayIsReleased(array)) { + array->array.release(&array->array); + ARROW_C_ASSERT( + ArrowDeviceArrayIsReleased(array), + "ArrowDeviceArrayRelease did not cleanup release callback"); + } +} + +/// Query whether the C array stream is released +inline int +ArrowArrayStreamIsReleased(const struct ArrowArrayStream* stream) { + return stream->release == NULL; +} + +inline int +ArrowDeviceArrayStreamIsReleased(const struct ArrowDeviceArrayStream* stream) { + return stream->release == NULL; +} + +/// Mark the C array stream released (for use in release callbacks) +inline void +ArrowArrayStreamMarkReleased(struct ArrowArrayStream* stream) { + stream->release = NULL; +} + +inline void +ArrowDeviceArrayStreamMarkReleased(struct ArrowDeviceArrayStream* stream) { + stream->release = NULL; +} + +/// Move the C array stream from `src` to `dest` +/// +/// Note `dest` must *not* point to a valid stream already, otherwise there +/// will be a memory leak. 
+inline void
+ArrowArrayStreamMove(struct ArrowArrayStream* src,
+                     struct ArrowArrayStream* dest) {
+    assert(dest != src);
+    assert(!ArrowArrayStreamIsReleased(src));
+    memcpy(dest, src, sizeof(struct ArrowArrayStream));
+    ArrowArrayStreamMarkReleased(src);
+}
+
+inline void
+ArrowDeviceArrayStreamMove(struct ArrowDeviceArrayStream* src,
+                           struct ArrowDeviceArrayStream* dest) {
+    assert(dest != src);
+    assert(!ArrowDeviceArrayStreamIsReleased(src));
+    memcpy(dest, src, sizeof(struct ArrowDeviceArrayStream));
+    ArrowDeviceArrayStreamMarkReleased(src);
+}
+
+/// Release the C array stream, if necessary, by calling its release callback
+inline void
+ArrowArrayStreamRelease(struct ArrowArrayStream* stream) {
+    if (!ArrowArrayStreamIsReleased(stream)) {
+        stream->release(stream);
+        ARROW_C_ASSERT(
+            ArrowArrayStreamIsReleased(stream),
+            "ArrowArrayStreamRelease did not cleanup release callback");
+    }
+}
+
+inline void
+ArrowDeviceArrayStreamRelease(struct ArrowDeviceArrayStream* stream) {
+    if (!ArrowDeviceArrayStreamIsReleased(stream)) {
+        stream->release(stream);
+        ARROW_C_ASSERT(
+            ArrowDeviceArrayStreamIsReleased(stream),
+            "ArrowDeviceArrayStreamRelease did not cleanup release callback");
+    }
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/internal/core/src/segcore/packed_reader_c.cpp b/internal/core/src/segcore/packed_reader_c.cpp
new file mode 100644
index 0000000000000..56aaf0e1977c5
--- /dev/null
+++ b/internal/core/src/segcore/packed_reader_c.cpp
@@ -0,0 +1,142 @@
+// Copyright 2023 Zilliz
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "segcore/packed_reader_c.h"
+#include "milvus-storage/packed/reader.h"
+#include "milvus-storage/common/log.h"
+#include "milvus-storage/filesystem/fs.h"
+#include "milvus-storage/common/config.h"
+
+#include <arrow/c/bridge.h>
+#include <arrow/filesystem/filesystem.h>
+#include <arrow/status.h>
+#include <exception>
+#include <memory>
+#include <set>
+#include <string>
+
+// Opens a packed reader over `path` that reads every column of `schema`.
+//
+// `schema` is consumed by arrow::ImportSchema. On success the new reader is
+// stored in `*c_packed_reader` and must later be passed to CloseReader.
+//
+// Returns 0 on success and -1 on any failure; `*c_packed_reader` is only
+// written on success.
+int
+NewPackedReader(const char* path,
+                struct ArrowSchema* schema,
+                const int64_t buffer_size,
+                CPackedReader* c_packed_reader) {
+    // Reject null arguments up front: std::string(nullptr) and
+    // ImportSchema(nullptr) are not recoverable through the catch below.
+    if (path == nullptr || schema == nullptr || c_packed_reader == nullptr) {
+        return -1;
+    }
+    try {
+        auto truePath = std::string(path);
+        auto factory = std::make_shared<milvus_storage::FileSystemFactory>();
+        auto conf = milvus_storage::StorageConfig();
+        conf.uri = "file:///tmp/";
+        auto trueFs = factory->BuildFileSystem(conf, &truePath).value();
+        // Check the Result explicitly: ValueOrDie() on a failed Result aborts
+        // the process instead of throwing, so the catch blocks would not help.
+        auto schema_result = arrow::ImportSchema(schema);
+        if (!schema_result.ok()) {
+            return -1;
+        }
+        auto trueSchema = schema_result.ValueOrDie();
+        std::set<int> needed_columns;
+        for (int i = 0; i < trueSchema->num_fields(); i++) {
+            needed_columns.emplace(i);
+        }
+        // NOTE(review): the original `path` is passed here while
+        // NewPackedWriter passes `truePath`; confirm which one the reader
+        // expects.
+        auto reader = std::make_unique<milvus_storage::PackedRecordBatchReader>(
+            *trueFs, path, trueSchema, needed_columns, buffer_size);
+        *c_packed_reader = reader.release();
+        return 0;
+    } catch (const std::exception&) {
+        return -1;
+    } catch (...) {
+        // Never let a C++ exception cross the extern "C" boundary.
+        return -1;
+    }
+}
+
+// Reads the next record batch and exports it through the Arrow C data
+// interface.
+//
+// When a batch is available, `*out_array` / `*out_schema` receive newly
+// allocated ArrowArray / ArrowSchema structs holding the exported batch.
+// At end of data the function returns 0 with both outputs set to null; on
+// failure it returns -1 and no batch is handed to the caller.
+int
+ReadNext(CPackedReader c_packed_reader,
+         CArrowArray* out_array,
+         CArrowSchema* out_schema) {
+    if (c_packed_reader == nullptr || out_array == nullptr ||
+        out_schema == nullptr) {
+        return -1;
+    }
+    // Start from a well-defined "no batch" state so that end-of-data and
+    // failures are observable even if the caller did not pre-initialize.
+    *out_array = nullptr;
+    *out_schema = nullptr;
+    try {
+        auto packed_reader =
+            static_cast<milvus_storage::PackedRecordBatchReader*>(
+                c_packed_reader);
+        std::shared_ptr<arrow::RecordBatch> record_batch;
+        auto read_status = packed_reader->ReadNext(&record_batch);
+        if (!read_status.ok()) {
+            return -1;
+        }
+        if (record_batch == nullptr) {
+            // end of file
+            return 0;
+        }
+        auto arr = std::make_unique<ArrowArray>();
+        auto schema = std::make_unique<ArrowSchema>();
+        auto export_status = arrow::ExportRecordBatch(
+            *record_batch, arr.get(), schema.get());
+        if (!export_status.ok()) {
+            return -1;
+        }
+        *out_array = arr.release();
+        *out_schema = schema.release();
+        return 0;
+    } catch (const std::exception&) {
+        return -1;
+    } catch (...) {
+        return -1;
+    }
+}
+
+// Destroys a reader created by NewPackedReader. A null handle is accepted
+// (deleting a null pointer is a no-op). Returns 0 on success, -1 otherwise.
+int
+CloseReader(CPackedReader c_packed_reader) {
+    try {
+        auto packed_reader =
+            static_cast<milvus_storage::PackedRecordBatchReader*>(
+                c_packed_reader);
+        delete packed_reader;
+        return 0;
+    } catch (const std::exception&) {
+        return -1;
+    } catch (...) {
+        return -1;
+    }
+}
\ No newline at end of file
diff --git a/internal/core/src/segcore/packed_reader_c.h b/internal/core/src/segcore/packed_reader_c.h
new file mode 100644
index 0000000000000..7a5c90cf16e3c
--- /dev/null
+++ b/internal/core/src/segcore/packed_reader_c.h
@@ -0,0 +1,64 @@
+// Copyright 2023 Zilliz
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <arrow/c/abi.h>
+
+typedef void* CPackedReader;
+typedef void* CArrowArray;
+typedef void* CArrowSchema;
+
+/**
+ * @brief Open a packed reader to read needed columns in the specified path.
+ *
+ * @param path The root path of the packed files to read.
+ * @param schema The original schema of data.
+ * @param buffer_size The max buffer size of the packed reader.
+ * @param c_packed_reader The output pointer of the packed reader.
+ */
+int
+NewPackedReader(const char* path,
+                struct ArrowSchema* schema,
+                const int64_t buffer_size,
+                CPackedReader* c_packed_reader);
+
+/**
+ * @brief Read the next record batch from the packed reader.
+ * By default, the maximum return batch is 1024 rows.
+ *
+ * @param c_packed_reader The packed reader to read.
+ * @param out_array The output pointer of the arrow array.
+ * @param out_schema The output pointer of the arrow schema.
+ */
+int
+ReadNext(CPackedReader c_packed_reader,
+         CArrowArray* out_array,
+         CArrowSchema* out_schema);
+
+/**
+ * @brief Close the packed reader and release the resources.
+ *
+ * @param c_packed_reader The packed reader to close.
+ */
+int
+CloseReader(CPackedReader c_packed_reader);
+
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
diff --git a/internal/core/src/segcore/packed_writer_c.cpp b/internal/core/src/segcore/packed_writer_c.cpp
new file mode 100644
index 0000000000000..613e21d78013a
--- /dev/null
+++ b/internal/core/src/segcore/packed_writer_c.cpp
@@ -0,0 +1,129 @@
+// Copyright 2023 Zilliz
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "segcore/packed_writer_c.h"
+#include "milvus-storage/packed/writer.h"
+#include "milvus-storage/common/log.h"
+#include "milvus-storage/common/config.h"
+#include "milvus-storage/filesystem/fs.h"
+
+#include <arrow/c/bridge.h>
+#include <arrow/filesystem/filesystem.h>
+#include <exception>
+#include <memory>
+#include <string>
+
+// Creates a packed writer that writes record batches of `schema` under `path`.
+//
+// `schema` is consumed by arrow::ImportSchema. On success the new writer is
+// stored in `*c_packed_writer` and must later be passed to CloseWriter.
+//
+// Returns 0 on success and -1 on any failure; `*c_packed_writer` is only
+// written on success.
+int
+NewPackedWriter(const char* path,
+                struct ArrowSchema* schema,
+                const int64_t buffer_size,
+                CPackedWriter* c_packed_writer) {
+    // Reject null arguments up front: std::string(nullptr) and
+    // ImportSchema(nullptr) are not recoverable through the catch below.
+    if (path == nullptr || schema == nullptr || c_packed_writer == nullptr) {
+        return -1;
+    }
+    try {
+        auto truePath = std::string(path);
+        auto factory = std::make_shared<milvus_storage::FileSystemFactory>();
+        auto conf = milvus_storage::StorageConfig();
+        conf.uri = "file:///tmp/";
+        auto trueFs = factory->BuildFileSystem(conf, &truePath).value();
+        // Check the Result explicitly: ValueOrDie() on a failed Result aborts
+        // the process instead of throwing, so the catch blocks would not help.
+        auto schema_result = arrow::ImportSchema(schema);
+        if (!schema_result.ok()) {
+            return -1;
+        }
+        auto trueSchema = schema_result.ValueOrDie();
+        auto writer = std::make_unique<milvus_storage::PackedRecordBatchWriter>(
+            buffer_size, trueSchema, trueFs, truePath, conf);
+
+        *c_packed_writer = writer.release();
+        return 0;
+    } catch (const std::exception&) {
+        return -1;
+    } catch (...) {
+        // Never let a C++ exception cross the extern "C" boundary.
+        return -1;
+    }
+}
+
+// Imports the batch described by `array` / `schema` and appends it to the
+// writer.
+//
+// Returns 0 on success and -1 if an argument is null, the import fails, or
+// the write fails.
+int
+WriteRecordBatch(CPackedWriter c_packed_writer,
+                 struct ArrowArray* array,
+                 struct ArrowSchema* schema) {
+    if (c_packed_writer == nullptr || array == nullptr || schema == nullptr) {
+        return -1;
+    }
+    try {
+        auto packed_writer =
+            static_cast<milvus_storage::PackedRecordBatchWriter*>(
+                c_packed_writer);
+        // As above: test the Result instead of calling ValueOrDie() blindly,
+        // which would abort the process on a malformed batch.
+        auto batch_result = arrow::ImportRecordBatch(array, schema);
+        if (!batch_result.ok()) {
+            return -1;
+        }
+        auto record_batch = batch_result.ValueOrDie();
+        auto status = packed_writer->Write(record_batch);
+        if (!status.ok()) {
+            return -1;
+        }
+        return 0;
+    } catch (const std::exception&) {
+        return -1;
+    } catch (...) {
+        return -1;
+    }
+}
+
+// Closes and destroys a writer created by NewPackedWriter.
+//
+// The writer is destroyed on every path, including when Close() reports an
+// error or throws. Returns 0 on success and -1 on a null handle or failure.
+int
+CloseWriter(CPackedWriter c_packed_writer) {
+    if (c_packed_writer == nullptr) {
+        return -1;
+    }
+    try {
+        // Owning the handle here guarantees the delete even if Close() throws.
+        std::unique_ptr<milvus_storage::PackedRecordBatchWriter> packed_writer(
+            static_cast<milvus_storage::PackedRecordBatchWriter*>(
+                c_packed_writer));
+        auto status = packed_writer->Close();
+        if (!status.ok()) {
+            return -1;
+        }
+        return 0;
+    } catch (const std::exception&) {
+        return -1;
+    } catch (...) {
+        return -1;
+    }
+}
\ No newline at end of file
diff --git a/internal/core/src/segcore/packed_writer_c.h b/internal/core/src/segcore/packed_writer_c.h
new file mode 100644
index 0000000000000..207aba502d468
--- /dev/null
+++
#pragma once

#ifdef __cplusplus
extern "C" {
#endif

#include <arrow/c/abi.h>

/**
 * @brief Opaque handle to a packed writer created by NewPackedWriter.
 */
typedef void* CPackedWriter;

/**
 * @brief Create a packed writer and return it through an opaque handle.
 *
 * @param path The path passed to the writer and the filesystem factory.
 * @param schema The arrow schema of the record batches to be written.
 * @param buffer_size The buffer size passed to the writer.
 * @param c_packed_writer The output pointer receiving the writer handle.
 * @return 0 on success, -1 if an exception was caught while creating it.
 */
int
NewPackedWriter(const char* path,
                struct ArrowSchema* schema,
                const int64_t buffer_size,
                CPackedWriter* c_packed_writer);

/**
 * @brief Import a record batch from the arrow C data interface and write it.
 *
 * @param c_packed_writer The packed writer to write to.
 * @param array The arrow array holding the record batch data.
 * @param schema The arrow schema describing the array.
 * @return 0 on success, -1 if the write status is not ok or an exception
 *         was caught.
 */
int
WriteRecordBatch(CPackedWriter c_packed_writer,
                 struct ArrowArray* array,
                 struct ArrowSchema* schema);

/**
 * @brief Close the packed writer and delete it.
 *
 * @param c_packed_writer The packed writer to close.
 * @return 0 on success, -1 if the close status is not ok or an exception
 *         was caught.
 */
int
CloseWriter(CPackedWriter c_packed_writer);

#ifdef __cplusplus
}
#endif
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under the License. +#------------------------------------------------------------------------------- + +# Update milvus-storage_VERSION for the first occurrence +milvus_add_pkg_config("milvus-storage") +set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES "") +set( milvus-storage_VERSION cb4fbbc ) +set( GIT_REPOSITORY "https://github.com/shaoting-huang/milvus-storage.git") +message(STATUS "milvus-storage repo: ${GIT_REPOSITORY}") +message(STATUS "milvus-storage version: ${milvus-storage_VERSION}") + +message(STATUS "Building milvus-storage-${milvus-storage_SOURCE_VER} from source") +message(STATUS ${CMAKE_BUILD_TYPE}) + +if ( ENABLE_AZURE_FS STREQUAL "ON" ) + set(WITH_AZURE_FS ON CACHE BOOL "" FORCE ) +else () + set(WITH_AZURE_FS OFF CACHE BOOL "" FORCE ) +endif () + +set( CMAKE_PREFIX_PATH ${CONAN_BOOST_ROOT} ) +FetchContent_Declare( + milvus-storage + GIT_REPOSITORY ${GIT_REPOSITORY} + GIT_TAG ${milvus-storage_VERSION} + SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/milvus-storage-src + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/milvus-storage-build + SOURCE_SUBDIR cpp + DOWNLOAD_DIR ${THIRDPARTY_DOWNLOAD_PATH} ) + +FetchContent_GetProperties( milvus-storage ) +if ( NOT milvus-storage_POPULATED ) + FetchContent_Populate( milvus-storage ) + + # Adding the following target: + # milvus-storage + add_subdirectory( ${milvus-storage_SOURCE_DIR}/cpp + ${milvus-storage_BINARY_DIR} ) +endif() + +set( MILVUS_STORAGE_INCLUDE_DIR 
${milvus-storage_SOURCE_DIR}/cpp/include CACHE INTERNAL "Path to milvus-storage include directory" ) \ No newline at end of file diff --git a/internal/core/thirdparty/milvus-storage/milvus-storage.pc.in b/internal/core/thirdparty/milvus-storage/milvus-storage.pc.in new file mode 100644 index 0000000000000..e72695474cbf8 --- /dev/null +++ b/internal/core/thirdparty/milvus-storage/milvus-storage.pc.in @@ -0,0 +1,9 @@ +libdir=@CMAKE_INSTALL_FULL_LIBDIR@ +includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ + +Name: Milvus Storage +Description: Storage modules for Milvus +Version: @MILVUS_VERSION@ + +Libs: -L${libdir} -lmilvus-storage +Cflags: -I${includedir} diff --git a/internal/core/unittest/CMakeLists.txt b/internal/core/unittest/CMakeLists.txt index 172bc4a05c6c9..4fce8645b02d0 100644 --- a/internal/core/unittest/CMakeLists.txt +++ b/internal/core/unittest/CMakeLists.txt @@ -16,6 +16,7 @@ include_directories( ${SIMDJSON_INCLUDE_DIR} ${TANTIVY_INCLUDE_DIR} ${CONAN_INCLUDE_DIRS} + ${MILVUS_STORAGE_INCLUDE_DIR} ) add_definitions(-DMILVUS_TEST_SEGCORE_YAML_PATH="${CMAKE_SOURCE_DIR}/unittest/test_utils/test_segcore.yaml") diff --git a/internal/storagev2/OWNERS b/internal/storagev2/OWNERS new file mode 100644 index 0000000000000..43bbbe7016716 --- /dev/null +++ b/internal/storagev2/OWNERS @@ -0,0 +1,7 @@ +reviewers: + - tedxu + - shaoting-huang + - sunby + +approvers: + - maintainers \ No newline at end of file diff --git a/internal/storagev2/common/arrow_util/arrow_util.go b/internal/storagev2/common/arrow_util/arrow_util.go new file mode 100644 index 0000000000000..fa0b49d015c0e --- /dev/null +++ b/internal/storagev2/common/arrow_util/arrow_util.go @@ -0,0 +1,79 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package arrow_util + +import ( + "context" + + "github.com/apache/arrow/go/v12/arrow/array" + "github.com/apache/arrow/go/v12/arrow/memory" + "github.com/apache/arrow/go/v12/parquet/file" + "github.com/apache/arrow/go/v12/parquet/pqarrow" + "github.com/milvus-io/milvus/internal/storagev2/common/constant" + "github.com/milvus-io/milvus/internal/storagev2/io/fs" + "github.com/milvus-io/milvus/internal/storagev2/storage/options" +) + +func MakeArrowFileReader(fs fs.Fs, filePath string) (*pqarrow.FileReader, error) { + f, err := fs.OpenFile(filePath) + if err != nil { + return nil, err + } + parquetReader, err := file.NewParquetReader(f) + if err != nil { + return nil, err + } + return pqarrow.NewFileReader(parquetReader, pqarrow.ArrowReadProperties{BatchSize: constant.ReadBatchSize}, memory.DefaultAllocator) +} + +func MakeArrowRecordReader(reader *pqarrow.FileReader, opts *options.ReadOptions) (array.RecordReader, error) { + var rowGroupsIndices []int + var columnIndices []int + metadata := reader.ParquetReader().MetaData() + for _, c := range opts.Columns { + columnIndices = append(columnIndices, metadata.Schema.ColumnIndexByName(c)) + } + for _, f := range opts.Filters { + columnIndices = append(columnIndices, metadata.Schema.ColumnIndexByName(f.GetColumnName())) + } + + for i := 0; i < len(metadata.RowGroups); i++ { + rg := metadata.RowGroup(i) + var canIgnored bool + for _, filter := range opts.Filters { + columnIndex := rg.Schema.ColumnIndexByName(filter.GetColumnName()) + columnChunk, err := rg.ColumnChunk(columnIndex) + if err != nil { + 
return nil, err + } + columnStats, err := columnChunk.Statistics() + if err != nil { + return nil, err + } + if columnStats == nil || !columnStats.HasMinMax() { + continue + } + if filter.CheckStatistics(columnStats) { + canIgnored = true + break + } + } + if !canIgnored { + rowGroupsIndices = append(rowGroupsIndices, i) + } + } + + return reader.GetRecordReader(context.TODO(), columnIndices, rowGroupsIndices) +} diff --git a/internal/storagev2/common/constant/constant.go b/internal/storagev2/common/constant/constant.go new file mode 100644 index 0000000000000..8a72710a95970 --- /dev/null +++ b/internal/storagev2/common/constant/constant.go @@ -0,0 +1,31 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
// Shared constants of the storagev2 package tree.
const (
	// ReadBatchSize is the batch size used when creating arrow file readers.
	ReadBatchSize = 1024
	// ManifestTempFileSuffix is the file name suffix of temporary manifest files.
	ManifestTempFileSuffix = ".manifest.tmp"
	// ManifestFileSuffix is the file name suffix of manifest files.
	ManifestFileSuffix = ".manifest"
	// ManifestDir is the directory name, below the storage root, holding manifests.
	ManifestDir = "versions"
	// BlobDir is the directory name, below the storage root, holding blobs.
	BlobDir = "blobs"
	// ParquetDataFileSuffix is the file name suffix of parquet data files.
	ParquetDataFileSuffix = ".parquet"
	// OffsetFieldName is presumably the name of an internal offset column; verify against its users.
	OffsetFieldName = "__offset"
	// VectorDataDir is the directory name, below the storage root, holding vector data.
	VectorDataDir = "vector"
	// ScalarDataDir is the directory name, below the storage root, holding scalar data.
	ScalarDataDir = "scalar"
	// DeleteDataDir is the directory name, below the storage root, holding delete data.
	DeleteDataDir = "delete"
	// LatestManifestVersion presumably selects the newest manifest version; verify against its users.
	LatestManifestVersion = -1

	// EndpointOverride is presumably the option key carrying a custom endpoint; verify against its users.
	EndpointOverride = "endpoint_override"
)
// Sentinel errors shared by the storagev2 packages. Compare against them
// with errors.Is.
var (
	// ErrSchemaIsNil reports a missing schema.
	ErrSchemaIsNil = errors.New("schema is nil")
	// ErrBlobAlreadyExist reports a blob that is already present.
	ErrBlobAlreadyExist = errors.New("blob already exist")
	// ErrBlobNotExist reports a blob that could not be found.
	ErrBlobNotExist = errors.New("blob not exist")
	// ErrSchemaNotMatch reports data whose schema differs from the expected one.
	ErrSchemaNotMatch = errors.New("schema not match")
	// ErrColumnNotExist reports a column that is not part of the schema.
	ErrColumnNotExist = errors.New("column not exist")
	// ErrInvalidPath reports a malformed path.
	ErrInvalidPath = errors.New("invalid path")
	// ErrNoEndpoint reports a missing endpoint.
	ErrNoEndpoint = errors.New("no endpoint is specified")
)
// Field constructors re-exported from zap, so that callers of this log
// package can build structured fields without importing zap themselves.
var (
	// not lint
	Skip        = zap.Skip
	Binary      = zap.Binary
	Bool        = zap.Bool
	Boolp       = zap.Boolp
	ByteString  = zap.ByteString
	Complex128  = zap.Complex128
	Complex128p = zap.Complex128p
	Complex64   = zap.Complex64
	Complex64p  = zap.Complex64p
	Float64     = zap.Float64
	Float64p    = zap.Float64p
	Float32     = zap.Float32
	Float32p    = zap.Float32p
	Int         = zap.Int
	Intp        = zap.Intp
	Int64       = zap.Int64
	Int64p      = zap.Int64p
	Int32       = zap.Int32
	Int32p      = zap.Int32p
	Int16       = zap.Int16
	Int16p      = zap.Int16p
	Int8        = zap.Int8
	Int8p       = zap.Int8p
	String      = zap.String
	Stringp     = zap.Stringp
	Uint        = zap.Uint
	Uintp       = zap.Uintp
	Uint64      = zap.Uint64
	Uint64p     = zap.Uint64p
	Uint32      = zap.Uint32
	Uint32p     = zap.Uint32p
	Uint16      = zap.Uint16
	Uint16p     = zap.Uint16p
	Uint8       = zap.Uint8
	Uint8p      = zap.Uint8p
	Uintptr     = zap.Uintptr
	Uintptrp    = zap.Uintptrp
	Reflect     = zap.Reflect
	Namespace   = zap.Namespace
	Stringer    = zap.Stringer
	Time        = zap.Time
	Timep       = zap.Timep
	Stack       = zap.Stack
	StackSkip   = zap.StackSkip
	Duration    = zap.Duration
	Durationp   = zap.Durationp
	Object      = zap.Object
	Inline      = zap.Inline
	Any         = zap.Any
)
+ +package log + +import ( + "io" + "os" + + "go.uber.org/zap" + "go.uber.org/zap/zapcore" +) + +type Level = zapcore.Level + +const ( + DebugLevel = zapcore.DebugLevel + InfoLevel = zapcore.InfoLevel + WarnLevel = zapcore.WarnLevel + ErrorLevel = zapcore.ErrorLevel + PanicLevel = zapcore.PanicLevel + FatalLevel = zapcore.FatalLevel +) + +type Logger struct { + l *zap.Logger + al *zap.AtomicLevel +} + +func New(out io.Writer, level Level) *Logger { + if out == nil { + out = os.Stderr + } + + al := zap.NewAtomicLevelAt(level) + cfg := zap.NewDevelopmentEncoderConfig() + + core := zapcore.NewCore( + zapcore.NewConsoleEncoder(cfg), + zapcore.AddSync(out), + al, + ) + return &Logger{l: zap.New(core, zap.AddCaller(), zap.AddCallerSkip(2)), al: &al} +} + +func (l *Logger) SetLevel(level Level) { + if l.al != nil { + l.al.SetLevel(level) + } +} + +type Field = zap.Field + +func (l *Logger) Debug(msg string, fields ...Field) { + l.l.Debug(msg, fields...) +} + +func (l *Logger) Info(msg string, fields ...Field) { + l.l.Info(msg, fields...) +} + +func (l *Logger) Warn(msg string, fields ...Field) { + l.l.Warn(msg, fields...) +} + +func (l *Logger) Error(msg string, fields ...Field) { + l.l.Error(msg, fields...) +} + +func (l *Logger) Panic(msg string, fields ...Field) { + l.l.Panic(msg, fields...) +} + +func (l *Logger) Fatal(msg string, fields ...Field) { + l.l.Fatal(msg, fields...) +} + +func (l *Logger) Sync() error { + return l.l.Sync() +} + +var std = New(os.Stderr, DebugLevel) + +func Default() *Logger { return std } +func ReplaceDefault(l *Logger) { std = l } +func SetLevel(level Level) { std.SetLevel(level) } + +func Debug(msg string, fields ...Field) { std.Debug(msg, fields...) } +func Info(msg string, fields ...Field) { std.Info(msg, fields...) } +func Warn(msg string, fields ...Field) { std.Warn(msg, fields...) } +func Error(msg string, fields ...Field) { std.Error(msg, fields...) } +func Panic(msg string, fields ...Field) { std.Panic(msg, fields...) 
} +func Fatal(msg string, fields ...Field) { std.Fatal(msg, fields...) } + +func Sync() error { return std.Sync() } diff --git a/internal/storagev2/common/log/log_test.go b/internal/storagev2/common/log/log_test.go new file mode 100644 index 0000000000000..646e1323e68e6 --- /dev/null +++ b/internal/storagev2/common/log/log_test.go @@ -0,0 +1,33 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package log + +import ( + "testing" +) + +func TestLogger(t *testing.T) { + defer Sync() + Info("Testing") + Debug("Testing") + Warn("Testing") + Error("Testing") + defer func() { + if err := recover(); err != nil { + Debug("logPanic recover") + } + }() + Panic("Testing") +} diff --git a/internal/storagev2/common/log/options.go b/internal/storagev2/common/log/options.go new file mode 100644 index 0000000000000..3e93d1beaea99 --- /dev/null +++ b/internal/storagev2/common/log/options.go @@ -0,0 +1,34 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// Option configures a logger; it is re-exported from zap.
type Option = zap.Option

// Logger options re-exported from zap, so that callers of this log package
// do not need to import zap themselves.
var (
	WrapCore      = zap.WrapCore
	Hooks         = zap.Hooks
	Fields        = zap.Fields
	ErrorOutput   = zap.ErrorOutput
	Development   = zap.Development
	AddCaller     = zap.AddCaller
	WithCaller    = zap.WithCaller
	AddCallerSkip = zap.AddCallerSkip
	AddStacktrace = zap.AddStacktrace
	IncreaseLevel = zap.IncreaseLevel
	WithFatalHook = zap.WithFatalHook
	WithClock     = zap.WithClock
)
+ +package utils + +import ( + "errors" + "fmt" + "path/filepath" + "strconv" + "strings" + + "github.com/apache/arrow/go/v12/arrow" + "github.com/apache/arrow/go/v12/arrow/endian" + "github.com/google/uuid" + "github.com/milvus-io/milvus/internal/storagev2/common/constant" + "github.com/milvus-io/milvus/internal/storagev2/common/log" + "github.com/milvus-io/milvus/internal/storagev2/proto/schema_proto" +) + +var ErrInvalidArgument = errors.New("invalid argument") + +func ToProtobufType(dataType arrow.Type) (schema_proto.LogicType, error) { + typeId := int(dataType) + if typeId < 0 || typeId >= int(schema_proto.LogicType_MAX_ID) { + return schema_proto.LogicType_NA, fmt.Errorf("parse data type %v: %w", dataType, ErrInvalidArgument) + } + return schema_proto.LogicType(typeId), nil +} + +func ToProtobufMetadata(metadata *arrow.Metadata) (*schema_proto.KeyValueMetadata, error) { + keys := metadata.Keys() + values := metadata.Values() + return &schema_proto.KeyValueMetadata{Keys: keys, Values: values}, nil +} + +func ToProtobufDataType(dataType arrow.DataType) (*schema_proto.DataType, error) { + protoType := &schema_proto.DataType{} + err := SetTypeValues(protoType, dataType) + if err != nil { + return nil, err + } + logicType, err := ToProtobufType(dataType.ID()) + if err != nil { + return nil, err + } + protoType.LogicType = logicType + + if len(GetFields(dataType)) > 0 { + for _, field := range GetFields(dataType) { + protoField := &schema_proto.Field{} + protoFieldType, err := ToProtobufField(&field) + if err != nil { + return nil, err + } + protoField = protoFieldType + protoType.Children = append(protoType.Children, protoField) + } + } + + return protoType, nil +} + +// GetFields TODO CHECK MORE TYPES +func GetFields(dataType arrow.DataType) []arrow.Field { + switch dataType.ID() { + case arrow.LIST: + listType, _ := dataType.(*arrow.ListType) + return listType.Fields() + case arrow.STRUCT: + structType, _ := dataType.(*arrow.StructType) + return 
structType.Fields() + case arrow.MAP: + mapType, _ := dataType.(*arrow.MapType) + return mapType.Fields() + case arrow.FIXED_SIZE_LIST: + listType, _ := dataType.(*arrow.FixedSizeListType) + return listType.Fields() + default: + return nil + } +} + +func ToProtobufField(field *arrow.Field) (*schema_proto.Field, error) { + protoField := &schema_proto.Field{} + protoField.Name = field.Name + protoField.Nullable = field.Nullable + + if field.Metadata.Len() != 0 { + fieldMetadata, err := ToProtobufMetadata(&field.Metadata) + if err != nil { + return nil, fmt.Errorf("convert to protobuf field: %w", err) + } + protoField.Metadata = fieldMetadata + } + + dataType, err := ToProtobufDataType(field.Type) + if err != nil { + return nil, fmt.Errorf("convert to protobuf field: %w", err) + } + protoField.DataType = dataType + return protoField, nil +} + +func SetTypeValues(protoType *schema_proto.DataType, dataType arrow.DataType) error { + switch dataType.ID() { + case arrow.FIXED_SIZE_BINARY: + realType, ok := dataType.(*arrow.FixedSizeBinaryType) + if !ok { + return fmt.Errorf("convert to fixed size binary type: %w", ErrInvalidArgument) + } + fixedSizeBinaryType := &schema_proto.FixedSizeBinaryType{} + fixedSizeBinaryType.ByteWidth = int32(realType.ByteWidth) + protoType.TypeRelatedValues = &schema_proto.DataType_FixedSizeBinaryType{FixedSizeBinaryType: fixedSizeBinaryType} + break + case arrow.FIXED_SIZE_LIST: + realType, ok := dataType.(*arrow.FixedSizeListType) + if !ok { + return fmt.Errorf("convert to fixed size list type: %w", ErrInvalidArgument) + } + fixedSizeListType := &schema_proto.FixedSizeListType{} + fixedSizeListType.ListSize = int32(realType.Len()) + protoType.TypeRelatedValues = &schema_proto.DataType_FixedSizeListType{FixedSizeListType: fixedSizeListType} + break + case arrow.DICTIONARY: + realType, ok := dataType.(*arrow.DictionaryType) + if !ok { + return fmt.Errorf("convert to dictionary type: %w", ErrInvalidArgument) + } + dictionaryType := 
&schema_proto.DictionaryType{} + indexType, err := ToProtobufDataType(realType.IndexType) + if err != nil { + return err + } + dictionaryType.IndexType = indexType + valueType, err := ToProtobufDataType(realType.ValueType) + if err != nil { + return err + } + dictionaryType.ValueType = valueType + dictionaryType.Ordered = realType.Ordered + protoType.TypeRelatedValues = &schema_proto.DataType_DictionaryType{DictionaryType: dictionaryType} + break + + case arrow.MAP: + realType, ok := dataType.(*arrow.MapType) + if !ok { + return fmt.Errorf("convert to map type: %w", ErrInvalidArgument) + } + mapType := &schema_proto.MapType{} + mapType.KeysSorted = realType.KeysSorted + protoType.TypeRelatedValues = &schema_proto.DataType_MapType{MapType: mapType} + break + + default: + } + + return nil +} + +func ToProtobufSchema(schema *arrow.Schema) (*schema_proto.ArrowSchema, error) { + protoSchema := &schema_proto.ArrowSchema{} + for _, field := range schema.Fields() { + protoField, err := ToProtobufField(&field) + if err != nil { + return nil, err + } + protoSchema.Fields = append(protoSchema.Fields, protoField) + } + if schema.Endianness() == endian.LittleEndian { + protoSchema.Endianness = schema_proto.Endianness_Little + } else if schema.Endianness() == endian.BigEndian { + protoSchema.Endianness = schema_proto.Endianness_Big + } + + // TODO FIX ME: golang proto not support proto_schema->mutable_metadata()->add_keys(key); + if schema.HasMetadata() && !schema.HasMetadata() { + for _, key := range schema.Metadata().Keys() { + protoKeyValue := protoSchema.GetMetadata() + protoKeyValue.Keys = append(protoKeyValue.Keys, key) + } + for _, value := range schema.Metadata().Values() { + protoKeyValue := protoSchema.GetMetadata() + protoKeyValue.Values = append(protoKeyValue.Values, value) + } + } + + return protoSchema, nil +} + +func FromProtobufSchema(schema *schema_proto.ArrowSchema) (*arrow.Schema, error) { + fields := make([]arrow.Field, 0, len(schema.Fields)) + for _, field 
:= range schema.Fields { + tmp, err := FromProtobufField(field) + if err != nil { + return nil, err + } + fields = append(fields, *tmp) + } + tmp, err := FromProtobufKeyValueMetadata(schema.Metadata) + if err != nil { + return nil, err + } + newSchema := arrow.NewSchema(fields, tmp) + return newSchema, nil +} + +func FromProtobufField(field *schema_proto.Field) (*arrow.Field, error) { + datatype, err := FromProtobufDataType(field.DataType) + if err != nil { + return nil, err + } + + metadata, err := FromProtobufKeyValueMetadata(field.GetMetadata()) + if err != nil { + return nil, err + } + + return &arrow.Field{Name: field.Name, Type: datatype, Nullable: field.Nullable, Metadata: *metadata}, nil +} + +func FromProtobufKeyValueMetadata(metadata *schema_proto.KeyValueMetadata) (*arrow.Metadata, error) { + keys := make([]string, 0) + values := make([]string, 0) + if metadata != nil { + keys = metadata.Keys + values = metadata.Values + } + newMetadata := arrow.NewMetadata(keys, values) + return &newMetadata, nil +} + +func FromProtobufDataType(dataType *schema_proto.DataType) (arrow.DataType, error) { + switch dataType.LogicType { + case schema_proto.LogicType_NA: + return &arrow.NullType{}, nil + case schema_proto.LogicType_BOOL: + return &arrow.BooleanType{}, nil + case schema_proto.LogicType_UINT8: + return &arrow.Uint8Type{}, nil + case schema_proto.LogicType_INT8: + return &arrow.Int8Type{}, nil + case schema_proto.LogicType_UINT16: + return &arrow.Uint16Type{}, nil + case schema_proto.LogicType_INT16: + return &arrow.Int16Type{}, nil + case schema_proto.LogicType_UINT32: + return &arrow.Uint32Type{}, nil + case schema_proto.LogicType_INT32: + return &arrow.Int32Type{}, nil + case schema_proto.LogicType_UINT64: + return &arrow.Uint64Type{}, nil + case schema_proto.LogicType_INT64: + return &arrow.Int64Type{}, nil + case schema_proto.LogicType_HALF_FLOAT: + return &arrow.Float16Type{}, nil + case schema_proto.LogicType_FLOAT: + return &arrow.Float32Type{}, nil + 
case schema_proto.LogicType_DOUBLE: + return &arrow.Float64Type{}, nil + case schema_proto.LogicType_STRING: + return &arrow.StringType{}, nil + case schema_proto.LogicType_BINARY: + return &arrow.BinaryType{}, nil + + case schema_proto.LogicType_LIST: + fieldType, err := FromProtobufField(dataType.Children[0]) + if err != nil { + return nil, err + } + listType := arrow.ListOf(fieldType.Type) + return listType, nil + + case schema_proto.LogicType_STRUCT: + fields := make([]arrow.Field, 0, len(dataType.Children)) + for _, child := range dataType.Children { + field, err := FromProtobufField(child) + if err != nil { + return nil, err + } + fields = append(fields, *field) + } + structType := arrow.StructOf(fields...) + return structType, nil + + case schema_proto.LogicType_DICTIONARY: + keyType, err := FromProtobufField(dataType.Children[0]) + if err != nil { + return nil, err + } + valueType, err := FromProtobufField(dataType.Children[1]) + if err != nil { + return nil, err + } + dictType := &arrow.DictionaryType{ + IndexType: keyType.Type, + ValueType: valueType.Type, + } + return dictType, nil + + case schema_proto.LogicType_MAP: + fieldType, err := FromProtobufField(dataType.Children[0]) + if err != nil { + return nil, err + } + // TODO FIX ME + return arrow.MapOf(fieldType.Type, fieldType.Type), nil + + case schema_proto.LogicType_FIXED_SIZE_BINARY: + + sizeBinaryType := arrow.FixedSizeBinaryType{ByteWidth: int(dataType.GetFixedSizeBinaryType().ByteWidth)} + return &sizeBinaryType, nil + + case schema_proto.LogicType_FIXED_SIZE_LIST: + fieldType, err := FromProtobufField(dataType.Children[0]) + if err != nil { + return nil, err + } + fixedSizeListType := arrow.FixedSizeListOf(int32(int(dataType.GetFixedSizeListType().ListSize)), fieldType.Type) + return fixedSizeListType, nil + + default: + return nil, fmt.Errorf("parse protobuf datatype: %w", ErrInvalidArgument) + } +} + +func GetNewParquetFilePath(path string) string { + scalarFileId := uuid.New() + path = 
filepath.Join(path, scalarFileId.String()+constant.ParquetDataFileSuffix) + return path +} + +func GetManifestFilePath(path string, version int64) string { + path = filepath.Join(path, constant.ManifestDir, strconv.FormatInt(version, 10)+constant.ManifestFileSuffix) + return path +} + +func GetManifestTmpFilePath(path string, version int64) string { + path = filepath.Join(path, constant.ManifestDir, strconv.FormatInt(version, 10)+constant.ManifestTempFileSuffix) + return path +} + +func GetBlobFilePath(path string) string { + blobId := uuid.New() + return filepath.Join(GetBlobDir(path), blobId.String()) +} + +func GetManifestDir(path string) string { + path = filepath.Join(path, constant.ManifestDir) + return path +} + +func GetVectorDataDir(path string) string { + return filepath.Join(path, constant.VectorDataDir) +} + +func GetScalarDataDir(path string) string { + return filepath.Join(path, constant.ScalarDataDir) +} + +func GetBlobDir(path string) string { + return filepath.Join(path, constant.BlobDir) +} + +func GetDeleteDataDir(path string) string { + return filepath.Join(path, constant.DeleteDataDir) +} + +func ParseVersionFromFileName(path string) int64 { + pos := strings.Index(path, constant.ManifestFileSuffix) + if pos == -1 || !strings.HasSuffix(path, constant.ManifestFileSuffix) { + log.Warn("manifest file suffix not match", log.String("path", path)) + return -1 + } + version := path[0:pos] + versionInt, err := strconv.ParseInt(version, 10, 64) + if err != nil { + log.Error("parse version from file name error", log.String("path", path), log.String("version", version)) + return -1 + } + return versionInt +} + +func ProjectSchema(sc *arrow.Schema, columns []string) *arrow.Schema { + var fields []arrow.Field + for _, field := range sc.Fields() { + for _, column := range columns { + if field.Name == column { + fields = append(fields, field) + break + } + } + } + + return arrow.NewSchema(fields, nil) +} diff --git a/internal/storagev2/docs/layout.md 
b/internal/storagev2/docs/layout.md new file mode 100644 index 0000000000000..776d0cd39d26e --- /dev/null +++ b/internal/storagev2/docs/layout.md @@ -0,0 +1,22 @@ + + +**Storage layer interface**: supplies the storage reader/writer, which accept read options. Maintains the storage metadata and handles atomic reads/writes across multiple files (possibly in different formats) on disk. + +--- + +**File Reader/Writer interface**: receives data and read options from the upper layer and converts the raw data into our own data representation. + +--- + +**File Format Reader/Writer**: file format reader/writer (e.g. parquet, raw, or others such as orc). + +--- + +**File system interface**: supports different file systems (e.g. in-memory, aws, minio, posix, windows). + + + + + + + diff --git a/internal/storagev2/file/blob/blob.go b/internal/storagev2/file/blob/blob.go new file mode 100644 index 0000000000000..d99147aeec539 --- /dev/null +++ b/internal/storagev2/file/blob/blob.go @@ -0,0 +1,39 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +package blob + +import "github.com/milvus-io/milvus/internal/storagev2/proto/manifest_proto" + +type Blob struct { + Name string + Size int64 + File string +} + +func (b Blob) ToProtobuf() *manifest_proto.Blob { + blob := &manifest_proto.Blob{} + blob.Name = b.Name + blob.Size = b.Size + blob.File = b.File + return blob +} + +func FromProtobuf(blob *manifest_proto.Blob) Blob { + return Blob{ + Name: blob.Name, + Size: blob.Size, + File: blob.File, + } +} diff --git a/internal/storagev2/file/fragment/deletefragment.go b/internal/storagev2/file/fragment/deletefragment.go new file mode 100644 index 0000000000000..2d7329adf5a8b --- /dev/null +++ b/internal/storagev2/file/fragment/deletefragment.go @@ -0,0 +1,45 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fragment + +import ( + "github.com/milvus-io/milvus/internal/storagev2/io/fs" + "github.com/milvus-io/milvus/internal/storagev2/storage/schema" +) + +type ( + pkType any + DeleteFragmentVector []DeleteFragment + DeleteFragment struct { + id int64 + schema *schema.Schema + fs fs.Fs + data map[pkType][]int64 + } +) + +func NewDeleteFragment(id int64, schema *schema.Schema, fs fs.Fs) *DeleteFragment { + return &DeleteFragment{ + id: id, + schema: schema, + fs: fs, + data: make(map[pkType][]int64), + } +} + +func Make(f fs.Fs, s *schema.Schema, frag Fragment) DeleteFragment { + // TODO: implement + panic("implement me") +} diff --git a/internal/storagev2/file/fragment/fragment.go b/internal/storagev2/file/fragment/fragment.go new file mode 100644 index 0000000000000..a69b1030e8d4f --- /dev/null +++ b/internal/storagev2/file/fragment/fragment.go @@ -0,0 +1,80 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fragment + +import "github.com/milvus-io/milvus/internal/storagev2/proto/manifest_proto" + +type FragmentType int32 + +const ( + kUnknown FragmentType = 0 + kData FragmentType = 1 + kDelete FragmentType = 2 +) + +type Fragment struct { + fragmentId int64 + files []string +} + +type FragmentVector []Fragment + +func ToFilesVector(fragments []Fragment) []string { + files := make([]string, 0) + for _, fragment := range fragments { + files = append(files, fragment.files...) 
+ } + return files +} + +func NewFragment() Fragment { + return Fragment{ + files: make([]string, 0), + } +} + +func (f *Fragment) AddFile(file string) { + f.files = append(f.files, file) +} + +func (f *Fragment) Files() []string { + return f.files +} + +func (f *Fragment) FragmentId() int64 { + return f.fragmentId +} + +func (f *Fragment) SetFragmentId(fragmentId int64) { + f.fragmentId = fragmentId +} + +func (f *Fragment) ToProtobuf() *manifest_proto.Fragment { + fragment := &manifest_proto.Fragment{} + fragment.Id = f.fragmentId + for _, file := range f.files { + fragment.Files = append(fragment.Files, file) + } + return fragment +} + +func FromProtobuf(fragment *manifest_proto.Fragment) Fragment { + newFragment := NewFragment() + newFragment.SetFragmentId(fragment.GetId()) + for _, file := range fragment.Files { + newFragment.files = append(newFragment.files, file) + } + return newFragment +} diff --git a/internal/storagev2/filter/conjunction_filter.go b/internal/storagev2/filter/conjunction_filter.go new file mode 100644 index 0000000000000..30d9bcb44cf74 --- /dev/null +++ b/internal/storagev2/filter/conjunction_filter.go @@ -0,0 +1,84 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package filter + +import ( + "github.com/apache/arrow/go/v12/arrow" + "github.com/apache/arrow/go/v12/parquet/metadata" + "github.com/bits-and-blooms/bitset" +) + +type ConjunctionAndFilter struct { + filters []Filter + columnName string +} + +func (f *ConjunctionAndFilter) GetColumnName() string { + return f.columnName +} + +// FIXME: should have 3 cases. +// 1. all records satisfy the filter, this group dont need to check filter again. +// 2. no record satisfies the filter. +// 3. some records satisfy the filter, this group should check filter again. +func (f *ConjunctionAndFilter) CheckStatistics(stats metadata.TypedStatistics) bool { + for _, filter := range f.filters { + if filter.CheckStatistics(stats) { + return true + } + } + return false +} + +func (f *ConjunctionAndFilter) Type() FilterType { + return And +} + +func (f *ConjunctionAndFilter) Apply(colData arrow.Array, filterBitSet *bitset.BitSet) { + for i := 0; i < len(f.filters); i++ { + f.filters[i].Apply(colData, filterBitSet) + } +} + +type ConjunctionOrFilter struct { + filters []Filter +} + +func (f *ConjunctionOrFilter) CheckStatistics(stats metadata.TypedStatistics) bool { + for _, filter := range f.filters { + if !filter.CheckStatistics(stats) { + return false + } + } + return true +} + +func (f *ConjunctionOrFilter) Apply(colData arrow.Array, filterBitSet *bitset.BitSet) { + orBitSet := bitset.New(filterBitSet.Len()) + for i := 1; i < len(f.filters); i++ { + childBitSet := filterBitSet.Clone() + f.filters[i].Apply(colData, childBitSet) + orBitSet.Intersection(childBitSet) + } + filterBitSet.Union(orBitSet) +} + +func (f *ConjunctionOrFilter) Type() FilterType { + return Or +} + +func NewConjunctionAndFilter(filters ...Filter) *ConjunctionAndFilter { + return &ConjunctionAndFilter{filters: filters} +} diff --git a/internal/storagev2/filter/constant_filter.go b/internal/storagev2/filter/constant_filter.go new file mode 100644 index 0000000000000..63a94b307a1b8 --- /dev/null +++ 
b/internal/storagev2/filter/constant_filter.go @@ -0,0 +1,151 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package filter + +import ( + "github.com/apache/arrow/go/v12/arrow" + "github.com/apache/arrow/go/v12/arrow/array" + "github.com/apache/arrow/go/v12/parquet" + "github.com/apache/arrow/go/v12/parquet/metadata" + "github.com/bits-and-blooms/bitset" +) + +type ConstantFilter struct { + cmpType ComparisonType + value interface{} + columnName string +} + +func (f *ConstantFilter) GetColumnName() string { + return f.columnName +} + +func (f *ConstantFilter) CheckStatistics(stats metadata.TypedStatistics) bool { + // FIXME: value may be int8/uint8/...., we should encapsulate the value type, now we just do type assertion for prototype + switch stats.Type() { + case parquet.Types.Int32: + i32stats := stats.(*metadata.Int32Statistics) + if i32stats.HasMinMax() { + return checkStats(f.value.(int32), i32stats.Min(), i32stats.Max(), f.cmpType) + } + case parquet.Types.Int64: + i64stats := stats.(*metadata.Int64Statistics) + if i64stats.HasMinMax() { + return checkStats(f.value.(int64), i64stats.Min(), i64stats.Max(), f.cmpType) + } + case parquet.Types.Float: + floatstats := stats.(*metadata.Float32Statistics) + if floatstats.HasMinMax() { + return checkStats(f.value.(float32), floatstats.Min(), floatstats.Max(), f.cmpType) + } + case parquet.Types.Double: + doublestats := stats.(*metadata.Float64Statistics) + if 
doublestats.HasMinMax() { + return checkStats(f.value.(float64), doublestats.Min(), doublestats.Max(), f.cmpType) + } + } + return false +} + +type comparableValue interface { + int32 | int64 | float32 | float64 +} + +func checkStats[T comparableValue](value, min, max T, cmpType ComparisonType) bool { + switch cmpType { + case Equal: + return value < min || value > max + case NotEqual: + return value == min && value == max + case LessThan: + return value <= min + case LessThanOrEqual: + return value < min + case GreaterThan: + return value >= max + case GreaterThanOrEqual: + return value > max + default: + return false + } +} + +func (f *ConstantFilter) Apply(colData arrow.Array, filterBitSet *bitset.BitSet) { + switch data := colData.(type) { + case *array.Int8: + filterColumn(f.value.(int8), data.Int8Values(), f.cmpType, filterBitSet) + case *array.Uint8: + filterColumn(f.value.(uint8), data.Uint8Values(), f.cmpType, filterBitSet) + case *array.Int16: + filterColumn(f.value.(int16), data.Int16Values(), f.cmpType, filterBitSet) + case *array.Uint16: + filterColumn(f.value.(uint16), data.Uint16Values(), f.cmpType, filterBitSet) + case *array.Int32: + filterColumn(f.value.(int32), data.Int32Values(), f.cmpType, filterBitSet) + case *array.Uint32: + filterColumn(f.value.(uint32), data.Uint32Values(), f.cmpType, filterBitSet) + case *array.Int64: + filterColumn(f.value.(int64), data.Int64Values(), f.cmpType, filterBitSet) + case *array.Uint64: + filterColumn(f.value.(uint64), data.Uint64Values(), f.cmpType, filterBitSet) + case *array.Float32: + filterColumn(f.value.(float32), data.Float32Values(), f.cmpType, filterBitSet) + case *array.Float64: + filterColumn(f.value.(float64), data.Float64Values(), f.cmpType, filterBitSet) + } +} + +type comparableColumnType interface { + int8 | uint8 | int16 | uint16 | int32 | uint32 | int64 | uint64 | float32 | float64 +} + +func filterColumn[T comparableColumnType](value T, targets []T, cmpType ComparisonType, filterBitSet 
*bitset.BitSet) { + for i, target := range targets { + if checkColumn(value, target, cmpType) { + filterBitSet.Set(uint(i)) + } + } +} + +func checkColumn[T comparableColumnType](value, target T, cmpType ComparisonType) bool { + switch cmpType { + case Equal: + return value != target + case NotEqual: + return value == target + case LessThan: + return value <= target + case LessThanOrEqual: + return value < target + case GreaterThan: + return value >= target + case GreaterThanOrEqual: + return value > target + default: + return false + } +} + +func (f *ConstantFilter) Type() FilterType { + return Constant +} + +func NewConstantFilter(cmpType ComparisonType, columnName string, value interface{}) *ConstantFilter { + return &ConstantFilter{ + cmpType: cmpType, + columnName: columnName, + value: value, + } +} diff --git a/internal/storagev2/filter/filter.go b/internal/storagev2/filter/filter.go new file mode 100644 index 0000000000000..4b803fcdd3352 --- /dev/null +++ b/internal/storagev2/filter/filter.go @@ -0,0 +1,48 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
// Definitions from internal/storagev2/filter/filter.go (package filter).

// FilterType identifies the kind of a Filter.
type FilterType int8

const (
	And FilterType = iota
	Or
	Constant
	Range
)

// Filter is the contract implemented by the filters in this package.
type Filter interface {
	// CheckStatistics evaluates the filter against column statistics.
	// The parquet reader skips a row group when this returns true.
	CheckStatistics(metadata.TypedStatistics) bool
	// Type returns the kind of the filter.
	Type() FilterType
	// Apply evaluates the filter on colData and records the outcome in
	// filterBitSet. The parquet reader drops rows whose bit is set.
	Apply(colData arrow.Array, filterBitSet *bitset.BitSet)
	// GetColumnName returns the name of the column the filter applies to.
	GetColumnName() string
}

// ComparisonType identifies the comparison operator of a constant filter.
type ComparisonType int8

const (
	Equal ComparisonType = iota
	NotEqual
	LessThan
	LessThanOrEqual
	GreaterThan
	GreaterThanOrEqual
)
+ +package parquet + +import ( + "context" + + "github.com/apache/arrow/go/v12/arrow" + "github.com/apache/arrow/go/v12/arrow/array" + "github.com/apache/arrow/go/v12/arrow/memory" + "github.com/apache/arrow/go/v12/parquet/file" + "github.com/apache/arrow/go/v12/parquet/metadata" + "github.com/apache/arrow/go/v12/parquet/pqarrow" + "github.com/bits-and-blooms/bitset" + "github.com/milvus-io/milvus/internal/storagev2/common/constant" + "github.com/milvus-io/milvus/internal/storagev2/filter" + "github.com/milvus-io/milvus/internal/storagev2/io/fs" + "github.com/milvus-io/milvus/internal/storagev2/storage/options" +) + +type FileReader struct { + reader *pqarrow.FileReader + options *options.ReadOptions + recReader pqarrow.RecordReader +} + +// When the Reader reaches the end of the underlying stream, it returns (nil, io.EOF) +func (r *FileReader) Read() (arrow.Record, error) { + if r.recReader == nil { + // lazy init + if err := r.initRecReader(); err != nil { + return nil, err + } + } + rec, err := r.recReader.Read() + if err != nil { + return nil, err + } + + return applyFilters(rec, r.options.Filters), nil +} + +func applyFilters(rec arrow.Record, filters map[string]filter.Filter) arrow.Record { + filterBitSet := bitset.New(uint(rec.NumRows())) + for col, f := range filters { + colIndices := rec.Schema().FieldIndices(col) + if len(colIndices) == 0 { + panic("column not found") + } + colIndex := colIndices[0] + arr := rec.Column(colIndex) + f.Apply(arr, filterBitSet) + } + + if filterBitSet.None() { + return rec + } + + var cols []arrow.Array + for i := 0; i < int(rec.NumCols()); i++ { + col := rec.Column(i) + switch t := col.(type) { + case *array.Int8: + builder := array.NewInt8Builder(memory.DefaultAllocator) + filtered := filterRecord(t.Int8Values(), filterBitSet) + builder.AppendValues(filtered, nil) + cols = append(cols, builder.NewArray()) + case *array.Uint8: + builder := array.NewUint8Builder(memory.DefaultAllocator) + filtered := 
filterRecord(t.Uint8Values(), filterBitSet) + builder.AppendValues(filtered, nil) + cols = append(cols, builder.NewArray()) + case *array.Int16: + builder := array.NewInt16Builder(memory.DefaultAllocator) + filtered := filterRecord(t.Int16Values(), filterBitSet) + builder.AppendValues(filtered, nil) + cols = append(cols, builder.NewArray()) + case *array.Uint16: + builder := array.NewUint16Builder(memory.DefaultAllocator) + filtered := filterRecord(t.Uint16Values(), filterBitSet) + builder.AppendValues(filtered, nil) + cols = append(cols, builder.NewArray()) + case *array.Int32: + builder := array.NewInt32Builder(memory.DefaultAllocator) + filtered := filterRecord(t.Int32Values(), filterBitSet) + builder.AppendValues(filtered, nil) + cols = append(cols, builder.NewArray()) + case *array.Uint32: + builder := array.NewUint32Builder(memory.DefaultAllocator) + filtered := filterRecord(t.Uint32Values(), filterBitSet) + builder.AppendValues(filtered, nil) + cols = append(cols, builder.NewArray()) + case *array.Int64: + builder := array.NewInt64Builder(memory.DefaultAllocator) + filtered := filterRecord(t.Int64Values(), filterBitSet) + builder.AppendValues(filtered, nil) + cols = append(cols, builder.NewArray()) + case *array.Uint64: + builder := array.NewUint64Builder(memory.DefaultAllocator) + filtered := filterRecord(t.Uint64Values(), filterBitSet) + builder.AppendValues(filtered, nil) + cols = append(cols, builder.NewArray()) + default: + panic("unsupported type") + } + } + + return array.NewRecord(rec.Schema(), cols, int64(cols[0].Len())) +} + +type comparableColumnType interface { + int8 | uint8 | int16 | uint16 | int32 | uint32 | int64 | uint64 | float32 | float64 +} + +func filterRecord[T comparableColumnType](targets []T, filterBitSet *bitset.BitSet) []T { + var res []T + for i := 0; i < int(filterBitSet.Len()); i++ { + if !filterBitSet.Test(uint(i)) { + res = append(res, targets[i]) + } + } + return res +} + +func (r *FileReader) initRecReader() error { + var ( 
+ filters map[string]filter.Filter = r.options.Filters + columns []string = r.options.Columns + ) + + var ( + rowGroupNum int = r.reader.ParquetReader().NumRowGroups() + fileMetaData *metadata.FileMetaData = r.reader.ParquetReader().MetaData() + ) + + var rowGroups []int + var colIndices []int + // filters check column statistics +x1: + for i := 0; i < rowGroupNum; i++ { + rowGroupMetaData := fileMetaData.RowGroup(i) + for col, filter := range filters { + if checkColumnStats(rowGroupMetaData, col, filter) { + // ignore the row group + break x1 + } + } + rowGroups = append(rowGroups, i) + } + + for _, col := range columns { + colIndex := fileMetaData.Schema.Root().FieldIndexByName(col) + if colIndex == -1 { + panic("column not found") + } + colIndices = append(colIndices, colIndex) + } + + recReader, err := r.reader.GetRecordReader(context.TODO(), colIndices, rowGroups) + if err != nil { + return err + } + r.recReader = recReader + return nil +} + +func checkColumnStats(rowGroupMetaData *metadata.RowGroupMetaData, col string, f filter.Filter) bool { + colIndex := rowGroupMetaData.Schema.Root().FieldIndexByName(col) + if colIndex == -1 { + panic("column not found") + } + colMetaData, err := rowGroupMetaData.ColumnChunk(colIndex) + if err != nil { + panic(err) + } + + stats, err := colMetaData.Statistics() + if err != nil || stats == nil { + return false + } + return f.CheckStatistics(stats) +} + +func (r *FileReader) Close() error { + if r.recReader != nil { + r.recReader.Release() + } + return nil +} + +func NewFileReader(fs fs.Fs, filePath string, options *options.ReadOptions) (*FileReader, error) { + f, err := fs.OpenFile(filePath) + if err != nil { + return nil, err + } + + parquetReader, err := file.NewParquetReader(f) + if err != nil { + return nil, err + } + + reader, err := pqarrow.NewFileReader(parquetReader, pqarrow.ArrowReadProperties{BatchSize: constant.ReadBatchSize}, memory.DefaultAllocator) + if err != nil { + return nil, err + } + return 
&FileReader{reader: reader, options: options}, nil +} diff --git a/internal/storagev2/io/format/parquet/file_writer.go b/internal/storagev2/io/format/parquet/file_writer.go new file mode 100644 index 0000000000000..9875cbe180695 --- /dev/null +++ b/internal/storagev2/io/format/parquet/file_writer.go @@ -0,0 +1,60 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package parquet + +import ( + "github.com/apache/arrow/go/v12/arrow" + "github.com/apache/arrow/go/v12/parquet" + "github.com/apache/arrow/go/v12/parquet/pqarrow" + "github.com/milvus-io/milvus/internal/storagev2/io/format" + "github.com/milvus-io/milvus/internal/storagev2/io/fs" +) + +var _ format.Writer = (*FileWriter)(nil) + +type FileWriter struct { + writer *pqarrow.FileWriter + count int64 +} + +func (f *FileWriter) Write(record arrow.Record) error { + if err := f.writer.Write(record); err != nil { + return err + } + f.count += record.NumRows() + return nil +} + +func (f *FileWriter) Count() int64 { + return f.count +} + +func (f *FileWriter) Close() error { + return f.writer.Close() +} + +func NewFileWriter(schema *arrow.Schema, fs fs.Fs, filePath string) (*FileWriter, error) { + file, err := fs.OpenFile(filePath) + if err != nil { + return nil, err + } + + w, err := pqarrow.NewFileWriter(schema, file, parquet.NewWriterProperties(), pqarrow.DefaultWriterProps()) + if err != nil { + return nil, err + } + + return &FileWriter{writer: w}, nil +} diff --git 
// Definitions from internal/storagev2/io/format/reader.go (package format).

// Reader is the record-reading contract for the storage file formats.
type Reader interface {
	// Read returns the next record or an error.
	// NOTE(review): the parquet implementation documents (nil, io.EOF) at end
	// of stream; presumably all implementations follow that, to be confirmed.
	Read() (arrow.Record, error)
	// Close releases the resources held by the reader.
	Close() error
}
+ +package format + +import "github.com/apache/arrow/go/v12/arrow" + +type Writer interface { + Write(record arrow.Record) error + Count() int64 + Close() error +} diff --git a/internal/storagev2/io/fs/factory.go b/internal/storagev2/io/fs/factory.go new file mode 100644 index 0000000000000..7271115ec7167 --- /dev/null +++ b/internal/storagev2/io/fs/factory.go @@ -0,0 +1,40 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "net/url" + + "github.com/milvus-io/milvus/internal/storagev2/storage/options" +) + +type Factory struct{} + +func (f *Factory) Create(fsType options.FsType, uri *url.URL) (Fs, error) { + switch fsType { + case options.InMemory: + return NewMemoryFs(), nil + case options.LocalFS: + return NewLocalFs(uri), nil + case options.S3: + return NewMinioFs(uri) + default: + panic("unknown fs type") + } +} + +func NewFsFactory() *Factory { + return &Factory{} +} diff --git a/internal/storagev2/io/fs/file/file.go b/internal/storagev2/io/fs/file/file.go new file mode 100644 index 0000000000000..d29729a587449 --- /dev/null +++ b/internal/storagev2/io/fs/file/file.go @@ -0,0 +1,25 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
// Definitions from internal/storagev2/io/fs/file/file.go (package file).

// File is the handle type returned by the storage file systems. It combines
// sequential reads and writes, positioned reads, seeking and closing.
type File interface {
	io.Writer
	io.ReaderAt
	io.Seeker
	io.Reader
	io.Closer
}
// Definitions from internal/storagev2/io/fs/file/local_file.go (package file).

// EOF re-exports io.EOF.
var EOF = io.EOF

// LocalFile adapts an *os.File to the storage file abstraction.
//
// It keeps the *os.File pointer itself rather than a copy of the os.File
// struct, so constructing a LocalFile never dereferences the argument and a
// nil file surfaces as an error from the os.File methods instead of a panic
// in NewLocalFile.
type LocalFile struct {
	file *os.File
}

// Read reads from the file at its current offset.
func (l *LocalFile) Read(p []byte) (n int, err error) {
	return l.file.Read(p)
}

// Write writes p to the file at its current offset.
func (l *LocalFile) Write(p []byte) (n int, err error) {
	return l.file.Write(p)
}

// ReadAt reads from the file starting at byte offset off.
func (l *LocalFile) ReadAt(p []byte, off int64) (n int, err error) {
	return l.file.ReadAt(p, off)
}

// Seek sets the file offset for the next Read or Write.
func (l *LocalFile) Seek(offset int64, whence int) (int64, error) {
	return l.file.Seek(offset, whence)
}

// Close closes the underlying file.
func (l *LocalFile) Close() error {
	return l.file.Close()
}

// NewLocalFile wraps f. The LocalFile shares f with the caller; closing one
// closes the other.
func NewLocalFile(f *os.File) *LocalFile {
	return &LocalFile{
		file: f,
	}
}
// Definitions from internal/storagev2/io/fs/file/memory_file.go (package file).

// errInvalid is returned for negative or unrepresentable offsets and for
// unknown Seek whence values.
var errInvalid = errors.New("invalid argument")

// MemoryFile is an in-memory file: a byte slice plus a cursor used by Read,
// Write and Seek.
type MemoryFile struct {
	b []byte // file contents
	i int    // cursor for Read/Write/Seek
}

// Close is a no-op and always returns nil.
func (f *MemoryFile) Close() error {
	return nil
}

// Read copies bytes from the cursor into p and advances the cursor. It
// returns io.EOF when the cursor is at or past the end of the contents.
func (f *MemoryFile) Read(p []byte) (n int, err error) {
	if f.i >= len(f.b) {
		return 0, io.EOF
	}
	n = copy(p, f.b[f.i:])
	f.i += n
	return n, nil
}

// Write stores b at the cursor, overwriting and/or extending the contents,
// and advances the cursor by the number of bytes written.
func (f *MemoryFile) Write(b []byte) (int, error) {
	n, err := f.writeAt(b, int64(f.i))
	f.i += n
	return n, err
}

// writeAt stores b at offset off. A gap between the current end and off is
// zero-filled. The cursor is not touched.
func (f *MemoryFile) writeAt(b []byte, off int64) (int, error) {
	if off < 0 || int64(int(off)) < off {
		return 0, errInvalid
	}
	if off > int64(len(f.b)) {
		if err := f.truncate(off); err != nil {
			return 0, err
		}
	}
	// Overwrite what fits into the existing contents, then append the rest.
	n := copy(f.b[off:], b)
	f.b = append(f.b, b[n:]...)
	return len(b), nil
}

// truncate resizes the contents to exactly n bytes, zero-filling when growing.
func (f *MemoryFile) truncate(n int64) error {
	switch {
	case n < 0 || int64(int(n)) < n:
		return errInvalid
	case n <= int64(len(f.b)):
		f.b = f.b[:n]
		return nil
	default:
		f.b = append(f.b, make([]byte, int(n)-len(f.b))...)
		return nil
	}
}

// ReadAt copies bytes starting at offset off into b. It returns io.EOF
// together with the bytes read when fewer than len(b) bytes are available.
//
// As io.ReaderAt requires, ReadAt neither uses nor moves the cursor, so it
// can be interleaved with Read/Write without corrupting their position.
func (f *MemoryFile) ReadAt(b []byte, off int64) (n int, err error) {
	if off < 0 || int64(int(off)) < off {
		return 0, errInvalid
	}
	if off > int64(len(f.b)) {
		return 0, io.EOF
	}
	n = copy(b, f.b[off:])
	if n < len(b) {
		return n, io.EOF
	}
	return n, nil
}

// Seek moves the cursor relative to the start, the current position or the
// end, depending on whence, and returns the new absolute position. Positions
// past the end are allowed; negative positions are rejected.
func (f *MemoryFile) Seek(offset int64, whence int) (int64, error) {
	var abs int64
	switch whence {
	case io.SeekStart:
		abs = offset
	case io.SeekCurrent:
		abs = int64(f.i) + offset
	case io.SeekEnd:
		abs = int64(len(f.b)) + offset
	default:
		return 0, errInvalid
	}
	if abs < 0 {
		return 0, errInvalid
	}
	f.i = int(abs)
	return abs, nil
}

// Bytes returns the file contents. The slice is the file's own storage, not
// a copy.
func (f *MemoryFile) Bytes() []byte {
	return f.b
}

// NewMemoryFile returns a file whose initial contents are b and whose cursor
// is at offset 0. The file uses b directly rather than copying it.
func NewMemoryFile(b []byte) *MemoryFile {
	return &MemoryFile{
		b: b,
	}
}
b/internal/storagev2/io/fs/file/minio_file.go @@ -0,0 +1,73 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package file + +import ( + "bytes" + "context" + + "github.com/minio/minio-go/v7" +) + +var _ File = (*MinioFile)(nil) + +type MinioFile struct { + *minio.Object + writer *MemoryFile + client *minio.Client + fileName string + bucketName string +} + +func (f *MinioFile) Write(b []byte) (int, error) { + return f.writer.Write(b) +} + +func (f *MinioFile) Close() error { + if len(f.writer.b) == 0 { + return nil + } + _, err := f.client.PutObject(context.TODO(), f.bucketName, f.fileName, bytes.NewReader(f.writer.b), int64(len(f.writer.b)), minio.PutObjectOptions{}) + return err +} + +func NewMinioFile(client *minio.Client, fileName string, bucketName string) (*MinioFile, error) { + _, err := client.StatObject(context.TODO(), bucketName, fileName, minio.StatObjectOptions{}) + if err != nil { + eresp := minio.ToErrorResponse(err) + if eresp.Code != "NoSuchKey" { + return nil, err + } + return &MinioFile{ + writer: NewMemoryFile(nil), + client: client, + fileName: fileName, + bucketName: bucketName, + }, nil + } + + object, err := client.GetObject(context.TODO(), bucketName, fileName, minio.GetObjectOptions{}) + if err != nil { + return nil, err + } + + return &MinioFile{ + Object: object, + writer: NewMemoryFile(nil), + client: client, + fileName: fileName, + bucketName: bucketName, + }, nil +} diff --git 
a/internal/storagev2/io/fs/fs.go b/internal/storagev2/io/fs/fs.go new file mode 100644 index 0000000000000..ef605de4da3d0 --- /dev/null +++ b/internal/storagev2/io/fs/fs.go @@ -0,0 +1,34 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "github.com/milvus-io/milvus/internal/storagev2/io/fs/file" +) + +type Fs interface { + OpenFile(path string) (file.File, error) + Rename(src string, dst string) error + DeleteFile(path string) error + CreateDir(path string) error + List(path string) ([]FileEntry, error) + ReadFile(path string) ([]byte, error) + Exist(path string) (bool, error) + Path() string + MkdirAll(dir string, i int) error +} +type FileEntry struct { + Path string +} diff --git a/internal/storagev2/io/fs/fs_util.go b/internal/storagev2/io/fs/fs_util.go new file mode 100644 index 0000000000000..1025418a30c08 --- /dev/null +++ b/internal/storagev2/io/fs/fs_util.go @@ -0,0 +1,41 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "errors" + "fmt" + "net/url" + + "github.com/milvus-io/milvus/internal/storagev2/storage/options" +) + +var ErrInvalidFsType = errors.New("invalid fs type") + +func BuildFileSystem(uri string) (Fs, error) { + parsedUri, err := url.Parse(uri) + if err != nil { + return nil, fmt.Errorf("build file system with uri %s: %w", uri, err) + } + switch parsedUri.Scheme { + case "file": + return NewFsFactory().Create(options.LocalFS, parsedUri) + case "s3": + return NewFsFactory().Create(options.S3, parsedUri) + + default: + return nil, fmt.Errorf("build file system with uri %s: %w", uri, ErrInvalidFsType) + } +} diff --git a/internal/storagev2/io/fs/local_fs.go b/internal/storagev2/io/fs/local_fs.go new file mode 100644 index 0000000000000..a08a82f610d3a --- /dev/null +++ b/internal/storagev2/io/fs/local_fs.go @@ -0,0 +1,95 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fs + +import ( + "net/url" + "os" + "path/filepath" + + "github.com/milvus-io/milvus/internal/storagev2/common/log" + "github.com/milvus-io/milvus/internal/storagev2/io/fs/file" +) + +type LocalFS struct { + path string +} + +func (l *LocalFS) MkdirAll(dir string, i int) error { + return os.MkdirAll(dir, os.FileMode(i)) +} + +func (l *LocalFS) OpenFile(path string) (file.File, error) { + // Extract the directory from the path + dir := filepath.Dir(path) + // Create the directory (including all necessary parent directories) + err := os.MkdirAll(dir, os.ModePerm) + if err != nil { + return nil, err + } + open, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE, 0o666) + if err != nil { + return nil, err + } + return file.NewLocalFile(open), nil +} + +// Rename renames (moves) a file. If newpath already exists and is not a directory, Rename replaces it. +func (l *LocalFS) Rename(src string, dst string) error { + return os.Rename(src, dst) +} + +func (l *LocalFS) DeleteFile(path string) error { + return os.Remove(path) +} + +func (l *LocalFS) CreateDir(path string) error { + err := os.MkdirAll(path, os.ModePerm) + if err != nil && !os.IsExist(err) { + log.Error(err.Error()) + } + return nil +} + +func (l *LocalFS) List(path string) ([]FileEntry, error) { + entries, err := os.ReadDir(path) + if err != nil { + log.Error(err.Error()) + return nil, err + } + + ret := make([]FileEntry, 0, len(entries)) + for _, entry := range entries { + ret = append(ret, FileEntry{Path: filepath.Join(path, entry.Name())}) + } + + return ret, nil +} + +func (l *LocalFS) ReadFile(path string) ([]byte, error) { + return os.ReadFile(path) +} + +func (l *LocalFS) Exist(path string) (bool, error) { + panic("not implemented") +} + +func (l *LocalFS) Path() string { + return l.path +} + +func NewLocalFs(uri *url.URL) *LocalFS { + return &LocalFS{uri.Path} +} diff --git a/internal/storagev2/io/fs/memory_fs.go b/internal/storagev2/io/fs/memory_fs.go new file mode 100644 index 
0000000000000..5bebc3c40f13d --- /dev/null +++ b/internal/storagev2/io/fs/memory_fs.go @@ -0,0 +1,78 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "github.com/milvus-io/milvus/internal/storagev2/io/fs/file" +) + +type MemoryFs struct { + files map[string]*file.MemoryFile +} + +func (m *MemoryFs) MkdirAll(dir string, i int) error { + // TODO implement me + panic("implement me") +} + +func (m *MemoryFs) List(path string) ([]FileEntry, error) { + // TODO implement me + panic("implement me") +} + +func (m *MemoryFs) OpenFile(path string) (file.File, error) { + if f, ok := m.files[path]; ok { + return file.NewMemoryFile(f.Bytes()), nil + } + f := file.NewMemoryFile(nil) + m.files[path] = f + return f, nil +} + +func (m *MemoryFs) Rename(path string, path2 string) error { + if _, ok := m.files[path]; !ok { + return nil + } + m.files[path2] = m.files[path] + delete(m.files, path) + return nil +} + +func (m *MemoryFs) DeleteFile(path string) error { + delete(m.files, path) + return nil +} + +func (m *MemoryFs) CreateDir(path string) error { + return nil +} + +func (m *MemoryFs) ReadFile(path string) ([]byte, error) { + panic("implement me") +} + +func (m *MemoryFs) Exist(path string) (bool, error) { + panic("not implemented") +} + +func (m *MemoryFs) Path() string { + panic("not implemented") +} + +func NewMemoryFs() *MemoryFs { + return &MemoryFs{ + files: make(map[string]*file.MemoryFile), + } +} diff 
--git a/internal/storagev2/io/fs/minio_fs.go b/internal/storagev2/io/fs/minio_fs.go new file mode 100644 index 0000000000000..805d6ff356e6b --- /dev/null +++ b/internal/storagev2/io/fs/minio_fs.go @@ -0,0 +1,200 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "context" + "fmt" + "io" + "net/url" + "path" + "strings" + + "github.com/milvus-io/milvus/internal/storagev2/common/constant" + "github.com/milvus-io/milvus/internal/storagev2/common/errors" + "github.com/milvus-io/milvus/internal/storagev2/common/log" + "github.com/milvus-io/milvus/internal/storagev2/io/fs/file" + "github.com/minio/minio-go/v7" + "github.com/minio/minio-go/v7/pkg/credentials" + "go.uber.org/zap" +) + +type MinioFs struct { + client *minio.Client + bucketName string + path string +} + +func (fs *MinioFs) MkdirAll(dir string, i int) error { + // TODO implement me + panic("implement me") +} + +func (fs *MinioFs) OpenFile(path string) (file.File, error) { + err, bucket, path := getRealPath(path) + if err != nil { + return nil, err + } + return file.NewMinioFile(fs.client, path, bucket) +} + +func (fs *MinioFs) Rename(src string, dst string) error { + err, dstBucket, dst := getRealPath(dst) + if err != nil { + return err + } + err, srcBucket, src := getRealPath(src) + if err != nil { + return err + } + _, err = fs.client.CopyObject(context.TODO(), minio.CopyDestOptions{Bucket: dstBucket, Object: dst}, minio.CopySrcOptions{Bucket: 
srcBucket, Object: src}) + if err != nil { + return err + } + err = fs.client.RemoveObject(context.TODO(), srcBucket, src, minio.RemoveObjectOptions{}) + if err != nil { + log.Warn("failed to remove source object", log.String("source", src)) + } + return nil +} + +func (fs *MinioFs) DeleteFile(path string) error { + err, bucket, path := getRealPath(path) + if err != nil { + return err + } + return fs.client.RemoveObject(context.TODO(), bucket, path, minio.RemoveObjectOptions{}) +} + +func (fs *MinioFs) CreateDir(path string) error { + return nil +} + +func (fs *MinioFs) List(prefix string) ([]FileEntry, error) { + err, bucket, prefix := getRealPath(prefix) + if err != nil { + return nil, err + } + ret := make([]FileEntry, 0) + for objInfo := range fs.client.ListObjects(context.TODO(), bucket, minio.ListObjectsOptions{Prefix: prefix, Recursive: true}) { + if objInfo.Err != nil { + log.Warn("list object error", zap.Error(objInfo.Err)) + return nil, objInfo.Err + } + ret = append(ret, FileEntry{Path: path.Join(bucket, objInfo.Key)}) + } + return ret, nil +} + +func (fs *MinioFs) ReadFile(path string) ([]byte, error) { + err, bucket, path := getRealPath(path) + if err != nil { + return nil, err + } + obj, err := fs.client.GetObject(context.TODO(), bucket, path, minio.GetObjectOptions{}) + if err != nil { + return nil, err + } + + stat, err := obj.Stat() + if err != nil { + return nil, err + } + + buf := make([]byte, stat.Size) + n, err := obj.Read(buf) + if err != nil && err != io.EOF { + return nil, err + } + if n != int(stat.Size) { + return nil, fmt.Errorf("failed to read full file, expect: %d, actual: %d", stat.Size, n) + } + return buf, nil +} + +func (fs *MinioFs) Exist(path string) (bool, error) { + err, bucket, path := getRealPath(path) + if err != nil { + return false, err + } + _, err = fs.client.StatObject(context.TODO(), bucket, path, minio.StatObjectOptions{}) + if err != nil { + resp := minio.ToErrorResponse(err) + if resp.Code == "NoSuchKey" { + return 
false, nil + } + return false, err + } + return true, nil +} + +func (fs *MinioFs) Path() string { + return path.Join(fs.bucketName, strings.TrimPrefix(fs.path, "/")) +} + +// uri should be s3://username:password@bucket/path?endpoint_override=localhost%3A9000 +func NewMinioFs(uri *url.URL) (*MinioFs, error) { + accessKey := uri.User.Username() + secretAccessKey, set := uri.User.Password() + if !set { + log.Warn("secret access key not set") + } + + endpoints, ok := uri.Query()[constant.EndpointOverride] + if !ok || len(endpoints) == 0 { + return nil, errors.ErrNoEndpoint + } + + cli, err := minio.New(endpoints[0], &minio.Options{ + BucketLookup: minio.BucketLookupAuto, + Creds: credentials.NewStaticV4(accessKey, secretAccessKey, ""), + }) + if err != nil { + return nil, err + } + + bucket := uri.Host + path := uri.Path + + log.Info("minio fs infos", zap.String("endpoint", endpoints[0]), zap.String("bucket", bucket), zap.String("path", path)) + + exist, err := cli.BucketExists(context.TODO(), bucket) + if err != nil { + return nil, err + } + + if !exist { + if err = cli.MakeBucket(context.TODO(), bucket, minio.MakeBucketOptions{}); err != nil { + return nil, err + } + } + + return &MinioFs{ + client: cli, + bucketName: bucket, + path: path, + }, nil +} + +func getRealPath(path string) (error, string, string) { + if strings.HasPrefix(path, "/") { + return fmt.Errorf("Invalid path, %s should not start with '/'", path), "", "" + } + words := strings.SplitN(path, "/", 2) + if (len(words)) != 2 { + return fmt.Errorf("Invalid path, %s should contains at least one '/'", path), "", "" + } + return nil, words[0], words[1] +} diff --git a/internal/storagev2/io/fs/minio_fs_test.go b/internal/storagev2/io/fs/minio_fs_test.go new file mode 100644 index 0000000000000..b34cd0ce2cac3 --- /dev/null +++ b/internal/storagev2/io/fs/minio_fs_test.go @@ -0,0 +1,141 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this 
file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs_test + +import ( + "io" + "testing" + + "github.com/milvus-io/milvus/internal/storagev2/io/fs" + "github.com/minio/minio-go/v7" + "github.com/stretchr/testify/suite" +) + +type MinioFsTestSuite struct { + suite.Suite + fs fs.Fs + client *minio.Client +} + +func (suite *MinioFsTestSuite) SetupSuite() { + fs, err := fs.BuildFileSystem("s3://minioadmin:minioadmin@default/path1?endpoint_override=localhost%3A9000") + suite.NoError(err) + suite.fs = fs +} + +func (suite *MinioFsTestSuite) TestMinioOpenFile() { + file, err := suite.fs.OpenFile("default/a") + suite.NoError(err) + n, err := file.Write([]byte{1}) + suite.NoError(err) + suite.Equal(1, n) + suite.NoError(file.Close()) + + file, err = suite.fs.OpenFile("default/a") + suite.NoError(err) + buf := make([]byte, 10) + n, err = file.Read(buf) + suite.Equal(io.EOF, err) + suite.Equal(1, n) + suite.ElementsMatch(buf[:n], []byte{1}) + + suite.NoError(suite.fs.DeleteFile("default/a")) +} + +func (suite *MinioFsTestSuite) TestMinioRename() { + file, err := suite.fs.OpenFile("default/a") + suite.NoError(err) + n, err := file.Write([]byte{1}) + suite.NoError(err) + suite.Equal(1, n) + suite.NoError(file.Close()) + + err = suite.fs.Rename("default/a", "default/b") + suite.NoError(err) + + file, err = suite.fs.OpenFile("default/b") + suite.NoError(err) + buf := make([]byte, 10) + n, err = file.Read(buf) + suite.Equal(io.EOF, err) + suite.Equal(1, n) + suite.ElementsMatch(buf[:n], []byte{1}) +} + +func (suite *MinioFsTestSuite) TestMinioFsDeleteFile() { 
+ file, err := suite.fs.OpenFile("default/a") + suite.NoError(err) + n, err := file.Write([]byte{1}) + suite.NoError(err) + suite.Equal(1, n) + suite.NoError(file.Close()) + + err = suite.fs.DeleteFile("default/a") + suite.NoError(err) + + exist, err := suite.fs.Exist("default/a") + suite.NoError(err) + suite.False(exist) +} + +func (suite *MinioFsTestSuite) TestMinioFsList() { + file, err := suite.fs.OpenFile("default/a/b/c") + suite.NoError(err) + _, err = file.Write([]byte{1}) + suite.NoError(err) + suite.NoError(file.Close()) + + entries, err := suite.fs.List("default/a/") + suite.NoError(err) + suite.EqualValues([]fs.FileEntry{{Path: "default/a/b/c"}}, entries) + + suite.NoError(suite.fs.DeleteFile("default/a/b/c")) +} + +func (suite *MinioFsTestSuite) TestMinioFsReadFile() { + file, err := suite.fs.OpenFile("default/a") + suite.NoError(err) + n, err := file.Write([]byte{1}) + suite.NoError(err) + suite.Equal(1, n) + suite.NoError(file.Close()) + + content, err := suite.fs.ReadFile("default/a") + suite.NoError(err) + suite.EqualValues([]byte{1}, content) +} + +func (suite *MinioFsTestSuite) TestMinioFsExist() { + exist, err := suite.fs.Exist("default/nonexist") + suite.NoError(err) + suite.False(exist) + + file, err := suite.fs.OpenFile("default/exist") + suite.NoError(err) + n, err := file.Write([]byte{1}) + suite.NoError(err) + suite.Equal(1, n) + suite.NoError(file.Close()) + + exist, err = suite.fs.Exist("default/exist") + suite.NoError(err) + suite.True(exist) + + suite.NoError(suite.fs.DeleteFile("default/exist")) +} + +func TestMinioFsSuite(t *testing.T) { + suite.Run(t, &MinioFsTestSuite{}) +} diff --git a/internal/storagev2/packed/packed_reader.go b/internal/storagev2/packed/packed_reader.go new file mode 100644 index 0000000000000..e35acfc1b8c78 --- /dev/null +++ b/internal/storagev2/packed/packed_reader.go @@ -0,0 +1,85 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package packed + +/* +#cgo pkg-config:milvus_core + +#include <stdlib.h> +#include "segcore/packed_reader_c.h" +#include "segcore/arrow/c/abi.h" +#include "segcore/arrow/c/helpers.h" +*/ + +import "C" + +import ( + "errors" + "fmt" + "unsafe" + + "github.com/apache/arrow/go/v12/arrow" + "github.com/apache/arrow/go/v12/arrow/cdata" +) + +func NewPackedReader(path string, schema *arrow.Schema, bufferSize int) (*PackedReader, error) { + var cas cdata.CArrowSchema + cdata.ExportArrowSchema(schema, &cas) + cSchema := (*C.struct_ArrowSchema)(unsafe.Pointer(&cas)) + + cPath := C.CString(path) + defer C.free(unsafe.Pointer(cPath)) + + cBufferSize := C.int64_t(bufferSize) + + var cPackedReader C.CPackedReader + status := C.NewPackedReader(cPath, cSchema, cBufferSize, &cPackedReader) + if status != 0 { + return nil, errors.New(fmt.Sprintf("failed to new packed reader: %s, status: %d", path, status)) + } + return &PackedReader{cPackedReader: cPackedReader, schema: schema}, nil +} + +func (pr *PackedReader) ReadNext() (arrow.Record, error) { + var cArr C.CArrowArray + var cSchema C.CArrowSchema + status := C.ReadNext(pr.cPackedReader, &cArr, &cSchema) + if status != 0 { + return nil, fmt.Errorf("ReadNext failed with error code %d", status) + } + + if cArr == nil { + return nil, nil // end of stream, no more records to read + } + + // Convert ArrowArray to Go RecordBatch using cdata + goCArr := (*cdata.CArrowArray)(unsafe.Pointer(cArr)) + goCSchema := (*cdata.CArrowSchema)(unsafe.Pointer(cSchema)) + recordBatch, err := 
cdata.ImportCRecordBatch(goCArr, goCSchema) + if err != nil { + return nil, fmt.Errorf("failed to convert ArrowArray to Record: %w", err) + } + + // Return the RecordBatch as an arrow.Record + return recordBatch, nil +} + +func (pr *PackedReader) Close() error { + status := C.CloseReader(pr.cPackedReader) + if status != 0 { + return errors.New("PackedReader: failed to close file") + } + return nil +} diff --git a/internal/storagev2/packed/packed_test.go b/internal/storagev2/packed/packed_test.go new file mode 100644 index 0000000000000..592de2590cf25 --- /dev/null +++ b/internal/storagev2/packed/packed_test.go @@ -0,0 +1,146 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package packed + +import ( + "testing" + + "github.com/apache/arrow/go/v12/arrow" + "github.com/apache/arrow/go/v12/arrow/array" + "github.com/apache/arrow/go/v12/arrow/memory" + "github.com/stretchr/testify/assert" + "golang.org/x/exp/rand" +) + +func TestPackedOneFile(t *testing.T) { + batches := 100 + schema := arrow.NewSchema([]arrow.Field{ + {Name: "a", Type: arrow.PrimitiveTypes.Int32}, + {Name: "b", Type: arrow.PrimitiveTypes.Int64}, + {Name: "c", Type: arrow.BinaryTypes.String}, + }, nil) + + b := array.NewRecordBuilder(memory.DefaultAllocator, schema) + defer b.Release() + for idx := range schema.Fields() { + switch idx { + case 0: + b.Field(idx).(*array.Int32Builder).AppendValues( + []int32{int32(1), int32(2), int32(3)}, nil, + ) + case 1: + b.Field(idx).(*array.Int64Builder).AppendValues( + []int64{int64(4), int64(5), int64(6)}, nil, + ) + case 2: + b.Field(idx).(*array.StringBuilder).AppendValues( + []string{"a", "b", "c"}, nil, + ) + } + } + rec := b.NewRecord() + defer rec.Release() + path := "/tmp" + bufferSize := 10 * 1024 * 1024 // 10MB + pw, err := NewPackedWriter(path, schema, bufferSize) + assert.NoError(t, err) + for i := 0; i < batches; i++ { + err = pw.WriteRecordBatch(rec) + assert.NoError(t, err) + } + err = pw.Close() + assert.NoError(t, err) + + reader, err := NewPackedReader(path, schema, bufferSize) + assert.NoError(t, err) + rr, err := reader.ReadNext() + assert.NoError(t, err) + defer rr.Release() + assert.Equal(t, int64(3*batches), rr.NumRows()) +} + +func TestPackedMultiFiles(t *testing.T) { + batches := 1000 + schema := arrow.NewSchema([]arrow.Field{ + {Name: "a", Type: arrow.PrimitiveTypes.Int32}, + {Name: "b", Type: arrow.PrimitiveTypes.Int64}, + {Name: "c", Type: arrow.BinaryTypes.String}, + }, nil) + + b := array.NewRecordBuilder(memory.DefaultAllocator, schema) + strLen := 1000 + arrLen := 30 + defer b.Release() + for idx := range schema.Fields() { + switch idx { + case 0: + values := make([]int32, arrLen) + for i := 0; i < 
arrLen; i++ { + values[i] = int32(i + 1) + } + b.Field(idx).(*array.Int32Builder).AppendValues(values, nil) + case 1: + values := make([]int64, arrLen) + for i := 0; i < arrLen; i++ { + values[i] = int64(i + 1) + } + b.Field(idx).(*array.Int64Builder).AppendValues(values, nil) + case 2: + values := make([]string, arrLen) + for i := 0; i < arrLen; i++ { + values[i] = randomString(strLen) + } + b.Field(idx).(*array.StringBuilder).AppendValues(values, nil) + } + } + rec := b.NewRecord() + defer rec.Release() + path := "/tmp" + bufferSize := 10 * 1024 * 1024 // 10MB + pw, err := NewPackedWriter(path, schema, bufferSize) + assert.NoError(t, err) + for i := 0; i < batches; i++ { + err = pw.WriteRecordBatch(rec) + assert.NoError(t, err) + } + err = pw.Close() + assert.NoError(t, err) + + reader, err := NewPackedReader(path, schema, bufferSize) + assert.NoError(t, err) + var rows int64 = 0 + var rr arrow.Record + for { + rr, err = reader.ReadNext() + assert.NoError(t, err) + if rr == nil { + // end of file + break + } + + rows += rr.NumRows() + } + + assert.Equal(t, int64(arrLen*batches), rows) +} + +func randomString(length int) string { + const charset = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + result := make([]byte, length) + for i := range result { + result[i] = charset[rand.Intn(len(charset))] + } + return string(result) +} diff --git a/internal/storagev2/packed/packed_writer.go b/internal/storagev2/packed/packed_writer.go new file mode 100644 index 0000000000000..ed05e2a3c08e0 --- /dev/null +++ b/internal/storagev2/packed/packed_writer.go @@ -0,0 +1,77 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package packed + +/* +#cgo pkg-config: milvus_core + +#include <stdlib.h> +#include "segcore/packed_writer_c.h" +#include "segcore/arrow/c/abi.h" +#include "segcore/arrow/c/helpers.h" +*/ +import "C" + +import ( + "errors" + "fmt" + "unsafe" + + "github.com/apache/arrow/go/v12/arrow" + "github.com/apache/arrow/go/v12/arrow/cdata" +) + +func NewPackedWriter(path string, schema *arrow.Schema, bufferSize int) (*PackedWriter, error) { + var cas cdata.CArrowSchema + cdata.ExportArrowSchema(schema, &cas) + cSchema := (*C.struct_ArrowSchema)(unsafe.Pointer(&cas)) + + cPath := C.CString(path) + defer C.free(unsafe.Pointer(cPath)) + + cBufferSize := C.int64_t(bufferSize) + + var cPackedWriter C.CPackedWriter + status := C.NewPackedWriter(cPath, cSchema, cBufferSize, &cPackedWriter) + if status != 0 { + return nil, errors.New(fmt.Sprintf("failed to new packed writer: %s, status: %d", path, status)) + } + return &PackedWriter{cPackedWriter: cPackedWriter}, nil +} + +func (pw *PackedWriter) WriteRecordBatch(recordBatch arrow.Record) error { + var caa cdata.CArrowArray + var cas cdata.CArrowSchema + + cdata.ExportArrowRecordBatch(recordBatch, &caa, &cas) + + cArr := (*C.struct_ArrowArray)(unsafe.Pointer(&caa)) + cSchema := (*C.struct_ArrowSchema)(unsafe.Pointer(&cas)) + + status := C.WriteRecordBatch(pw.cPackedWriter, cArr, cSchema) + if status != 0 { + return errors.New("PackedWriter: failed to write record batch") + } + + return nil +} + +func (pw *PackedWriter) Close() error { + status := C.CloseWriter(pw.cPackedWriter) + if status != 0 { + return 
errors.New("PackedWriter: failed to close file") + } + return nil +} diff --git a/internal/storagev2/packed/type.go b/internal/storagev2/packed/type.go new file mode 100644 index 0000000000000..3301e1606e113 --- /dev/null +++ b/internal/storagev2/packed/type.go @@ -0,0 +1,46 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package packed + +/* +#include <stdlib.h> +#include "segcore/arrow/c/abi.h" +#include "segcore/arrow/c/helpers.h" +#include "segcore/packed_reader_c.h" +#include "segcore/packed_writer_c.h" +*/ +import "C" + +import ( + "github.com/apache/arrow/go/v12/arrow" + "github.com/apache/arrow/go/v12/arrow/cdata" +) + +type PackedWriter struct { + cPackedWriter C.CPackedWriter +} + +type PackedReader struct { + cPackedReader C.CPackedReader + arr *cdata.CArrowArray + schema *arrow.Schema +} + +type ( + // CArrowSchema is the C Data Interface for ArrowSchemas + CArrowSchema = C.struct_ArrowSchema + // CArrowArray is the C Data Interface object for Arrow Arrays as defined in abi.h + CArrowArray = C.struct_ArrowArray +) diff --git a/internal/storagev2/proto/manifest.proto b/internal/storagev2/proto/manifest.proto new file mode 100644 index 0000000000000..68b186368be3e --- /dev/null +++ b/internal/storagev2/proto/manifest.proto @@ -0,0 +1,27 @@ +syntax = "proto3"; +import "storage_schema.proto"; +package manifest_proto; +option go_package = "github.com/milvus-io/milvus/internal/storagev2/proto/manifest_proto"; + +message 
Options { string uri = 1; } + +message Manifest { + int64 version = 1; + Options options = 2; + schema_proto.Schema schema = 3; + repeated Fragment scalar_fragments = 4; + repeated Fragment vector_fragments = 5; + repeated Fragment delete_fragments = 6; + repeated Blob blobs = 7; +} + +message Fragment { + int64 id = 1; + repeated string files = 2; +} + +message Blob { + string name = 1; + int64 size = 2; + string file = 3; +} diff --git a/internal/storagev2/proto/manifest_proto/manifest.pb.go b/internal/storagev2/proto/manifest_proto/manifest.pb.go new file mode 100644 index 0000000000000..a6dda69ba4f52 --- /dev/null +++ b/internal/storagev2/proto/manifest_proto/manifest.pb.go @@ -0,0 +1,285 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// source: manifest.proto + +package manifest_proto + +import ( + fmt "fmt" + proto "github.com/golang/protobuf/proto" + schema_proto "github.com/milvus-io/milvus/internal/storagev2/proto/schema_proto" + math "math" +) + +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = fmt.Errorf +var _ = math.Inf + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the proto package it is being compiled against. +// A compilation error at this line likely means your copy of the +// proto package needs to be updated. 
+const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package + +type Options struct { + Uri string `protobuf:"bytes,1,opt,name=uri,proto3" json:"uri,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *Options) Reset() { *m = Options{} } +func (m *Options) String() string { return proto.CompactTextString(m) } +func (*Options) ProtoMessage() {} +func (*Options) Descriptor() ([]byte, []int) { + return fileDescriptor_0bb23f43f7afb4c1, []int{0} +} + +func (m *Options) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_Options.Unmarshal(m, b) +} +func (m *Options) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_Options.Marshal(b, m, deterministic) +} +func (m *Options) XXX_Merge(src proto.Message) { + xxx_messageInfo_Options.Merge(m, src) +} +func (m *Options) XXX_Size() int { + return xxx_messageInfo_Options.Size(m) +} +func (m *Options) XXX_DiscardUnknown() { + xxx_messageInfo_Options.DiscardUnknown(m) +} + +var xxx_messageInfo_Options proto.InternalMessageInfo + +func (m *Options) GetUri() string { + if m != nil { + return m.Uri + } + return "" +} + +type Manifest struct { + Version int64 `protobuf:"varint,1,opt,name=version,proto3" json:"version,omitempty"` + Options *Options `protobuf:"bytes,2,opt,name=options,proto3" json:"options,omitempty"` + Schema *schema_proto.Schema `protobuf:"bytes,3,opt,name=schema,proto3" json:"schema,omitempty"` + ScalarFragments []*Fragment `protobuf:"bytes,4,rep,name=scalar_fragments,json=scalarFragments,proto3" json:"scalar_fragments,omitempty"` + VectorFragments []*Fragment `protobuf:"bytes,5,rep,name=vector_fragments,json=vectorFragments,proto3" json:"vector_fragments,omitempty"` + DeleteFragments []*Fragment `protobuf:"bytes,6,rep,name=delete_fragments,json=deleteFragments,proto3" json:"delete_fragments,omitempty"` + Blobs []*Blob `protobuf:"bytes,7,rep,name=blobs,proto3" 
json:"blobs,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *Manifest) Reset() { *m = Manifest{} } +func (m *Manifest) String() string { return proto.CompactTextString(m) } +func (*Manifest) ProtoMessage() {} +func (*Manifest) Descriptor() ([]byte, []int) { + return fileDescriptor_0bb23f43f7afb4c1, []int{1} +} + +func (m *Manifest) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_Manifest.Unmarshal(m, b) +} +func (m *Manifest) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_Manifest.Marshal(b, m, deterministic) +} +func (m *Manifest) XXX_Merge(src proto.Message) { + xxx_messageInfo_Manifest.Merge(m, src) +} +func (m *Manifest) XXX_Size() int { + return xxx_messageInfo_Manifest.Size(m) +} +func (m *Manifest) XXX_DiscardUnknown() { + xxx_messageInfo_Manifest.DiscardUnknown(m) +} + +var xxx_messageInfo_Manifest proto.InternalMessageInfo + +func (m *Manifest) GetVersion() int64 { + if m != nil { + return m.Version + } + return 0 +} + +func (m *Manifest) GetOptions() *Options { + if m != nil { + return m.Options + } + return nil +} + +func (m *Manifest) GetSchema() *schema_proto.Schema { + if m != nil { + return m.Schema + } + return nil +} + +func (m *Manifest) GetScalarFragments() []*Fragment { + if m != nil { + return m.ScalarFragments + } + return nil +} + +func (m *Manifest) GetVectorFragments() []*Fragment { + if m != nil { + return m.VectorFragments + } + return nil +} + +func (m *Manifest) GetDeleteFragments() []*Fragment { + if m != nil { + return m.DeleteFragments + } + return nil +} + +func (m *Manifest) GetBlobs() []*Blob { + if m != nil { + return m.Blobs + } + return nil +} + +type Fragment struct { + Id int64 `protobuf:"varint,1,opt,name=id,proto3" json:"id,omitempty"` + Files []string `protobuf:"bytes,2,rep,name=files,proto3" json:"files,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte 
`json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *Fragment) Reset() { *m = Fragment{} } +func (m *Fragment) String() string { return proto.CompactTextString(m) } +func (*Fragment) ProtoMessage() {} +func (*Fragment) Descriptor() ([]byte, []int) { + return fileDescriptor_0bb23f43f7afb4c1, []int{2} +} + +func (m *Fragment) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_Fragment.Unmarshal(m, b) +} +func (m *Fragment) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_Fragment.Marshal(b, m, deterministic) +} +func (m *Fragment) XXX_Merge(src proto.Message) { + xxx_messageInfo_Fragment.Merge(m, src) +} +func (m *Fragment) XXX_Size() int { + return xxx_messageInfo_Fragment.Size(m) +} +func (m *Fragment) XXX_DiscardUnknown() { + xxx_messageInfo_Fragment.DiscardUnknown(m) +} + +var xxx_messageInfo_Fragment proto.InternalMessageInfo + +func (m *Fragment) GetId() int64 { + if m != nil { + return m.Id + } + return 0 +} + +func (m *Fragment) GetFiles() []string { + if m != nil { + return m.Files + } + return nil +} + +type Blob struct { + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + Size int64 `protobuf:"varint,2,opt,name=size,proto3" json:"size,omitempty"` + File string `protobuf:"bytes,3,opt,name=file,proto3" json:"file,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *Blob) Reset() { *m = Blob{} } +func (m *Blob) String() string { return proto.CompactTextString(m) } +func (*Blob) ProtoMessage() {} +func (*Blob) Descriptor() ([]byte, []int) { + return fileDescriptor_0bb23f43f7afb4c1, []int{3} +} + +func (m *Blob) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_Blob.Unmarshal(m, b) +} +func (m *Blob) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_Blob.Marshal(b, m, deterministic) +} +func (m *Blob) XXX_Merge(src proto.Message) { + 
xxx_messageInfo_Blob.Merge(m, src) +} +func (m *Blob) XXX_Size() int { + return xxx_messageInfo_Blob.Size(m) +} +func (m *Blob) XXX_DiscardUnknown() { + xxx_messageInfo_Blob.DiscardUnknown(m) +} + +var xxx_messageInfo_Blob proto.InternalMessageInfo + +func (m *Blob) GetName() string { + if m != nil { + return m.Name + } + return "" +} + +func (m *Blob) GetSize() int64 { + if m != nil { + return m.Size + } + return 0 +} + +func (m *Blob) GetFile() string { + if m != nil { + return m.File + } + return "" +} + +func init() { + proto.RegisterType((*Options)(nil), "manifest_proto.Options") + proto.RegisterType((*Manifest)(nil), "manifest_proto.Manifest") + proto.RegisterType((*Fragment)(nil), "manifest_proto.Fragment") + proto.RegisterType((*Blob)(nil), "manifest_proto.Blob") +} + +func init() { proto.RegisterFile("manifest.proto", fileDescriptor_0bb23f43f7afb4c1) } + +var fileDescriptor_0bb23f43f7afb4c1 = []byte{ + // 343 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x84, 0x91, 0xcf, 0x4f, 0xc2, 0x30, + 0x1c, 0xc5, 0x03, 0x05, 0x06, 0x5f, 0x0c, 0x92, 0x86, 0xc4, 0x46, 0x2f, 0x64, 0xa7, 0xc5, 0xe8, + 0xa6, 0x78, 0x34, 0x5e, 0x30, 0xf1, 0x66, 0x4c, 0xea, 0xcd, 0x0b, 0xe9, 0x46, 0x19, 0x4d, 0xb6, + 0x95, 0xac, 0x85, 0x83, 0x27, 0xff, 0x74, 0xd3, 0x1f, 0x53, 0xc6, 0x85, 0x13, 0xef, 0x95, 0xf7, + 0x3e, 0xdf, 0xae, 0x5f, 0x98, 0x94, 0xac, 0x12, 0x1b, 0xae, 0x74, 0xbc, 0xab, 0xa5, 0x96, 0xf8, + 0xcf, 0xaf, 0xac, 0xbf, 0xbe, 0x50, 0xd9, 0x96, 0x97, 0xcc, 0xfd, 0x1b, 0xde, 0x40, 0xf0, 0xb1, + 0xd3, 0x42, 0x56, 0x0a, 0x4f, 0x01, 0xed, 0x6b, 0x41, 0x3a, 0xf3, 0x4e, 0x34, 0xa2, 0x46, 0x86, + 0x3f, 0x08, 0x86, 0xef, 0xbe, 0x8d, 0x09, 0x04, 0x07, 0x5e, 0x2b, 0x21, 0x2b, 0x1b, 0x41, 0xb4, + 0xb1, 0xf8, 0x11, 0x02, 0xe9, 0x18, 0xa4, 0x3b, 0xef, 0x44, 0xe3, 0xc5, 0x55, 0xdc, 0x9e, 0x19, + 0xfb, 0x11, 0xb4, 0xc9, 0xe1, 0x3b, 0x18, 0xb8, 0x6b, 0x10, 0x64, 0x1b, 0xb3, 0xd8, 0x59, 0x9f, + 0xff, 0xb4, 0x86, 0xfa, 0x0c, 0x7e, 0x85, 
0xa9, 0xca, 0x58, 0xc1, 0xea, 0xd5, 0xa6, 0x66, 0x79, + 0xc9, 0x2b, 0xad, 0x48, 0x6f, 0x8e, 0xa2, 0xf1, 0x82, 0x9c, 0x4e, 0x7a, 0xf3, 0x01, 0x7a, 0xe9, + 0x1a, 0x8d, 0x57, 0x06, 0x72, 0xe0, 0x99, 0x96, 0xc7, 0x90, 0xfe, 0x39, 0x88, 0x6b, 0xb4, 0x20, + 0x6b, 0x5e, 0x70, 0xcd, 0x8f, 0x20, 0x83, 0x73, 0x10, 0xd7, 0xf8, 0x87, 0xdc, 0x42, 0x3f, 0x2d, + 0x64, 0xaa, 0x48, 0x60, 0x9b, 0xb3, 0xd3, 0xe6, 0xb2, 0x90, 0x29, 0x75, 0x91, 0xf0, 0x01, 0x86, + 0x4d, 0x11, 0x4f, 0xa0, 0x2b, 0xd6, 0xfe, 0xf1, 0xbb, 0x62, 0x8d, 0x67, 0xd0, 0xdf, 0x88, 0x82, + 0x9b, 0x57, 0x47, 0xd1, 0x88, 0x3a, 0x13, 0x2e, 0xa1, 0x67, 0x00, 0x18, 0x43, 0xaf, 0x62, 0x25, + 0xf7, 0xfb, 0xb4, 0xda, 0x9c, 0x29, 0xf1, 0xcd, 0xed, 0x9a, 0x10, 0xb5, 0xda, 0x9c, 0x99, 0xa2, + 0x5d, 0xc4, 0x88, 0x5a, 0xbd, 0x7c, 0xf9, 0x7a, 0xce, 0x85, 0xde, 0xee, 0xd3, 0x38, 0x93, 0x65, + 0x52, 0x8a, 0xe2, 0xb0, 0x57, 0xf7, 0x42, 0x36, 0x4a, 0x69, 0x59, 0xb3, 0x9c, 0x27, 0xb9, 0x4c, + 0xec, 0x8d, 0x93, 0xf6, 0x07, 0xa4, 0x03, 0xfb, 0xf3, 0xf4, 0x1b, 0x00, 0x00, 0xff, 0xff, 0xfc, + 0xe7, 0x01, 0xb7, 0x8b, 0x02, 0x00, 0x00, +} diff --git a/internal/storagev2/proto/schema_proto/storage_schema.pb.go b/internal/storagev2/proto/schema_proto/storage_schema.pb.go new file mode 100644 index 0000000000000..f205704574c02 --- /dev/null +++ b/internal/storagev2/proto/schema_proto/storage_schema.pb.go @@ -0,0 +1,795 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// source: storage_schema.proto + +package schema_proto + +import ( + fmt "fmt" + proto "github.com/golang/protobuf/proto" + math "math" +) + +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = fmt.Errorf +var _ = math.Inf + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the proto package it is being compiled against. +// A compilation error at this line likely means your copy of the +// proto package needs to be updated. 
+const _ = proto.ProtoPackageIsVersion3 // please upgrade the proto package + +type LogicType int32 + +const ( + LogicType_NA LogicType = 0 + LogicType_BOOL LogicType = 1 + LogicType_UINT8 LogicType = 2 + LogicType_INT8 LogicType = 3 + LogicType_UINT16 LogicType = 4 + LogicType_INT16 LogicType = 5 + LogicType_UINT32 LogicType = 6 + LogicType_INT32 LogicType = 7 + LogicType_UINT64 LogicType = 8 + LogicType_INT64 LogicType = 9 + LogicType_HALF_FLOAT LogicType = 10 + LogicType_FLOAT LogicType = 11 + LogicType_DOUBLE LogicType = 12 + LogicType_STRING LogicType = 13 + LogicType_BINARY LogicType = 14 + LogicType_FIXED_SIZE_BINARY LogicType = 15 + // DATE32 = 16; + // DATE64 = 17; + // TIMESTAMP = 18; + // TIME32 = 19; + // TIME64 = 20; + // INTERVAL_MONTHS = 21; + // INTERVAL_DAY_TIME = 22; + // DECIMAL128 = 23; + // option allow_alias = true; + // DECIMAL = 23; // DECIMAL==DECIMAL128 + // DECIMAL256 = 24; + LogicType_LIST LogicType = 25 + LogicType_STRUCT LogicType = 26 + // SPARSE_UNION = 27; + // DENSE_UNION = 28; + LogicType_DICTIONARY LogicType = 29 + LogicType_MAP LogicType = 30 + // EXTENSION = 31; + LogicType_FIXED_SIZE_LIST LogicType = 32 + // DURATION = 33; + // LARGE_STRING = 34; + // LARGE_BINARY = 35; + // LARGE_LIST = 36; + // INTERVAL_MONTH_DAY_NANO = 37; + // RUN_END_ENCODED = 38; + LogicType_MAX_ID LogicType = 39 +) + +var LogicType_name = map[int32]string{ + 0: "NA", + 1: "BOOL", + 2: "UINT8", + 3: "INT8", + 4: "UINT16", + 5: "INT16", + 6: "UINT32", + 7: "INT32", + 8: "UINT64", + 9: "INT64", + 10: "HALF_FLOAT", + 11: "FLOAT", + 12: "DOUBLE", + 13: "STRING", + 14: "BINARY", + 15: "FIXED_SIZE_BINARY", + 25: "LIST", + 26: "STRUCT", + 29: "DICTIONARY", + 30: "MAP", + 32: "FIXED_SIZE_LIST", + 39: "MAX_ID", +} + +var LogicType_value = map[string]int32{ + "NA": 0, + "BOOL": 1, + "UINT8": 2, + "INT8": 3, + "UINT16": 4, + "INT16": 5, + "UINT32": 6, + "INT32": 7, + "UINT64": 8, + "INT64": 9, + "HALF_FLOAT": 10, + "FLOAT": 11, + "DOUBLE": 12, + "STRING": 13, + 
"BINARY": 14, + "FIXED_SIZE_BINARY": 15, + "LIST": 25, + "STRUCT": 26, + "DICTIONARY": 29, + "MAP": 30, + "FIXED_SIZE_LIST": 32, + "MAX_ID": 39, +} + +func (x LogicType) String() string { + return proto.EnumName(LogicType_name, int32(x)) +} + +func (LogicType) EnumDescriptor() ([]byte, []int) { + return fileDescriptor_a8f8ddeefeee12bd, []int{0} +} + +type Endianness int32 + +const ( + Endianness_Little Endianness = 0 + Endianness_Big Endianness = 1 +) + +var Endianness_name = map[int32]string{ + 0: "Little", + 1: "Big", +} + +var Endianness_value = map[string]int32{ + "Little": 0, + "Big": 1, +} + +func (x Endianness) String() string { + return proto.EnumName(Endianness_name, int32(x)) +} + +func (Endianness) EnumDescriptor() ([]byte, []int) { + return fileDescriptor_a8f8ddeefeee12bd, []int{1} +} + +type FixedSizeBinaryType struct { + ByteWidth int32 `protobuf:"varint,1,opt,name=byte_width,json=byteWidth,proto3" json:"byte_width,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *FixedSizeBinaryType) Reset() { *m = FixedSizeBinaryType{} } +func (m *FixedSizeBinaryType) String() string { return proto.CompactTextString(m) } +func (*FixedSizeBinaryType) ProtoMessage() {} +func (*FixedSizeBinaryType) Descriptor() ([]byte, []int) { + return fileDescriptor_a8f8ddeefeee12bd, []int{0} +} + +func (m *FixedSizeBinaryType) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_FixedSizeBinaryType.Unmarshal(m, b) +} +func (m *FixedSizeBinaryType) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_FixedSizeBinaryType.Marshal(b, m, deterministic) +} +func (m *FixedSizeBinaryType) XXX_Merge(src proto.Message) { + xxx_messageInfo_FixedSizeBinaryType.Merge(m, src) +} +func (m *FixedSizeBinaryType) XXX_Size() int { + return xxx_messageInfo_FixedSizeBinaryType.Size(m) +} +func (m *FixedSizeBinaryType) XXX_DiscardUnknown() { + 
xxx_messageInfo_FixedSizeBinaryType.DiscardUnknown(m) +} + +var xxx_messageInfo_FixedSizeBinaryType proto.InternalMessageInfo + +func (m *FixedSizeBinaryType) GetByteWidth() int32 { + if m != nil { + return m.ByteWidth + } + return 0 +} + +type FixedSizeListType struct { + ListSize int32 `protobuf:"varint,1,opt,name=list_size,json=listSize,proto3" json:"list_size,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *FixedSizeListType) Reset() { *m = FixedSizeListType{} } +func (m *FixedSizeListType) String() string { return proto.CompactTextString(m) } +func (*FixedSizeListType) ProtoMessage() {} +func (*FixedSizeListType) Descriptor() ([]byte, []int) { + return fileDescriptor_a8f8ddeefeee12bd, []int{1} +} + +func (m *FixedSizeListType) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_FixedSizeListType.Unmarshal(m, b) +} +func (m *FixedSizeListType) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_FixedSizeListType.Marshal(b, m, deterministic) +} +func (m *FixedSizeListType) XXX_Merge(src proto.Message) { + xxx_messageInfo_FixedSizeListType.Merge(m, src) +} +func (m *FixedSizeListType) XXX_Size() int { + return xxx_messageInfo_FixedSizeListType.Size(m) +} +func (m *FixedSizeListType) XXX_DiscardUnknown() { + xxx_messageInfo_FixedSizeListType.DiscardUnknown(m) +} + +var xxx_messageInfo_FixedSizeListType proto.InternalMessageInfo + +func (m *FixedSizeListType) GetListSize() int32 { + if m != nil { + return m.ListSize + } + return 0 +} + +type DictionaryType struct { + IndexType *DataType `protobuf:"bytes,1,opt,name=index_type,json=indexType,proto3" json:"index_type,omitempty"` + ValueType *DataType `protobuf:"bytes,2,opt,name=value_type,json=valueType,proto3" json:"value_type,omitempty"` + Ordered bool `protobuf:"varint,3,opt,name=ordered,proto3" json:"ordered,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte 
`json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *DictionaryType) Reset() { *m = DictionaryType{} } +func (m *DictionaryType) String() string { return proto.CompactTextString(m) } +func (*DictionaryType) ProtoMessage() {} +func (*DictionaryType) Descriptor() ([]byte, []int) { + return fileDescriptor_a8f8ddeefeee12bd, []int{2} +} + +func (m *DictionaryType) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_DictionaryType.Unmarshal(m, b) +} +func (m *DictionaryType) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_DictionaryType.Marshal(b, m, deterministic) +} +func (m *DictionaryType) XXX_Merge(src proto.Message) { + xxx_messageInfo_DictionaryType.Merge(m, src) +} +func (m *DictionaryType) XXX_Size() int { + return xxx_messageInfo_DictionaryType.Size(m) +} +func (m *DictionaryType) XXX_DiscardUnknown() { + xxx_messageInfo_DictionaryType.DiscardUnknown(m) +} + +var xxx_messageInfo_DictionaryType proto.InternalMessageInfo + +func (m *DictionaryType) GetIndexType() *DataType { + if m != nil { + return m.IndexType + } + return nil +} + +func (m *DictionaryType) GetValueType() *DataType { + if m != nil { + return m.ValueType + } + return nil +} + +func (m *DictionaryType) GetOrdered() bool { + if m != nil { + return m.Ordered + } + return false +} + +type MapType struct { + KeysSorted bool `protobuf:"varint,1,opt,name=keys_sorted,json=keysSorted,proto3" json:"keys_sorted,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *MapType) Reset() { *m = MapType{} } +func (m *MapType) String() string { return proto.CompactTextString(m) } +func (*MapType) ProtoMessage() {} +func (*MapType) Descriptor() ([]byte, []int) { + return fileDescriptor_a8f8ddeefeee12bd, []int{3} +} + +func (m *MapType) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_MapType.Unmarshal(m, b) +} +func (m *MapType) XXX_Marshal(b []byte, deterministic bool) 
([]byte, error) { + return xxx_messageInfo_MapType.Marshal(b, m, deterministic) +} +func (m *MapType) XXX_Merge(src proto.Message) { + xxx_messageInfo_MapType.Merge(m, src) +} +func (m *MapType) XXX_Size() int { + return xxx_messageInfo_MapType.Size(m) +} +func (m *MapType) XXX_DiscardUnknown() { + xxx_messageInfo_MapType.DiscardUnknown(m) +} + +var xxx_messageInfo_MapType proto.InternalMessageInfo + +func (m *MapType) GetKeysSorted() bool { + if m != nil { + return m.KeysSorted + } + return false +} + +type DataType struct { + // Types that are valid to be assigned to TypeRelatedValues: + // + // *DataType_FixedSizeBinaryType + // *DataType_FixedSizeListType + // *DataType_DictionaryType + // *DataType_MapType + TypeRelatedValues isDataType_TypeRelatedValues `protobuf_oneof:"type_related_values"` + LogicType LogicType `protobuf:"varint,100,opt,name=logic_type,json=logicType,proto3,enum=schema_proto.LogicType" json:"logic_type,omitempty"` + Children []*Field `protobuf:"bytes,101,rep,name=children,proto3" json:"children,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *DataType) Reset() { *m = DataType{} } +func (m *DataType) String() string { return proto.CompactTextString(m) } +func (*DataType) ProtoMessage() {} +func (*DataType) Descriptor() ([]byte, []int) { + return fileDescriptor_a8f8ddeefeee12bd, []int{4} +} + +func (m *DataType) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_DataType.Unmarshal(m, b) +} +func (m *DataType) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_DataType.Marshal(b, m, deterministic) +} +func (m *DataType) XXX_Merge(src proto.Message) { + xxx_messageInfo_DataType.Merge(m, src) +} +func (m *DataType) XXX_Size() int { + return xxx_messageInfo_DataType.Size(m) +} +func (m *DataType) XXX_DiscardUnknown() { + xxx_messageInfo_DataType.DiscardUnknown(m) +} + +var xxx_messageInfo_DataType 
proto.InternalMessageInfo + +type isDataType_TypeRelatedValues interface { + isDataType_TypeRelatedValues() +} + +type DataType_FixedSizeBinaryType struct { + FixedSizeBinaryType *FixedSizeBinaryType `protobuf:"bytes,1,opt,name=fixed_size_binary_type,json=fixedSizeBinaryType,proto3,oneof"` +} + +type DataType_FixedSizeListType struct { + FixedSizeListType *FixedSizeListType `protobuf:"bytes,2,opt,name=fixed_size_list_type,json=fixedSizeListType,proto3,oneof"` +} + +type DataType_DictionaryType struct { + DictionaryType *DictionaryType `protobuf:"bytes,3,opt,name=dictionary_type,json=dictionaryType,proto3,oneof"` +} + +type DataType_MapType struct { + MapType *MapType `protobuf:"bytes,4,opt,name=map_type,json=mapType,proto3,oneof"` +} + +func (*DataType_FixedSizeBinaryType) isDataType_TypeRelatedValues() {} + +func (*DataType_FixedSizeListType) isDataType_TypeRelatedValues() {} + +func (*DataType_DictionaryType) isDataType_TypeRelatedValues() {} + +func (*DataType_MapType) isDataType_TypeRelatedValues() {} + +func (m *DataType) GetTypeRelatedValues() isDataType_TypeRelatedValues { + if m != nil { + return m.TypeRelatedValues + } + return nil +} + +func (m *DataType) GetFixedSizeBinaryType() *FixedSizeBinaryType { + if x, ok := m.GetTypeRelatedValues().(*DataType_FixedSizeBinaryType); ok { + return x.FixedSizeBinaryType + } + return nil +} + +func (m *DataType) GetFixedSizeListType() *FixedSizeListType { + if x, ok := m.GetTypeRelatedValues().(*DataType_FixedSizeListType); ok { + return x.FixedSizeListType + } + return nil +} + +func (m *DataType) GetDictionaryType() *DictionaryType { + if x, ok := m.GetTypeRelatedValues().(*DataType_DictionaryType); ok { + return x.DictionaryType + } + return nil +} + +func (m *DataType) GetMapType() *MapType { + if x, ok := m.GetTypeRelatedValues().(*DataType_MapType); ok { + return x.MapType + } + return nil +} + +func (m *DataType) GetLogicType() LogicType { + if m != nil { + return m.LogicType + } + return LogicType_NA +} + 
+func (m *DataType) GetChildren() []*Field { + if m != nil { + return m.Children + } + return nil +} + +// XXX_OneofWrappers is for the internal use of the proto package. +func (*DataType) XXX_OneofWrappers() []interface{} { + return []interface{}{ + (*DataType_FixedSizeBinaryType)(nil), + (*DataType_FixedSizeListType)(nil), + (*DataType_DictionaryType)(nil), + (*DataType_MapType)(nil), + } +} + +type KeyValueMetadata struct { + Keys []string `protobuf:"bytes,1,rep,name=keys,proto3" json:"keys,omitempty"` + Values []string `protobuf:"bytes,2,rep,name=values,proto3" json:"values,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *KeyValueMetadata) Reset() { *m = KeyValueMetadata{} } +func (m *KeyValueMetadata) String() string { return proto.CompactTextString(m) } +func (*KeyValueMetadata) ProtoMessage() {} +func (*KeyValueMetadata) Descriptor() ([]byte, []int) { + return fileDescriptor_a8f8ddeefeee12bd, []int{5} +} + +func (m *KeyValueMetadata) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_KeyValueMetadata.Unmarshal(m, b) +} +func (m *KeyValueMetadata) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_KeyValueMetadata.Marshal(b, m, deterministic) +} +func (m *KeyValueMetadata) XXX_Merge(src proto.Message) { + xxx_messageInfo_KeyValueMetadata.Merge(m, src) +} +func (m *KeyValueMetadata) XXX_Size() int { + return xxx_messageInfo_KeyValueMetadata.Size(m) +} +func (m *KeyValueMetadata) XXX_DiscardUnknown() { + xxx_messageInfo_KeyValueMetadata.DiscardUnknown(m) +} + +var xxx_messageInfo_KeyValueMetadata proto.InternalMessageInfo + +func (m *KeyValueMetadata) GetKeys() []string { + if m != nil { + return m.Keys + } + return nil +} + +func (m *KeyValueMetadata) GetValues() []string { + if m != nil { + return m.Values + } + return nil +} + +type Field struct { + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + 
Nullable bool `protobuf:"varint,2,opt,name=nullable,proto3" json:"nullable,omitempty"` + DataType *DataType `protobuf:"bytes,3,opt,name=data_type,json=dataType,proto3" json:"data_type,omitempty"` + Metadata *KeyValueMetadata `protobuf:"bytes,4,opt,name=metadata,proto3" json:"metadata,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *Field) Reset() { *m = Field{} } +func (m *Field) String() string { return proto.CompactTextString(m) } +func (*Field) ProtoMessage() {} +func (*Field) Descriptor() ([]byte, []int) { + return fileDescriptor_a8f8ddeefeee12bd, []int{6} +} + +func (m *Field) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_Field.Unmarshal(m, b) +} +func (m *Field) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_Field.Marshal(b, m, deterministic) +} +func (m *Field) XXX_Merge(src proto.Message) { + xxx_messageInfo_Field.Merge(m, src) +} +func (m *Field) XXX_Size() int { + return xxx_messageInfo_Field.Size(m) +} +func (m *Field) XXX_DiscardUnknown() { + xxx_messageInfo_Field.DiscardUnknown(m) +} + +var xxx_messageInfo_Field proto.InternalMessageInfo + +func (m *Field) GetName() string { + if m != nil { + return m.Name + } + return "" +} + +func (m *Field) GetNullable() bool { + if m != nil { + return m.Nullable + } + return false +} + +func (m *Field) GetDataType() *DataType { + if m != nil { + return m.DataType + } + return nil +} + +func (m *Field) GetMetadata() *KeyValueMetadata { + if m != nil { + return m.Metadata + } + return nil +} + +type SchemaOptions struct { + PrimaryColumn string `protobuf:"bytes,1,opt,name=primary_column,json=primaryColumn,proto3" json:"primary_column,omitempty"` + VersionColumn string `protobuf:"bytes,2,opt,name=version_column,json=versionColumn,proto3" json:"version_column,omitempty"` + VectorColumn string `protobuf:"bytes,3,opt,name=vector_column,json=vectorColumn,proto3" 
json:"vector_column,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *SchemaOptions) Reset() { *m = SchemaOptions{} } +func (m *SchemaOptions) String() string { return proto.CompactTextString(m) } +func (*SchemaOptions) ProtoMessage() {} +func (*SchemaOptions) Descriptor() ([]byte, []int) { + return fileDescriptor_a8f8ddeefeee12bd, []int{7} +} + +func (m *SchemaOptions) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_SchemaOptions.Unmarshal(m, b) +} +func (m *SchemaOptions) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_SchemaOptions.Marshal(b, m, deterministic) +} +func (m *SchemaOptions) XXX_Merge(src proto.Message) { + xxx_messageInfo_SchemaOptions.Merge(m, src) +} +func (m *SchemaOptions) XXX_Size() int { + return xxx_messageInfo_SchemaOptions.Size(m) +} +func (m *SchemaOptions) XXX_DiscardUnknown() { + xxx_messageInfo_SchemaOptions.DiscardUnknown(m) +} + +var xxx_messageInfo_SchemaOptions proto.InternalMessageInfo + +func (m *SchemaOptions) GetPrimaryColumn() string { + if m != nil { + return m.PrimaryColumn + } + return "" +} + +func (m *SchemaOptions) GetVersionColumn() string { + if m != nil { + return m.VersionColumn + } + return "" +} + +func (m *SchemaOptions) GetVectorColumn() string { + if m != nil { + return m.VectorColumn + } + return "" +} + +type ArrowSchema struct { + Fields []*Field `protobuf:"bytes,1,rep,name=fields,proto3" json:"fields,omitempty"` + Endianness Endianness `protobuf:"varint,2,opt,name=endianness,proto3,enum=schema_proto.Endianness" json:"endianness,omitempty"` + Metadata *KeyValueMetadata `protobuf:"bytes,3,opt,name=metadata,proto3" json:"metadata,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *ArrowSchema) Reset() { *m = ArrowSchema{} } +func (m *ArrowSchema) String() string { return 
proto.CompactTextString(m) } +func (*ArrowSchema) ProtoMessage() {} +func (*ArrowSchema) Descriptor() ([]byte, []int) { + return fileDescriptor_a8f8ddeefeee12bd, []int{8} +} + +func (m *ArrowSchema) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_ArrowSchema.Unmarshal(m, b) +} +func (m *ArrowSchema) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return xxx_messageInfo_ArrowSchema.Marshal(b, m, deterministic) +} +func (m *ArrowSchema) XXX_Merge(src proto.Message) { + xxx_messageInfo_ArrowSchema.Merge(m, src) +} +func (m *ArrowSchema) XXX_Size() int { + return xxx_messageInfo_ArrowSchema.Size(m) +} +func (m *ArrowSchema) XXX_DiscardUnknown() { + xxx_messageInfo_ArrowSchema.DiscardUnknown(m) +} + +var xxx_messageInfo_ArrowSchema proto.InternalMessageInfo + +func (m *ArrowSchema) GetFields() []*Field { + if m != nil { + return m.Fields + } + return nil +} + +func (m *ArrowSchema) GetEndianness() Endianness { + if m != nil { + return m.Endianness + } + return Endianness_Little +} + +func (m *ArrowSchema) GetMetadata() *KeyValueMetadata { + if m != nil { + return m.Metadata + } + return nil +} + +type Schema struct { + ArrowSchema *ArrowSchema `protobuf:"bytes,1,opt,name=arrow_schema,json=arrowSchema,proto3" json:"arrow_schema,omitempty"` + SchemaOptions *SchemaOptions `protobuf:"bytes,2,opt,name=schema_options,json=schemaOptions,proto3" json:"schema_options,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *Schema) Reset() { *m = Schema{} } +func (m *Schema) String() string { return proto.CompactTextString(m) } +func (*Schema) ProtoMessage() {} +func (*Schema) Descriptor() ([]byte, []int) { + return fileDescriptor_a8f8ddeefeee12bd, []int{9} +} + +func (m *Schema) XXX_Unmarshal(b []byte) error { + return xxx_messageInfo_Schema.Unmarshal(m, b) +} +func (m *Schema) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + return 
xxx_messageInfo_Schema.Marshal(b, m, deterministic) +} +func (m *Schema) XXX_Merge(src proto.Message) { + xxx_messageInfo_Schema.Merge(m, src) +} +func (m *Schema) XXX_Size() int { + return xxx_messageInfo_Schema.Size(m) +} +func (m *Schema) XXX_DiscardUnknown() { + xxx_messageInfo_Schema.DiscardUnknown(m) +} + +var xxx_messageInfo_Schema proto.InternalMessageInfo + +func (m *Schema) GetArrowSchema() *ArrowSchema { + if m != nil { + return m.ArrowSchema + } + return nil +} + +func (m *Schema) GetSchemaOptions() *SchemaOptions { + if m != nil { + return m.SchemaOptions + } + return nil +} + +func init() { + proto.RegisterEnum("schema_proto.LogicType", LogicType_name, LogicType_value) + proto.RegisterEnum("schema_proto.Endianness", Endianness_name, Endianness_value) + proto.RegisterType((*FixedSizeBinaryType)(nil), "schema_proto.FixedSizeBinaryType") + proto.RegisterType((*FixedSizeListType)(nil), "schema_proto.FixedSizeListType") + proto.RegisterType((*DictionaryType)(nil), "schema_proto.DictionaryType") + proto.RegisterType((*MapType)(nil), "schema_proto.MapType") + proto.RegisterType((*DataType)(nil), "schema_proto.DataType") + proto.RegisterType((*KeyValueMetadata)(nil), "schema_proto.KeyValueMetadata") + proto.RegisterType((*Field)(nil), "schema_proto.Field") + proto.RegisterType((*SchemaOptions)(nil), "schema_proto.SchemaOptions") + proto.RegisterType((*ArrowSchema)(nil), "schema_proto.ArrowSchema") + proto.RegisterType((*Schema)(nil), "schema_proto.Schema") +} + +func init() { proto.RegisterFile("storage_schema.proto", fileDescriptor_a8f8ddeefeee12bd) } + +var fileDescriptor_a8f8ddeefeee12bd = []byte{ + // 905 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x94, 0x55, 0x5d, 0x6f, 0x1a, 0x47, + 0x14, 0x65, 0xc1, 0x86, 0xe5, 0x62, 0xf0, 0x78, 0x88, 0x5d, 0x92, 0x34, 0x09, 0xd9, 0xaa, 0x2a, + 0x72, 0x55, 0xd3, 0x62, 0xd7, 0x4a, 0x3f, 0x54, 0x09, 0x8c, 0x1d, 0x56, 0xc5, 0xa6, 0x1a, 0x70, + 0xeb, 0xe6, 0x65, 
0xb5, 0xb0, 0x63, 0x3c, 0xea, 0x7e, 0xa0, 0xdd, 0xc5, 0x09, 0x79, 0xec, 0x63, + 0x7f, 0x41, 0x5f, 0xfa, 0xde, 0xc7, 0xbe, 0xf4, 0xff, 0x55, 0xf3, 0xb1, 0x74, 0xd7, 0x8d, 0x2b, + 0xe5, 0xc9, 0x77, 0xce, 0x9c, 0x73, 0xe7, 0xce, 0x39, 0xb3, 0x06, 0x1e, 0x44, 0x71, 0x10, 0xda, + 0x73, 0x6a, 0x45, 0xb3, 0x1b, 0xea, 0xd9, 0x07, 0x8b, 0x30, 0x88, 0x03, 0xbc, 0x25, 0x57, 0x96, + 0x58, 0x19, 0x47, 0x50, 0x3f, 0x63, 0x6f, 0xa8, 0x33, 0x66, 0x6f, 0x69, 0x8f, 0xf9, 0x76, 0xb8, + 0x9a, 0xac, 0x16, 0x14, 0x3f, 0x01, 0x98, 0xae, 0x62, 0x6a, 0xbd, 0x66, 0x4e, 0x7c, 0xd3, 0xd0, + 0x9a, 0x5a, 0x6b, 0x93, 0x94, 0x39, 0xf2, 0x13, 0x07, 0x8c, 0xcf, 0x61, 0x67, 0xad, 0x1a, 0xb2, + 0x28, 0x16, 0x9a, 0xc7, 0x50, 0x76, 0x59, 0x14, 0x5b, 0x11, 0x7b, 0x4b, 0x95, 0x44, 0xe7, 0x00, + 0x27, 0x19, 0xbf, 0x6b, 0x50, 0xeb, 0xb3, 0x59, 0xcc, 0x82, 0xf5, 0x19, 0x5f, 0x02, 0x30, 0xdf, + 0xa1, 0x6f, 0xac, 0x78, 0xb5, 0x90, 0x82, 0x4a, 0x67, 0xef, 0x20, 0x3d, 0xdd, 0x41, 0xdf, 0x8e, + 0x6d, 0xce, 0x25, 0x65, 0xc1, 0x4c, 0x64, 0xb7, 0xb6, 0xbb, 0xa4, 0x52, 0x96, 0xff, 0x7f, 0x99, + 0x60, 0x0a, 0x59, 0x03, 0x4a, 0x41, 0xe8, 0xd0, 0x90, 0x3a, 0x8d, 0x42, 0x53, 0x6b, 0xe9, 0x24, + 0x59, 0x1a, 0xfb, 0x50, 0x3a, 0xb7, 0x17, 0x82, 0xf4, 0x0c, 0x2a, 0xbf, 0xd0, 0x55, 0x64, 0x45, + 0x41, 0x18, 0x53, 0x47, 0xcc, 0xa4, 0x13, 0xe0, 0xd0, 0x58, 0x20, 0xc6, 0xdf, 0x05, 0xd0, 0x93, + 0xee, 0xf8, 0x0a, 0xf6, 0xae, 0xb9, 0x0b, 0xe2, 0xc6, 0xd6, 0x54, 0xb8, 0x97, 0xbe, 0xcc, 0xf3, + 0xec, 0x54, 0xef, 0xf0, 0x79, 0x90, 0x23, 0xf5, 0xeb, 0x77, 0xd8, 0x4f, 0xe0, 0x41, 0xaa, 0xb3, + 0x70, 0x35, 0x75, 0xdb, 0x67, 0xf7, 0xf4, 0x4d, 0x92, 0x18, 0xe4, 0xc8, 0xce, 0xf5, 0x7f, 0xe2, + 0x79, 0x09, 0xdb, 0xce, 0x3a, 0x00, 0xd9, 0xae, 0x20, 0xda, 0x7d, 0x78, 0xc7, 0xbc, 0x4c, 0x4a, + 0x83, 0x1c, 0xa9, 0x39, 0xd9, 0xdc, 0x3a, 0xa0, 0x7b, 0xf6, 0x42, 0x76, 0xd8, 0x10, 0x1d, 0x76, + 0xb3, 0x1d, 0x94, 0x9b, 0x83, 0x1c, 0x29, 0x79, 0xca, 0xd8, 0x63, 0x00, 0x37, 0x98, 0xb3, 0x99, + 0x54, 0x39, 0x4d, 0xad, 0x55, 0xeb, 0x7c, 0x90, 0x55, 
0x0d, 0xf9, 0xbe, 0x4c, 0xcd, 0x4d, 0x4a, + 0xdc, 0x06, 0x7d, 0x76, 0xc3, 0x5c, 0x27, 0xa4, 0x7e, 0x83, 0x36, 0x0b, 0xad, 0x4a, 0xa7, 0x7e, + 0xf7, 0xf2, 0xd4, 0x75, 0xc8, 0x9a, 0xd4, 0xdb, 0x85, 0x3a, 0x3f, 0xc2, 0x0a, 0xa9, 0x6b, 0xc7, + 0xd4, 0xb1, 0xc4, 0x03, 0x88, 0x8c, 0xef, 0x00, 0x7d, 0x4f, 0x57, 0x3f, 0xf2, 0xc5, 0x39, 0x8d, + 0x6d, 0xc7, 0x8e, 0x6d, 0x8c, 0x61, 0x83, 0x27, 0xdb, 0xd0, 0x9a, 0x85, 0x56, 0x99, 0x88, 0x1a, + 0xef, 0x41, 0x51, 0x2a, 0x1a, 0x79, 0x81, 0xaa, 0x95, 0xf1, 0xa7, 0x06, 0x9b, 0xe2, 0x28, 0xae, + 0xf2, 0x6d, 0x4f, 0x46, 0x5c, 0x26, 0xa2, 0xc6, 0x8f, 0x40, 0xf7, 0x97, 0xae, 0x6b, 0x4f, 0x5d, + 0x19, 0x91, 0x4e, 0xd6, 0x6b, 0x7c, 0x08, 0x65, 0x7e, 0x5a, 0xda, 0xf0, 0xfb, 0x5e, 0xab, 0xee, + 0x24, 0x2f, 0xeb, 0x6b, 0xd0, 0x3d, 0x35, 0xa6, 0xb2, 0xf8, 0x69, 0x56, 0x73, 0xf7, 0x32, 0x64, + 0xcd, 0x37, 0x7e, 0xd5, 0xa0, 0x3a, 0x16, 0xdc, 0xd1, 0x82, 0xc7, 0x16, 0xe1, 0x8f, 0xa1, 0xb6, + 0x08, 0x99, 0xc7, 0x63, 0x9f, 0x05, 0xee, 0xd2, 0xf3, 0xd5, 0xf0, 0x55, 0x85, 0x9e, 0x08, 0x90, + 0xd3, 0x6e, 0x69, 0x18, 0xb1, 0xc0, 0x4f, 0x68, 0x79, 0x49, 0x53, 0xa8, 0xa2, 0x7d, 0x04, 0xd5, + 0x5b, 0x3a, 0x8b, 0x83, 0x30, 0x61, 0x15, 0x04, 0x6b, 0x4b, 0x82, 0x92, 0x64, 0xfc, 0xa5, 0x41, + 0xa5, 0x1b, 0x86, 0xc1, 0x6b, 0x39, 0x09, 0xfe, 0x14, 0x8a, 0xd7, 0xdc, 0x3e, 0xe9, 0xf6, 0x3d, + 0x29, 0x2a, 0x0a, 0x7e, 0x01, 0x40, 0x7d, 0x87, 0xd9, 0xbe, 0x4f, 0xa3, 0x48, 0x0c, 0x51, 0xeb, + 0x34, 0xb2, 0x82, 0xd3, 0xf5, 0x3e, 0x49, 0x71, 0x33, 0xbe, 0x15, 0xde, 0xd3, 0xb7, 0xdf, 0x34, + 0x28, 0xaa, 0x69, 0xbf, 0x85, 0x2d, 0x9b, 0x0f, 0xaf, 0xfe, 0x71, 0xaa, 0xcf, 0xf9, 0x61, 0xb6, + 0x55, 0xea, 0x7a, 0xa4, 0x62, 0xa7, 0xee, 0xda, 0x83, 0x9a, 0x22, 0x06, 0x32, 0x00, 0xf5, 0xd9, + 0x3e, 0xce, 0xea, 0x33, 0x19, 0x91, 0x6a, 0x94, 0x5e, 0xee, 0xff, 0x91, 0x87, 0xf2, 0xfa, 0x83, + 0xc0, 0x45, 0xc8, 0x5f, 0x74, 0x51, 0x0e, 0xeb, 0xb0, 0xd1, 0x1b, 0x8d, 0x86, 0x48, 0xc3, 0x65, + 0xd8, 0xbc, 0x34, 0x2f, 0x26, 0x2f, 0x50, 0x9e, 0x83, 0xa2, 0x2a, 0x60, 0x80, 0x22, 0x07, 
syntax = "proto3";

package schema_proto;
option go_package = "github.com/milvus-io/milvus/internal/storagev2/proto/schema_proto";

// LogicType tags the logical type carried by a DataType.
// The numbering is sparse: entries that are commented out below are not part
// of the enum, and the active entries skip over their numbers.
// NOTE(review): names and numbers look like Apache Arrow type ids - confirm
// before adding or renumbering entries.
enum LogicType {
  NA = 0;
  BOOL = 1;
  UINT8 = 2;
  INT8 = 3;
  UINT16 = 4;
  INT16 = 5;
  UINT32 = 6;
  INT32 = 7;
  UINT64 = 8;
  INT64 = 9;
  HALF_FLOAT = 10;
  FLOAT = 11;
  DOUBLE = 12;
  STRING = 13;
  BINARY = 14;
  FIXED_SIZE_BINARY = 15;
  // DATE32 = 16;
  // DATE64 = 17;
  // TIMESTAMP = 18;
  // TIME32 = 19;
  // TIME64 = 20;
  // INTERVAL_MONTHS = 21;
  // INTERVAL_DAY_TIME = 22;
  // DECIMAL128 = 23;
  // option allow_alias = true;
  // DECIMAL = 23; // DECIMAL==DECIMAL128
  // DECIMAL256 = 24;
  LIST = 25;
  STRUCT = 26;
  // SPARSE_UNION = 27;
  // DENSE_UNION = 28;
  DICTIONARY = 29;
  MAP = 30;
  // EXTENSION = 31;
  FIXED_SIZE_LIST = 32;
  // DURATION = 33;
  // LARGE_STRING = 34;
  // LARGE_BINARY = 35;
  // LARGE_LIST = 36;
  // INTERVAL_MONTH_DAY_NANO = 37;
  // RUN_END_ENCODED = 38;
  MAX_ID = 39;
}

// Endianness is the byte order recorded on an ArrowSchema.
enum Endianness {
  Little = 0;
  Big = 1;
}

// FixedSizeBinaryType carries the extra parameter of a fixed-size binary type.
message FixedSizeBinaryType { int32 byte_width = 1; }

// FixedSizeListType carries the extra parameter of a fixed-size list type.
message FixedSizeListType { int32 list_size = 1; }

// DictionaryType carries the index type, value type and ordering flag of a
// dictionary type.
message DictionaryType {
  DataType index_type = 1;
  DataType value_type = 2;
  bool ordered = 3;
}

// MapType carries the extra parameter of a map type.
message MapType { bool keys_sorted = 1; }

// DataType describes one type: its LogicType tag, at most one of the
// type-specific parameter messages, and any child fields.
message DataType {
  // At most one of these is set; each corresponds to one parameterised type.
  oneof type_related_values {
    FixedSizeBinaryType fixed_size_binary_type = 1;
    FixedSizeListType fixed_size_list_type = 2;
    DictionaryType dictionary_type = 3;
    MapType map_type = 4;
  }
  LogicType logic_type = 100;
  repeated Field children = 101;
}

// KeyValueMetadata stores metadata as two repeated string lists.
// NOTE(review): presumably keys[i] pairs with values[i] - confirm with the
// code that encodes/decodes this message.
message KeyValueMetadata {
  repeated string keys = 1;
  repeated string values = 2;
}

// Field is a named, optionally nullable column with a type and metadata.
message Field {
  string name = 1;
  bool nullable = 2;
  DataType data_type = 3;
  KeyValueMetadata metadata = 4;
}

// SchemaOptions names the primary, version and vector columns of a schema.
message SchemaOptions {
  string primary_column = 1;
  string version_column = 2;
  string vector_column = 3;
}

// ArrowSchema is a list of fields plus endianness and schema-level metadata.
message ArrowSchema {
  repeated Field fields = 1;
  Endianness endianness = 2;
  KeyValueMetadata metadata = 3;
}

// Schema pairs an ArrowSchema with its SchemaOptions.
message Schema {
  ArrowSchema arrow_schema = 1;
  SchemaOptions schema_options = 2;
}
// DeleteReader bundles a source record reader with the schema options,
// delete fragments and read options passed to NewDeleteReader.
//
// Every method below is a placeholder that panics with "implement me";
// none of them reads from recordReader yet.
type DeleteReader struct {
	recordReader    array.RecordReader
	schemaOptions   *schema.SchemaOptions
	deleteFragments fragment.DeleteFragmentVector
	options         *options.ReadOptions
}

// Retain is not implemented and always panics.
func (d DeleteReader) Retain() {
	// TODO implement me
	panic("implement me")
}

// Release is not implemented and always panics.
func (d DeleteReader) Release() {
	// TODO implement me
	panic("implement me")
}

// Schema is not implemented and always panics.
func (d DeleteReader) Schema() *arrow.Schema {
	// TODO implement me
	panic("implement me")
}

// Next is not implemented and always panics.
func (d DeleteReader) Next() bool {
	// TODO implement me
	panic("implement me")
}

// Record is not implemented and always panics.
func (d DeleteReader) Record() arrow.Record {
	// TODO implement me
	panic("implement me")
}

// Err is not implemented and always panics.
func (d DeleteReader) Err() error {
	// TODO implement me
	panic("implement me")
}

// NewDeleteReader stores its four arguments in a new DeleteReader and returns
// a pointer to it. No argument is validated or used at construction time.
func NewDeleteReader(recordReader array.RecordReader, schemaOptions *schema.SchemaOptions, deleteFragments fragment.DeleteFragmentVector, options *options.ReadOptions) *DeleteReader {
	return &DeleteReader{recordReader: recordReader, schemaOptions: schemaOptions, deleteFragments: deleteFragments, options: options}
}
// FilterReader holds a source record reader and the read options given to
// MakeFilterReader. currentFilteredBatchReader is only referenced by the
// commented-out sketch inside Next.
//
// Apart from Next, every method is a placeholder that panics with
// "implement me".
type FilterReader struct {
	recordReader               array.RecordReader
	option                     *options.ReadOptions
	currentFilteredBatchReader array.RecordReader
}

// Retain is not implemented and always panics.
func (r *FilterReader) Retain() {
	// TODO implement me
	panic("implement me")
}

// Release is not implemented and always panics.
func (r *FilterReader) Release() {
	// TODO implement me
	panic("implement me")
}

// Schema is not implemented and always panics.
func (r *FilterReader) Schema() *arrow.Schema {
	// TODO implement me
	panic("implement me")
}

// Record is not implemented and always panics.
func (r *FilterReader) Record() arrow.Record {
	// TODO implement me
	panic("implement me")
}

// Err is not implemented and always panics.
func (r *FilterReader) Err() error {
	// TODO implement me
	panic("implement me")
}

// MakeFilterReader returns a FilterReader that stores recordReader and option.
// currentFilteredBatchReader is left nil.
func MakeFilterReader(recordReader array.RecordReader, option *options.ReadOptions) *FilterReader {
	return &FilterReader{
		recordReader: recordReader,
		option:       option,
	}
}

// Next currently always returns false, i.e. the reader yields no records.
// The commented-out loop below is an unfinished sketch of the intended
// iteration; it would not compile as written (it mixes bool, nil and
// two-value returns).
func (r *FilterReader) Next() bool {
	//for {
	//	if r.currentFilteredBatchReader != nil {
	//		filteredBatch := r.currentFilteredBatchReader.Next()
	//		if err != nil {
	//			return false
	//		}
	//		if filteredBatch == nil {
	//			r.currentFilteredBatchReader = nil
	//			continue
	//		}
	//		return filteredBatch, nil
	//	}
	//	err := r.NextFilteredBatchReader()
	//	if err != nil {
	//		return nil
	//	}
	//	if r.currentFilteredBatchReader == nil {
	//		return nil
	//	}
	//}
	return false
}
+ +package common_reader + +import ( + "github.com/apache/arrow/go/v12/arrow" + "github.com/apache/arrow/go/v12/arrow/array" + "github.com/milvus-io/milvus/internal/storagev2/common/utils" + "github.com/milvus-io/milvus/internal/storagev2/storage/options" +) + +type ProjectionReader struct { + array.RecordReader + reader array.RecordReader + options *options.ReadOptions + schema *arrow.Schema +} + +func NewProjectionReader(reader array.RecordReader, options *options.ReadOptions, schema *arrow.Schema) array.RecordReader { + projectionSchema := utils.ProjectSchema(schema, options.Columns) + return &ProjectionReader{reader: reader, options: options, schema: projectionSchema} +} diff --git a/internal/storagev2/reader/record_reader/filter_query_record.go b/internal/storagev2/reader/record_reader/filter_query_record.go new file mode 100644 index 0000000000000..35d006c5028be --- /dev/null +++ b/internal/storagev2/reader/record_reader/filter_query_record.go @@ -0,0 +1,48 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
// FilterQueryRecordReader declares the state for a reader over scalar,
// vector and delete fragments. No methods are defined on it yet and nothing
// visible here constructs it.
type FilterQueryRecordReader struct {
	// TODO implement me
	ref             int64
	schema          *schema.Schema
	options         *options.ReadOptions
	fs              fs.Fs
	scalarFragment  fragment.FragmentVector
	vectorFragment  fragment.FragmentVector
	deleteFragments fragment.DeleteFragmentVector
	record          arrow.Record
}

// NewFilterQueryReader is a placeholder: it ignores all of its arguments and
// always panics with "implement me".
func NewFilterQueryReader(
	s *schema.Schema,
	options *options.ReadOptions,
	f fs.Fs,
	scalarFragment fragment.FragmentVector,
	vectorFragment fragment.FragmentVector,
	deleteFragments fragment.DeleteFragmentVector,
) array.RecordReader {
	// TODO implement me
	panic("implement me")
}
// MergeRecordReader declares the state for a reader over scalar, vector and
// delete fragments. Every method, and the constructor, is a placeholder that
// panics with "implement me".
//
// NOTE(review): the methods use value receivers, so each call works on a copy
// of the struct (including ref). Revisit when the methods are implemented.
type MergeRecordReader struct {
	ref             int64
	schema          *schema.Schema
	options         *options.ReadOptions
	fs              fs.Fs
	scalarFragments fragment.FragmentVector
	vectorFragments fragment.FragmentVector
	deleteFragments fragment.DeleteFragmentVector
	record          arrow.Record
}

// Retain is not implemented and always panics.
func (m MergeRecordReader) Retain() {
	// TODO implement me
	panic("implement me")
}

// Release is not implemented and always panics.
func (m MergeRecordReader) Release() {
	// TODO implement me
	panic("implement me")
}

// Schema is not implemented and always panics.
func (m MergeRecordReader) Schema() *arrow.Schema {
	// TODO implement me
	panic("implement me")
}

// Next is not implemented and always panics.
func (m MergeRecordReader) Next() bool {
	// TODO implement me
	panic("implement me")
}

// Record is not implemented and always panics.
func (m MergeRecordReader) Record() arrow.Record {
	// TODO implement me
	panic("implement me")
}

// Err is not implemented and always panics.
func (m MergeRecordReader) Err() error {
	// TODO implement me
	panic("implement me")
}

// NewMergeRecordReader is a placeholder: it ignores all of its arguments and
// always panics with "implement me".
func NewMergeRecordReader(
	s *schema.Schema,
	options *options.ReadOptions,
	f fs.Fs,
	scalarFragment fragment.FragmentVector,
	vectorFragment fragment.FragmentVector,
	deleteFragments fragment.DeleteFragmentVector,
) *MergeRecordReader {
	// TODO implement me
	panic("implement me")
}
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package record_reader + +import ( + "sync/atomic" + + "github.com/apache/arrow/go/v12/arrow" + "github.com/apache/arrow/go/v12/arrow/array" + "github.com/apache/arrow/go/v12/parquet/pqarrow" + "github.com/milvus-io/milvus/internal/storagev2/common/arrow_util" + "github.com/milvus-io/milvus/internal/storagev2/file/fragment" + "github.com/milvus-io/milvus/internal/storagev2/io/fs" + "github.com/milvus-io/milvus/internal/storagev2/storage/options" +) + +type MultiFilesSequentialReader struct { + fs fs.Fs + schema *arrow.Schema + files []string + nextPos int + options *options.ReadOptions + currReader array.RecordReader + err error + ref int64 +} + +func (m *MultiFilesSequentialReader) Retain() { + atomic.AddInt64(&m.ref, 1) +} + +func (m *MultiFilesSequentialReader) Release() { + if atomic.AddInt64(&m.ref, -1) == 0 { + if m.currReader != nil { + m.currReader.Release() + m.currReader = nil + } + } +} + +func (m *MultiFilesSequentialReader) Schema() *arrow.Schema { + return m.schema +} + +func (m *MultiFilesSequentialReader) Next() bool { + for true { + if m.currReader == nil { + if m.nextPos >= len(m.files) { + return false + } + + m.nextReader() + if m.err != nil { + return false + } + m.nextPos++ + } + if m.currReader.Next() { + return true + } + if m.currReader.Err() != nil { + m.err = m.currReader.Err() + return false + } + if m.currReader != nil { + m.currReader.Release() + m.currReader = nil + } + } + return false +} + +func (m *MultiFilesSequentialReader) Record() arrow.Record { + if m.currReader != nil { + return m.currReader.Record() + } + 
return nil +} + +func (m *MultiFilesSequentialReader) Err() error { + return m.err +} + +func (m *MultiFilesSequentialReader) nextReader() { + var fileReader *pqarrow.FileReader + fileReader, m.err = arrow_util.MakeArrowFileReader(m.fs, m.files[m.nextPos]) + if m.err != nil { + return + } + m.currReader, m.err = arrow_util.MakeArrowRecordReader(fileReader, m.options) + return +} + +func NewMultiFilesSequentialReader(fs fs.Fs, fragments fragment.FragmentVector, schema *arrow.Schema, options *options.ReadOptions) *MultiFilesSequentialReader { + files := make([]string, 0, len(fragments)) + for _, f := range fragments { + files = append(files, f.Files()...) + } + + return &MultiFilesSequentialReader{ + fs: fs, + schema: schema, + options: options, + files: files, + nextPos: 0, + ref: 1, + } +} diff --git a/internal/storagev2/reader/record_reader/record_reader.go b/internal/storagev2/reader/record_reader/record_reader.go new file mode 100644 index 0000000000000..d720498e5edd5 --- /dev/null +++ b/internal/storagev2/reader/record_reader/record_reader.go @@ -0,0 +1,94 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package record_reader + +import ( + "github.com/apache/arrow/go/v12/arrow/array" + "github.com/milvus-io/milvus/internal/storagev2/file/fragment" + "github.com/milvus-io/milvus/internal/storagev2/filter" + "github.com/milvus-io/milvus/internal/storagev2/io/fs" + "github.com/milvus-io/milvus/internal/storagev2/storage/manifest" + "github.com/milvus-io/milvus/internal/storagev2/storage/options" + "github.com/milvus-io/milvus/internal/storagev2/storage/schema" +) + +func MakeRecordReader( + m *manifest.Manifest, + s *schema.Schema, + f fs.Fs, + deleteFragments fragment.DeleteFragmentVector, + options *options.ReadOptions, +) array.RecordReader { + relatedColumns := make([]string, 0) + for _, column := range options.Columns { + relatedColumns = append(relatedColumns, column) + } + + for _, filter := range options.Filters { + relatedColumns = append(relatedColumns, filter.GetColumnName()) + } + + scalarData := m.GetScalarFragments() + vectorData := m.GetVectorFragments() + + onlyScalar := onlyContainScalarColumns(s, relatedColumns) + onlyVector := onlyContainVectorColumns(s, relatedColumns) + + if onlyScalar || onlyVector { + var dataFragments fragment.FragmentVector + if onlyScalar { + dataFragments = scalarData + } else { + dataFragments = vectorData + } + return NewScanRecordReader(s, options, f, dataFragments, deleteFragments) + } + if len(options.Filters) > 0 && filtersOnlyContainPKAndVersion(s, options.FiltersV2) { + return NewMergeRecordReader(s, options, f, scalarData, vectorData, deleteFragments) + } + return NewFilterQueryReader(s, options, f, scalarData, vectorData, deleteFragments) +} + +func onlyContainVectorColumns(schema *schema.Schema, relatedColumns []string) bool { + for _, column := range relatedColumns { + if schema.Options().VectorColumn != column && schema.Options().PrimaryColumn != column && schema.Options().VersionColumn != column { + return false + } + } + return true +} + +func onlyContainScalarColumns(schema *schema.Schema, relatedColumns 
[]string) bool { + for _, column := range relatedColumns { + if schema.Options().VectorColumn == column { + return false + } + } + return true +} + +func filtersOnlyContainPKAndVersion(s *schema.Schema, filters []filter.Filter) bool { + for _, f := range filters { + if f.GetColumnName() != s.Options().PrimaryColumn && + f.GetColumnName() != s.Options().VersionColumn { + return false + } + } + return true +} + +func MakeScanDeleteReader(manifest *manifest.Manifest, fs fs.Fs) array.RecordReader { + return NewMultiFilesSequentialReader(fs, manifest.GetDeleteFragments(), manifest.GetSchema().DeleteSchema(), options.NewReadOptions()) +} diff --git a/internal/storagev2/reader/record_reader/scan_record.go b/internal/storagev2/reader/record_reader/scan_record.go new file mode 100644 index 0000000000000..0b8d3b868c714 --- /dev/null +++ b/internal/storagev2/reader/record_reader/scan_record.go @@ -0,0 +1,150 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package record_reader + +import ( + "io" + "sync/atomic" + + "github.com/apache/arrow/go/v12/arrow" + "github.com/apache/arrow/go/v12/arrow/array" + "github.com/milvus-io/milvus/internal/storagev2/common/log" + "github.com/milvus-io/milvus/internal/storagev2/common/utils" + "github.com/milvus-io/milvus/internal/storagev2/file/fragment" + "github.com/milvus-io/milvus/internal/storagev2/io/format" + "github.com/milvus-io/milvus/internal/storagev2/io/format/parquet" + "github.com/milvus-io/milvus/internal/storagev2/io/fs" + "github.com/milvus-io/milvus/internal/storagev2/reader/common_reader" + "github.com/milvus-io/milvus/internal/storagev2/storage/options" + "github.com/milvus-io/milvus/internal/storagev2/storage/schema" + "go.uber.org/zap" +) + +type ScanRecordReader struct { + ref int64 + schema *schema.Schema + options *options.ReadOptions + fs fs.Fs + dataFragments fragment.FragmentVector + deleteFragments fragment.DeleteFragmentVector + rec arrow.Record + curReader format.Reader + reader array.RecordReader + nextPos int + err error +} + +func NewScanRecordReader( + s *schema.Schema, + options *options.ReadOptions, + f fs.Fs, + dataFragments fragment.FragmentVector, + deleteFragments fragment.DeleteFragmentVector, +) *ScanRecordReader { + return &ScanRecordReader{ + ref: 1, + schema: s, + options: options, + fs: f, + dataFragments: dataFragments, + deleteFragments: deleteFragments, + } +} + +func (r *ScanRecordReader) Schema() *arrow.Schema { + return utils.ProjectSchema(r.schema.Schema(), r.options.OutputColumns()) +} + +func (r *ScanRecordReader) Retain() { + atomic.AddInt64(&r.ref, 1) +} + +func (r *ScanRecordReader) Release() { + if atomic.AddInt64(&r.ref, -1) == 0 { + if r.rec != nil { + r.rec.Release() + r.rec = nil + } + if r.curReader != nil { + r.curReader.Close() + r.curReader = nil + } + } +} + +func (r *ScanRecordReader) Next() bool { + datafiles := fragment.ToFilesVector(r.dataFragments) + log.Debug("ScanRecordReader Next", zap.Any("datafiles", 
datafiles)) + if r.rec != nil { + r.rec.Release() + r.rec = nil + } + for { + if r.curReader == nil { + if r.nextPos >= len(datafiles) { + return false + } + // FIXME: nil options + reader, err := parquet.NewFileReader(r.fs, datafiles[r.nextPos], r.options) + if err != nil { + r.err = err + return false + } + r.nextPos++ + r.curReader = reader + } + + rec, err := r.curReader.Read() + if err != nil { + if err == io.EOF { + r.curReader.Close() + r.curReader = nil + continue + } + // if error occurs in the middle of reading, return false + r.curReader.Close() + r.curReader = nil + r.err = err + return false + } + + if rec.NumRows() == 0 { + continue + } + + r.rec = rec + return true + } +} + +func (r *ScanRecordReader) Record() arrow.Record { + return r.rec +} + +func (r *ScanRecordReader) Err() error { + return r.err +} + +func (r *ScanRecordReader) MakeInnerReader() array.RecordReader { + // TODO implement me + reader := NewMultiFilesSequentialReader(r.fs, r.dataFragments, r.Schema(), r.options) + + filterReader := common_reader.MakeFilterReader(reader, r.options) + + deleteReader := common_reader.NewDeleteReader(filterReader, r.schema.Options(), r.deleteFragments, r.options) + + res := common_reader.NewProjectionReader(deleteReader, r.options, r.schema.Schema()) + return res +} diff --git a/internal/storagev2/scripts/setenv.sh b/internal/storagev2/scripts/setenv.sh new file mode 100644 index 0000000000000..3e15dbc3c20f6 --- /dev/null +++ b/internal/storagev2/scripts/setenv.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set +e + +unameOut="$(uname -s)" + +ROOT_DIR="$( cd -P "$( dirname "$SOURCE" )/.." 
&& pwd )" + +# Update PKG_CONFIG_PATH +export PKG_CONFIG_PATH="${PKG_CONFIG_PATH}:${ROOT_DIR}/cpp/build/Release/" \ No newline at end of file diff --git a/internal/storagev2/storage/lock/lock_manager.go b/internal/storagev2/storage/lock/lock_manager.go new file mode 100644 index 0000000000000..f514903dc5e66 --- /dev/null +++ b/internal/storagev2/storage/lock/lock_manager.go @@ -0,0 +1,97 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package lock + +import ( + "sync" + + "github.com/milvus-io/milvus/internal/storagev2/common/constant" + "github.com/milvus-io/milvus/internal/storagev2/common/log" + "github.com/pkg/errors" + "go.uber.org/zap" +) + +type LockManager interface { + // Acquire the lock, wait until the lock is available, return the version to be modified or use the newest version + Acquire() (version int64, useLatestVersion bool, err error) + // Release the lock, accepts the new allocated manifest version and success state of operations between Acquire and Release as parameters + Release(version int64, success bool) error +} + +type EmptyLockManager struct{} + +func (h *EmptyLockManager) Acquire() (version int64, useLatestVersion bool, err error) { + return constant.LatestManifestVersion, true, nil +} + +func (h *EmptyLockManager) Release(_ int64, _ bool) error { + return nil +} + +type MemoryLockManager struct { + mu sync.Mutex + locks map[int64]bool + nextVersion int64 +} + +func NewMemoryLockManager() 
*MemoryLockManager { + return &MemoryLockManager{ + mu: sync.Mutex{}, + locks: make(map[int64]bool), + nextVersion: 0, + } +} + +func (m *MemoryLockManager) Acquire() (version int64, useLatestVersion bool, err error) { + m.mu.Lock() + defer m.mu.Unlock() + + version = m.nextVersion + + if m.locks[version] { + log.Warn("lock is already acquired", zap.Int64("version", version)) + return version, false, errors.New("lock is already acquired") + } + + if version == constant.LatestManifestVersion { + useLatestVersion = true + } else { + useLatestVersion = false + } + m.locks[version] = true + log.Info("acquire lock", zap.Int64("version", version), zap.Bool("useLatestVersion", useLatestVersion)) + + return version, useLatestVersion, nil +} + +func (m *MemoryLockManager) Release(version int64, success bool) error { + m.mu.Lock() + defer m.mu.Unlock() + + realVersion := int64(0) + realVersion = version - 1 + if !m.locks[realVersion] { + return errors.New("lock is already released or does not exist") + } + m.locks[realVersion] = false + log.Info("release lock", zap.Int64("version", realVersion), zap.Bool("success", success)) + if success { + m.nextVersion = version + } else { + m.nextVersion = constant.LatestManifestVersion + } + + return nil +} diff --git a/internal/storagev2/storage/manifest/commit.go b/internal/storagev2/storage/manifest/commit.go new file mode 100644 index 0000000000000..33267de859493 --- /dev/null +++ b/internal/storagev2/storage/manifest/commit.go @@ -0,0 +1,80 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package manifest + +import ( + "github.com/milvus-io/milvus/internal/storagev2/common/constant" + "github.com/milvus-io/milvus/internal/storagev2/storage/lock" +) + +type ManifestCommit struct { + ops []ManifestCommitOp + lock lock.LockManager + rw ManifestReaderWriter +} + +func (m *ManifestCommit) AddOp(op ...ManifestCommitOp) { + m.ops = append(m.ops, op...) +} + +func (m ManifestCommit) Commit() (manifest *Manifest, err error) { + ver, latest, err := m.lock.Acquire() + if err != nil { + return nil, err + } + var version int64 + defer func() { + if err != nil { + if err2 := m.lock.Release(-1, false); err2 != nil { + err = err2 + } + } else { + err = m.lock.Release(version, true) + } + }() + var base *Manifest + if latest { + base, err = m.rw.Read(constant.LatestManifestVersion) + if err != nil { + return nil, err + } + base.version++ + } else { + base, err = m.rw.Read(ver) + if err != nil { + return nil, err + } + maxVersion, err := m.rw.MaxVersion() + if err != nil { + return nil, err + } + base.version = maxVersion + 1 + } + + for _, op := range m.ops { + op.commit(base) + } + version = base.version + + err = m.rw.Write(base) + if err != nil { + return nil, err + } + return base, nil +} + +func NewManifestCommit(lock lock.LockManager, rw ManifestReaderWriter) ManifestCommit { + return ManifestCommit{nil, lock, rw} +} diff --git a/internal/storagev2/storage/manifest/commit_op.go b/internal/storagev2/storage/manifest/commit_op.go new file mode 100644 index 0000000000000..e5117460199f8 --- /dev/null +++ b/internal/storagev2/storage/manifest/commit_op.go @@ -0,0 +1,68 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package manifest + +import ( + "github.com/milvus-io/milvus/internal/storagev2/common/errors" + "github.com/milvus-io/milvus/internal/storagev2/file/blob" + "github.com/milvus-io/milvus/internal/storagev2/file/fragment" +) + +type ManifestCommitOp interface { + commit(manifest *Manifest) error +} + +type AddScalarFragmentOp struct { + ScalarFragment fragment.Fragment +} + +func (op AddScalarFragmentOp) commit(manifest *Manifest) error { + op.ScalarFragment.SetFragmentId(manifest.Version()) + manifest.AddScalarFragment(op.ScalarFragment) + return nil +} + +type AddVectorFragmentOp struct { + VectorFragment fragment.Fragment +} + +func (op AddVectorFragmentOp) commit(manifest *Manifest) error { + op.VectorFragment.SetFragmentId(manifest.Version()) + manifest.AddVectorFragment(op.VectorFragment) + return nil +} + +type AddDeleteFragmentOp struct { + DeleteFragment fragment.Fragment +} + +func (op AddDeleteFragmentOp) commit(manifest *Manifest) error { + op.DeleteFragment.SetFragmentId(manifest.Version()) + manifest.AddDeleteFragment(op.DeleteFragment) + return nil +} + +type AddBlobOp struct { + Replace bool + Blob blob.Blob +} + +func (op AddBlobOp) commit(manifest *Manifest) error { + if !op.Replace && manifest.HasBlob(op.Blob.Name) { + return errors.ErrBlobAlreadyExist + } + manifest.AddBlob(op.Blob) + return nil +} diff --git a/internal/storagev2/storage/manifest/manifest.go b/internal/storagev2/storage/manifest/manifest.go new file mode 100644 index 0000000000000..1ef556fb4b850 --- /dev/null +++ 
// Manifest is one version of a storage space's metadata: its schema, the
// scalar, vector and delete fragments, the blobs, and the version number.
//
// NOTE(review): ScalarFragments is exported while the other fragment lists
// are not - confirm whether that is intentional.
type Manifest struct {
	schema          *schema.Schema
	ScalarFragments fragment.FragmentVector
	vectorFragments fragment.FragmentVector
	deleteFragments fragment.FragmentVector
	blobs           []blob.Blob
	version         int64
}

// NewManifest returns an empty manifest (version 0, no fragments or blobs)
// that uses the given schema.
func NewManifest(schema *schema.Schema) *Manifest {
	return &Manifest{
		schema: schema,
	}
}

// Init returns an empty manifest whose schema is built from an empty Arrow
// schema and the default schema options.
func Init() *Manifest {
	return &Manifest{
		schema: schema.NewSchema(arrow.NewSchema(nil, nil), schema.DefaultSchemaOptions()),
	}
}

// Copy returns a shallow copy of the manifest: the struct is duplicated, but
// the schema pointer and the fragment/blob slices still refer to the same
// underlying data as the original.
func (m *Manifest) Copy() *Manifest {
	copied := *m
	return &copied
}

// GetSchema returns the manifest's schema.
func (m *Manifest) GetSchema() *schema.Schema {
	return m.schema
}

// AddScalarFragment appends fragment to the scalar fragments.
func (m *Manifest) AddScalarFragment(fragment fragment.Fragment) {
	m.ScalarFragments = append(m.ScalarFragments, fragment)
}

// AddVectorFragment appends fragment to the vector fragments.
func (m *Manifest) AddVectorFragment(fragment fragment.Fragment) {
	m.vectorFragments = append(m.vectorFragments, fragment)
}

// AddDeleteFragment appends fragment to the delete fragments.
func (m *Manifest) AddDeleteFragment(fragment fragment.Fragment) {
	m.deleteFragments = append(m.deleteFragments, fragment)
}

// GetScalarFragments returns the scalar fragments (the internal slice, not a
// copy).
func (m *Manifest) GetScalarFragments() fragment.FragmentVector {
	return m.ScalarFragments
}

// GetVectorFragments returns the vector fragments (the internal slice, not a
// copy).
func (m *Manifest) GetVectorFragments() fragment.FragmentVector {
	return m.vectorFragments
}

// GetDeleteFragments returns the delete fragments (the internal slice, not a
// copy).
func (m *Manifest) GetDeleteFragments() fragment.FragmentVector {
	return m.deleteFragments
}

// Version returns the manifest's version number.
func (m *Manifest) Version() int64 {
	return m.version
}

// SetVersion overwrites the manifest's version number.
func (m *Manifest) SetVersion(version int64) {
	m.version = version
}

// GetBlobs returns the blobs (the internal slice, not a copy).
func (m *Manifest) GetBlobs() []blob.Blob {
	return m.blobs
}

// ToProtobuf converts the manifest into its protobuf message: version,
// vector/scalar/delete fragments, blobs and schema. It fails only when the
// schema cannot be converted.
func (m *Manifest) ToProtobuf() (*manifest_proto.Manifest, error) {
	manifest := &manifest_proto.Manifest{}
	manifest.Version = m.version
	for _, vectorFragment := range m.vectorFragments {
		manifest.VectorFragments = append(manifest.VectorFragments, vectorFragment.ToProtobuf())
	}
	for _, scalarFragment := range m.ScalarFragments {
		manifest.ScalarFragments = append(manifest.ScalarFragments, scalarFragment.ToProtobuf())
	}
	for _, deleteFragment := range m.deleteFragments {
		manifest.DeleteFragments = append(manifest.DeleteFragments, deleteFragment.ToProtobuf())
	}

	for _, blob := range m.blobs {
		manifest.Blobs = append(manifest.Blobs, blob.ToProtobuf())
	}

	schemaProto, err := m.schema.ToProtobuf()
	if err != nil {
		return nil, err
	}
	manifest.Schema = schemaProto

	return manifest, nil
}

// FromProtobuf loads the contents of a protobuf manifest into m.
//
// The schema is decoded into the existing m.schema, so m.schema must already
// be set. Fragments and blobs are appended to whatever m already holds rather
// than replacing it; the version is overwritten. If the schema fails to
// decode, the error is returned and nothing else is changed.
func (m *Manifest) FromProtobuf(manifest *manifest_proto.Manifest) error {
	err := m.schema.FromProtobuf(manifest.Schema)
	if err != nil {
		return err
	}

	for _, vectorFragment := range manifest.VectorFragments {
		m.vectorFragments = append(m.vectorFragments, fragment.FromProtobuf(vectorFragment))
	}

	for _, scalarFragment := range manifest.ScalarFragments {
		m.ScalarFragments = append(m.ScalarFragments, fragment.FromProtobuf(scalarFragment))
	}

	for _, deleteFragment := range manifest.DeleteFragments {
		m.deleteFragments = append(m.deleteFragments, fragment.FromProtobuf(deleteFragment))
	}

	for _, b := range manifest.Blobs {
		m.blobs = append(m.blobs, blob.FromProtobuf(b))
	}

	m.version = manifest.Version
	return nil
}

// WriteManifestFile serialises manifest to protobuf bytes, writes them to
// output in a single Write call, checks that every byte was written and then
// closes output.
//
// NOTE(review): output is only closed on the success path; the early error
// returns leave it open - confirm whether callers close it themselves.
func WriteManifestFile(manifest *Manifest, output file.File) error {
	protoManifest, err := manifest.ToProtobuf()
	if err != nil {
		return err
	}

	bytes, err := proto.Marshal(protoManifest)
	if err != nil {
		return fmt.Errorf("write manifest file: %w", err)
	}
	write, err := output.Write(bytes)
	if err != nil {
		return fmt.Errorf("write manifest file: %w", err)
	}
	if write != len(bytes) {
		return fmt.Errorf("failed to write whole file, expect: %v, actual: %v", len(bytes), write)
	}
	if err = output.Close(); err != nil {
		return err
	}
	return nil
}

// HasBlob reports whether a blob with the given name is present.
func (m *Manifest) HasBlob(name string) bool {
	for _, b := range m.blobs {
		if b.Name == name {
			return true
		}
	}

	return false
}

// AddBlob appends blob to the manifest's blobs without checking for an
// existing blob of the same name.
func (m *Manifest) AddBlob(blob blob.Blob) {
	m.blobs = append(m.blobs, blob)
}

func (m *Manifest) RemoveBlobIfExist(name string) {
	idx := -1
	for i, b := range m.blobs {
		if b.Name == name {
			idx = i
			break
		}
	}

	m.blobs = append(m.blobs[0:idx], m.blobs[idx+1:]...)
+} + +func (m *Manifest) GetBlob(name string) (blob.Blob, bool) { + for _, b := range m.blobs { + if b.Name == name { + return b, true + } + } + + return blob.Blob{}, false +} + +func ParseFromFile(f fs.Fs, path string) (*Manifest, error) { + manifest := Init() + manifestProto := &manifest_proto.Manifest{} + + buf, err := f.ReadFile(path) + if err != nil { + return nil, err + } + err = proto.Unmarshal(buf, manifestProto) + if err != nil { + log.Error("Failed to unmarshal manifest proto", log.String("err", err.Error())) + return nil, fmt.Errorf("parse from file: %w", err) + } + err = manifest.FromProtobuf(manifestProto) + if err != nil { + return nil, err + } + + return manifest, nil +} + +// TODO REMOVE BELOW CODE + +type DataFile struct { + path string + cols []string +} + +func (d *DataFile) Path() string { + return d.path +} + +func NewDataFile(path string) *DataFile { + return &DataFile{path: path} +} diff --git a/internal/storagev2/storage/manifest/manifest_test.go b/internal/storagev2/storage/manifest/manifest_test.go new file mode 100644 index 0000000000000..8d01ad78206b2 --- /dev/null +++ b/internal/storagev2/storage/manifest/manifest_test.go @@ -0,0 +1,290 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package manifest + +import ( + "sync" + "testing" + + "github.com/apache/arrow/go/v12/arrow" + "github.com/milvus-io/milvus/internal/storagev2/common/utils" + "github.com/milvus-io/milvus/internal/storagev2/file/fragment" + "github.com/milvus-io/milvus/internal/storagev2/io/fs" + "github.com/milvus-io/milvus/internal/storagev2/storage/lock" + "github.com/milvus-io/milvus/internal/storagev2/storage/schema" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// Test Manifest +func TestManifest(t *testing.T) { + pkField := arrow.Field{ + Name: "pk_field", + Type: arrow.DataType(&arrow.Int64Type{}), + Nullable: false, + } + vsField := arrow.Field{ + Name: "vs_field", + Type: arrow.DataType(&arrow.Int64Type{}), + Nullable: false, + } + vecField := arrow.Field{ + Name: "vec_field", + Type: arrow.DataType(&arrow.FixedSizeBinaryType{ByteWidth: 16}), + Nullable: false, + } + fields := []arrow.Field{pkField, vsField, vecField} + + as := arrow.NewSchema(fields, nil) + schemaOptions := &schema.SchemaOptions{ + PrimaryColumn: "pk_field", + VersionColumn: "vs_field", + VectorColumn: "vec_field", + } + + sc := schema.NewSchema(as, schemaOptions) + err := sc.Validate() + assert.NoError(t, err) + + maniFest := NewManifest(sc) + + f1 := fragment.NewFragment() + f1.SetFragmentId(1) + f1.AddFile("scalar1") + f1.AddFile("scalar2") + maniFest.AddScalarFragment(f1) + + f2 := fragment.NewFragment() + f2.SetFragmentId(2) + f2.AddFile("vector1") + f2.AddFile("vector2") + maniFest.AddVectorFragment(f2) + + f3 := fragment.NewFragment() + f3.SetFragmentId(3) + f3.AddFile("delete1") + maniFest.AddDeleteFragment(f3) + + require.Equal(t, len(maniFest.GetScalarFragments()), 1) + require.Equal(t, len(maniFest.GetVectorFragments()), 1) + require.Equal(t, len(maniFest.GetDeleteFragments()), 1) + require.Equal(t, sc, maniFest.GetSchema()) +} + +// Test ManifestCommitOp +func TestManifestCommitOp(t *testing.T) { + tmpDir := t.TempDir() + f, err := 
fs.BuildFileSystem("file:///" + tmpDir) + + // create manifest path + err = f.MkdirAll(utils.GetManifestDir(tmpDir), 0o755) + assert.NoError(t, err) + + // create manifest file + manifest := NewManifest(schema.NewSchema(arrow.NewSchema(nil, nil), schema.DefaultSchemaOptions())) + manifest.SetVersion(0) + + mc := ManifestCommit{ + ops: []ManifestCommitOp{}, + rw: NewManifestReaderWriter(f, tmpDir), + lock: lock.NewMemoryLockManager(), + } + + err = mc.rw.Write(manifest) + assert.NoError(t, err) + + mc.AddOp(AddScalarFragmentOp{ScalarFragment: fragment.NewFragment()}) + mc.AddOp(AddVectorFragmentOp{VectorFragment: fragment.NewFragment()}) + mc.AddOp(AddDeleteFragmentOp{DeleteFragment: fragment.NewFragment()}) + _, err = mc.Commit() + assert.NoError(t, err) +} + +// Test ManifestReaderWriter Read +func TestManifestReaderWriter_Read(t *testing.T) { + tmpDir := t.TempDir() + f, err := fs.BuildFileSystem("file:///" + tmpDir) + + // create manifest path + err = f.MkdirAll(utils.GetManifestDir(tmpDir), 0o755) + assert.NoError(t, err) + + // create manifest file + manifest := NewManifest(schema.NewSchema(arrow.NewSchema(nil, nil), schema.DefaultSchemaOptions())) + manifest.SetVersion(0) + err = NewManifestReaderWriter(f, tmpDir).Write(manifest) + assert.NoError(t, err) + + // read manifest file + m, err := NewManifestReaderWriter(f, tmpDir).Read(0) + assert.NoError(t, err) + assert.Equal(t, manifest.version, m.version) +} + +// Test ManifestReaderWriter MaxVersion +func TestManifestReaderWriter_MaxVersion(t *testing.T) { + tmpDir := t.TempDir() + f, err := fs.BuildFileSystem("file:///" + tmpDir) + + // create manifest path + err = f.MkdirAll(utils.GetManifestDir(tmpDir), 0o755) + assert.NoError(t, err) + + // create manifest file + manifest := NewManifest(schema.NewSchema(arrow.NewSchema(nil, nil), schema.DefaultSchemaOptions())) + manifest.SetVersion(0) + err = NewManifestReaderWriter(f, tmpDir).Write(manifest) + assert.NoError(t, err) + + // read manifest file + m, err := 
NewManifestReaderWriter(f, tmpDir).MaxVersion() + assert.NoError(t, err) + assert.Equal(t, manifest.version, m) +} + +// Test ManifestReaderWriter Write +func TestManifestReaderWriter_Write(t *testing.T) { + tmpDir := t.TempDir() + f, err := fs.BuildFileSystem("file:///" + tmpDir) + + // create manifest path + err = f.MkdirAll(utils.GetManifestDir(tmpDir), 0o755) + assert.NoError(t, err) + + // create manifest file + manifest := NewManifest(schema.NewSchema(arrow.NewSchema(nil, nil), schema.DefaultSchemaOptions())) + manifest.SetVersion(0) + err = NewManifestReaderWriter(f, tmpDir).Write(manifest) + assert.NoError(t, err) +} + +// Test ManifestReaderWriter concurrency write +func TestManifestReaderWriter_concurrency(t *testing.T) { + tmpDir := t.TempDir() + f, err := fs.BuildFileSystem("file:///" + tmpDir) + + // create manifest path + err = f.MkdirAll(utils.GetManifestDir(tmpDir), 0o755) + assert.NoError(t, err) + + // create manifest file + manifest := NewManifest(schema.NewSchema(arrow.NewSchema(nil, nil), schema.DefaultSchemaOptions())) + manifest.SetVersion(0) + err = NewManifestReaderWriter(f, tmpDir).Write(manifest) + assert.NoError(t, err) + + // read manifest file + m, err := NewManifestReaderWriter(f, tmpDir).Read(0) + assert.NoError(t, err) + assert.Equal(t, manifest.version, m.version) + + // write manifest file + manifest.SetVersion(1) + err = NewManifestReaderWriter(f, tmpDir).Write(manifest) + assert.NoError(t, err) + + // read manifest file + m, err = NewManifestReaderWriter(f, tmpDir).Read(1) + assert.NoError(t, err) + + // write manifest file concurrently + wg := sync.WaitGroup{} + + for i := 0; i < 100; i++ { + wg.Add(1) + i := i + go func() { + defer wg.Done() + manifest.SetVersion(int64(i)) + err = NewManifestReaderWriter(f, tmpDir).Write(manifest) + assert.NoError(t, err) + }() + } + + wg.Wait() + + // read manifest file + m, err = NewManifestReaderWriter(f, tmpDir).Read(99) + assert.NoError(t, err) + assert.NotEqual(t, 99, m.version) +} + +// 
Test Manifest commit concurrency +func TestManifestCommit_concurrency(t *testing.T) { + tmpDir := t.TempDir() + f, err := fs.BuildFileSystem("file:///" + tmpDir) + + // create manifest path + err = f.MkdirAll(utils.GetManifestDir(tmpDir), 0o755) + assert.NoError(t, err) + + sc := createNewSchema() + // create manifest file + manifest := NewManifest(sc) + manifest.SetVersion(0) + mrw := NewManifestReaderWriter(f, tmpDir) + err = mrw.Write(manifest) + assert.NoError(t, err) + + l := lock.NewMemoryLockManager() + + // use commit to write manifest file concurrently + wg := sync.WaitGroup{} + for i := 0; i < 5; i++ { + wg.Add(1) + go func() { + mc := ManifestCommit{ + ops: []ManifestCommitOp{}, + rw: mrw, + lock: l, + } + mc.AddOp(AddScalarFragmentOp{ScalarFragment: fragment.NewFragment()}) + mc.AddOp(AddVectorFragmentOp{VectorFragment: fragment.NewFragment()}) + mc.AddOp(AddDeleteFragmentOp{DeleteFragment: fragment.NewFragment()}) + _, err = mc.Commit() + wg.Done() + }() + } + wg.Wait() +} + +func createNewSchema() *schema.Schema { + pkField := arrow.Field{ + Name: "pk_field", + Type: arrow.DataType(&arrow.Int64Type{}), + Nullable: false, + } + vsField := arrow.Field{ + Name: "vs_field", + Type: arrow.DataType(&arrow.Int64Type{}), + Nullable: false, + } + vecField := arrow.Field{ + Name: "vec_field", + Type: arrow.DataType(&arrow.FixedSizeBinaryType{ByteWidth: 10}), + Nullable: false, + } + fields := []arrow.Field{pkField, vsField, vecField} + + as := arrow.NewSchema(fields, nil) + schemaOptions := &schema.SchemaOptions{ + PrimaryColumn: "pk_field", + VersionColumn: "vs_field", + VectorColumn: "vec_field", + } + + sc := schema.NewSchema(as, schemaOptions) + return sc +} diff --git a/internal/storagev2/storage/manifest/reader_writer.go b/internal/storagev2/storage/manifest/reader_writer.go new file mode 100644 index 0000000000000..da489558ab148 --- /dev/null +++ b/internal/storagev2/storage/manifest/reader_writer.go @@ -0,0 +1,119 @@ +// Copyright 2023 Zilliz +// +// 
Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package manifest + +import ( + "errors" + "fmt" + "path/filepath" + + "github.com/milvus-io/milvus/internal/storagev2/common/constant" + "github.com/milvus-io/milvus/internal/storagev2/common/log" + "github.com/milvus-io/milvus/internal/storagev2/common/utils" + "github.com/milvus-io/milvus/internal/storagev2/io/fs" +) + +var ErrManifestNotFound = errors.New("manifest not found") + +type ManifestReaderWriter struct { + fs fs.Fs + root string +} + +func findAllManifest(fs fs.Fs, path string) ([]fs.FileEntry, error) { + files, err := fs.List(path) + log.Debug("list all manifest:", log.Any("files", files)) + if err != nil { + return nil, err + } + return files, nil +} + +func (rw ManifestReaderWriter) Read(version int64) (*Manifest, error) { + manifests, err := findAllManifest(rw.fs, utils.GetManifestDir(rw.root)) + if err != nil { + return nil, err + } + + var maxVersionManifest string + var maxVersion int64 = -1 + for _, m := range manifests { + ver := utils.ParseVersionFromFileName(filepath.Base(m.Path)) + if ver == -1 { + continue + } + + if version != constant.LatestManifestVersion { + if ver == version { + return ParseFromFile(rw.fs, m.Path) + } + } else if ver > maxVersion { + maxVersion = ver + maxVersionManifest = m.Path + } + } + + if maxVersion != -1 { + return ParseFromFile(rw.fs, maxVersionManifest) + } + return nil, ErrManifestNotFound +} + +func (rw ManifestReaderWriter) MaxVersion() (int64, error) { + 
manifests, err := findAllManifest(rw.fs, utils.GetManifestDir(rw.root)) + if err != nil { + return -1, err + } + var max int64 = -1 + for _, m := range manifests { + ver := utils.ParseVersionFromFileName(filepath.Base(m.Path)) + if ver == -1 { + continue + } + + if ver > max { + max = ver + } + + } + + if max == -1 { + return -1, ErrManifestNotFound + } + return max, nil +} + +func (rw ManifestReaderWriter) Write(m *Manifest) error { + tmpManifestFilePath := utils.GetManifestTmpFilePath(rw.root, m.Version()) + manifestFilePath := utils.GetManifestFilePath(rw.root, m.Version()) + log.Debug("path", log.String("tmpManifestFilePath", tmpManifestFilePath), log.String("manifestFilePath", manifestFilePath)) + output, err := rw.fs.OpenFile(tmpManifestFilePath) + if err != nil { + return fmt.Errorf("open file error: %w", err) + } + if err = WriteManifestFile(m, output); err != nil { + return err + } + err = rw.fs.Rename(tmpManifestFilePath, manifestFilePath) + if err != nil { + return fmt.Errorf("rename file error: %w", err) + } + log.Debug("save manifest file success", log.String("path", manifestFilePath)) + return nil +} + +func NewManifestReaderWriter(fs fs.Fs, root string) ManifestReaderWriter { + return ManifestReaderWriter{fs, root} +} diff --git a/internal/storagev2/storage/options/options.go b/internal/storagev2/storage/options/options.go new file mode 100644 index 0000000000000..f7fa2de9f5b5e --- /dev/null +++ b/internal/storagev2/storage/options/options.go @@ -0,0 +1,144 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package options + +import ( + "math" + + "github.com/milvus-io/milvus/internal/storagev2/common/constant" + "github.com/milvus-io/milvus/internal/storagev2/filter" + "github.com/milvus-io/milvus/internal/storagev2/storage/lock" + "github.com/milvus-io/milvus/internal/storagev2/storage/schema" +) + +type Options struct { + Schema *schema.Schema // optional + Version int64 // optional + LockManager lock.LockManager // optional, no lock manager as default +} + +type SpaceOptionsBuilder struct { + options Options +} + +func (b *SpaceOptionsBuilder) SetSchema(schema *schema.Schema) *SpaceOptionsBuilder { + b.options.Schema = schema + return b +} + +func (b *SpaceOptionsBuilder) SetVersion(version int64) *SpaceOptionsBuilder { + b.options.Version = version + return b +} + +func (b *SpaceOptionsBuilder) SetLockManager(lockManager lock.LockManager) *SpaceOptionsBuilder { + b.options.LockManager = lockManager + return b +} + +func (b *SpaceOptionsBuilder) Reset() { + b.options = Options{LockManager: &lock.EmptyLockManager{}} +} + +func (b *SpaceOptionsBuilder) Build() Options { return b.options } + +func NewSpaceOptionBuilder() *SpaceOptionsBuilder { + return &SpaceOptionsBuilder{ + options: Options{ + Version: constant.LatestManifestVersion, + LockManager: &lock.EmptyLockManager{}, + }, + } +} + +func DefaultOptions() *Options { + return &Options{} +} + +type WriteOptions struct { + MaxRecordPerFile int64 +} + +var DefaultWriteOptions = WriteOptions{ + MaxRecordPerFile: 1024, +} + +func NewWriteOption() *WriteOptions { + return &WriteOptions{ + MaxRecordPerFile: 1024, + } +} + +type FsType int8 + +const ( + InMemory FsType = iota + LocalFS + S3 +) + +type SpaceOptions struct { + Fs FsType + VectorColumns []string +} + +// TODO: Change to FilterSet type +type FilterSet []filter.Filter + +var version int64 = math.MaxInt64 + +type ReadOptions struct { + // Filters 
map[string]filter.Filter + Filters map[string]filter.Filter + FiltersV2 FilterSet + Columns []string + ManifestVersion int64 + version int64 +} + +func NewReadOptions() *ReadOptions { + return &ReadOptions{ + Filters: make(map[string]filter.Filter), + FiltersV2: make(FilterSet, 0), + Columns: make([]string, 0), + ManifestVersion: constant.LatestManifestVersion, + version: math.MaxInt64, + } +} + +func (o *ReadOptions) AddFilter(filter filter.Filter) { + o.Filters[filter.GetColumnName()] = filter + o.FiltersV2 = append(o.FiltersV2, filter) +} + +func (o *ReadOptions) AddColumn(column string) { + o.Columns = append(o.Columns, column) +} + +func (o *ReadOptions) SetColumns(columns []string) { + o.Columns = columns +} + +func (o *ReadOptions) SetVersion(version int64) { + o.version = version +} + +func (o *ReadOptions) GetVersion() int64 { + return o.version +} + +func (o *ReadOptions) OutputColumns() []string { + return o.Columns +} diff --git a/internal/storagev2/storage/schema/schema.go b/internal/storagev2/storage/schema/schema.go new file mode 100644 index 0000000000000..c312ae9ef8015 --- /dev/null +++ b/internal/storagev2/storage/schema/schema.go @@ -0,0 +1,149 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package schema + +import ( + "github.com/apache/arrow/go/v12/arrow" + "github.com/milvus-io/milvus/internal/storagev2/common/constant" + "github.com/milvus-io/milvus/internal/storagev2/common/utils" + "github.com/milvus-io/milvus/internal/storagev2/proto/schema_proto" +) + +// Schema is a wrapper of arrow schema +type Schema struct { + schema *arrow.Schema + scalarSchema *arrow.Schema + vectorSchema *arrow.Schema + deleteSchema *arrow.Schema + + options *SchemaOptions +} + +func (s *Schema) Schema() *arrow.Schema { + return s.schema +} + +func (s *Schema) Options() *SchemaOptions { + return s.options +} + +func NewSchema(schema *arrow.Schema, options *SchemaOptions) *Schema { + return &Schema{ + schema: schema, + options: options, + } +} + +func (s *Schema) Validate() error { + err := s.options.Validate(s.schema) + if err != nil { + return err + } + err = s.BuildScalarSchema() + if err != nil { + return err + } + err = s.BuildVectorSchema() + if err != nil { + return err + } + err = s.BuildDeleteSchema() + if err != nil { + return err + } + return nil +} + +func (s *Schema) ScalarSchema() *arrow.Schema { + return s.scalarSchema +} + +func (s *Schema) VectorSchema() *arrow.Schema { + return s.vectorSchema +} + +func (s *Schema) DeleteSchema() *arrow.Schema { + return s.deleteSchema +} + +func (s *Schema) FromProtobuf(schema *schema_proto.Schema) error { + schemaType, err := utils.FromProtobufSchema(schema.ArrowSchema) + if err != nil { + return err + } + + s.schema = schemaType + s.options.FromProtobuf(schema.GetSchemaOptions()) + s.BuildScalarSchema() + s.BuildVectorSchema() + s.BuildDeleteSchema() + return nil +} + +func (s *Schema) ToProtobuf() (*schema_proto.Schema, error) { + schema := &schema_proto.Schema{} + arrowSchema, err := utils.ToProtobufSchema(s.schema) + if err != nil { + return nil, err + } + schema.ArrowSchema = arrowSchema + schema.SchemaOptions = s.options.ToProtobuf() + return schema, nil +} + +func (s *Schema) BuildScalarSchema() error { + 
fields := make([]arrow.Field, 0, len(s.schema.Fields())) + for _, field := range s.schema.Fields() { + if field.Name == s.options.VectorColumn { + continue + } + fields = append(fields, field) + } + offsetFiled := arrow.Field{Name: constant.OffsetFieldName, Type: arrow.DataType(&arrow.Int64Type{})} + fields = append(fields, offsetFiled) + s.scalarSchema = arrow.NewSchema(fields, nil) + + return nil +} + +func (s *Schema) BuildVectorSchema() error { + fields := make([]arrow.Field, 0, len(s.schema.Fields())) + for _, field := range s.schema.Fields() { + if field.Name == s.options.VectorColumn || + field.Name == s.options.PrimaryColumn || + field.Name == s.options.VersionColumn { + fields = append(fields, field) + } + } + s.vectorSchema = arrow.NewSchema(fields, nil) + + return nil +} + +func (s *Schema) BuildDeleteSchema() error { + pkColumn, ok := s.schema.FieldsByName(s.options.PrimaryColumn) + if !ok { + return ErrPrimaryColumnNotFound + } + versionField, ok := s.schema.FieldsByName(s.options.VersionColumn) + if !ok { + return ErrVersionColumnNotFound + } + fields := make([]arrow.Field, 0, 2) + fields = append(fields, pkColumn[0]) + fields = append(fields, versionField[0]) + s.deleteSchema = arrow.NewSchema(fields, nil) + return nil +} diff --git a/internal/storagev2/storage/schema/schema_option.go b/internal/storagev2/storage/schema/schema_option.go new file mode 100644 index 0000000000000..37f8f67181c3e --- /dev/null +++ b/internal/storagev2/storage/schema/schema_option.go @@ -0,0 +1,97 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package schema + +import ( + "errors" + + "github.com/apache/arrow/go/v12/arrow" + "github.com/milvus-io/milvus/internal/storagev2/proto/schema_proto" +) + +var ( + ErrPrimaryColumnNotFound = errors.New("primary column not found") + ErrPrimaryColumnType = errors.New("primary column is not int64 or string") + ErrPrimaryColumnEmpty = errors.New("primary column is empty") + ErrVersionColumnNotFound = errors.New("version column not found") + ErrVersionColumnType = errors.New("version column is not int64") + ErrVectorColumnNotFound = errors.New("vector column not found") + ErrVectorColumnType = errors.New("vector column is not fixed size binary or fixed size list") + ErrVectorColumnEmpty = errors.New("vector column is empty") +) + +type SchemaOptions struct { + PrimaryColumn string + VersionColumn string + VectorColumn string +} + +func DefaultSchemaOptions() *SchemaOptions { + return &SchemaOptions{ + PrimaryColumn: "", + VersionColumn: "", + VectorColumn: "", + } +} + +func (o *SchemaOptions) ToProtobuf() *schema_proto.SchemaOptions { + options := &schema_proto.SchemaOptions{} + options.PrimaryColumn = o.PrimaryColumn + options.VersionColumn = o.VersionColumn + options.VectorColumn = o.VectorColumn + return options +} + +func (o *SchemaOptions) FromProtobuf(options *schema_proto.SchemaOptions) { + o.PrimaryColumn = options.PrimaryColumn + o.VersionColumn = options.VersionColumn + o.VectorColumn = options.VectorColumn +} + +func (o *SchemaOptions) Validate(schema *arrow.Schema) error { + if o.PrimaryColumn != "" { + primaryField, ok := 
// HasVersionColumn reports whether a version column name is configured,
// i.e. whether VersionColumn is a non-empty string.
func (o *SchemaOptions) HasVersionColumn() bool {
	return o.VersionColumn != ""
}
+ +package schema + +import ( + "testing" + + "github.com/apache/arrow/go/v12/arrow" + "github.com/stretchr/testify/assert" +) + +// Test Schema.Schema +func TestBuildSchema(t *testing.T) { + pkField := arrow.Field{ + Name: "pk_field", + Type: arrow.DataType(&arrow.Int64Type{}), + Nullable: false, + } + vsField := arrow.Field{ + Name: "vs_field", + Type: arrow.DataType(&arrow.Int64Type{}), + Nullable: false, + } + vecField := arrow.Field{ + Name: "vec_field", + Type: arrow.DataType(&arrow.FixedSizeBinaryType{ByteWidth: 16}), + Nullable: false, + } + fields := []arrow.Field{pkField, vsField, vecField} + + as := arrow.NewSchema(fields, nil) + schemaOptions := &SchemaOptions{ + PrimaryColumn: "pk_field", + VersionColumn: "vs_field", + VectorColumn: "vec_field", + } + + sc := NewSchema(as, schemaOptions) + err := sc.Validate() + assert.NoError(t, err) +} diff --git a/internal/storagev2/storage/space.go b/internal/storagev2/storage/space.go new file mode 100644 index 0000000000000..2893276c09977 --- /dev/null +++ b/internal/storagev2/storage/space.go @@ -0,0 +1,219 @@ +// Copyright 2023 Zilliz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package storage + +import ( + "math" + + "github.com/apache/arrow/go/v12/arrow/array" + "github.com/milvus-io/milvus/internal/storagev2/common/errors" + "github.com/milvus-io/milvus/internal/storagev2/common/log" + "github.com/milvus-io/milvus/internal/storagev2/common/utils" + "github.com/milvus-io/milvus/internal/storagev2/file/blob" + "github.com/milvus-io/milvus/internal/storagev2/file/fragment" + "github.com/milvus-io/milvus/internal/storagev2/filter" + "github.com/milvus-io/milvus/internal/storagev2/io/fs" + "github.com/milvus-io/milvus/internal/storagev2/reader/record_reader" + "github.com/milvus-io/milvus/internal/storagev2/storage/lock" + "github.com/milvus-io/milvus/internal/storagev2/storage/manifest" + "github.com/milvus-io/milvus/internal/storagev2/storage/options" + "github.com/milvus-io/milvus/internal/storagev2/storage/transaction" +) + +type Space struct { + path string + fs fs.Fs + deleteFragments fragment.DeleteFragmentVector + manifest *manifest.Manifest + lockManager lock.LockManager +} + +func (s *Space) init() error { + for _, f := range s.manifest.GetDeleteFragments() { + deleteFragment := fragment.Make(s.fs, s.manifest.GetSchema(), f) + s.deleteFragments = append(s.deleteFragments, deleteFragment) + } + return nil +} + +func NewSpace(f fs.Fs, path string, m *manifest.Manifest, lockManager lock.LockManager) *Space { + deleteFragments := fragment.DeleteFragmentVector{} + return &Space{ + fs: f, + path: path, + manifest: m, + deleteFragments: deleteFragments, + lockManager: lockManager, + } +} + +func (s *Space) NewTransaction() transaction.Transaction { + return transaction.NewConcurrentWriteTransaction(s) +} + +func (s *Space) Write(reader array.RecordReader, options *options.WriteOptions) error { + return transaction.NewConcurrentWriteTransaction(s).Write(reader, options).Commit() +} + +func (s *Space) Delete(reader array.RecordReader) error { + return transaction.NewConcurrentWriteTransaction(s).Delete(reader).Commit() +} + +// Open 
opened a space or create if the space does not exist. +// If space does not exist. schema should not be nullptr, or an error will be returned. +// If space exists and version is specified, it will restore to the state at this version, +// or it will choose the latest version. +func Open(uri string, opt options.Options) (*Space, error) { + var f fs.Fs + var m *manifest.Manifest + var path string + f, err := fs.BuildFileSystem(uri) + if err != nil { + return nil, err + } + + path = f.Path() + log.Debug("open space", log.String("path", path)) + + log.Debug(utils.GetManifestDir(path)) + // create if not exist + if err = f.CreateDir(utils.GetManifestDir(path)); err != nil { + return nil, err + } + if err = f.CreateDir(utils.GetScalarDataDir(path)); err != nil { + return nil, err + } + if err = f.CreateDir(utils.GetVectorDataDir(path)); err != nil { + return nil, err + } + if err = f.CreateDir(utils.GetBlobDir(path)); err != nil { + return nil, err + } + if err = f.CreateDir(utils.GetDeleteDataDir(path)); err != nil { + return nil, err + } + + rw := manifest.NewManifestReaderWriter(f, path) + m, err = rw.Read(opt.Version) + if err != nil { + // create the first manifest file + if err == manifest.ErrManifestNotFound { + if opt.Schema == nil { + log.Error("schema is nil") + return nil, errors.ErrSchemaIsNil + } + if err = opt.Schema.Validate(); err != nil { + return nil, err + } + m = manifest.NewManifest(opt.Schema) + m.SetVersion(0) // TODO: check if this is necessary + if err = rw.Write(m); err != nil { + return nil, err + } + } else { + return nil, err + } + } + space := NewSpace(f, path, m, opt.LockManager) + return space, nil +} + +func (s *Space) readManifest(version int64) error { + rw := manifest.NewManifestReaderWriter(s.fs, s.path) + manifest, err := rw.Read(version) + if err != nil { + return err + } + s.manifest = manifest + return nil +} + +func (s *Space) Read(readOptions *options.ReadOptions) (array.RecordReader, error) { + if s.manifest == nil || 
readOptions.ManifestVersion != s.manifest.Version() {
+		// No manifest loaded yet, or the caller asked for another version: reload.
+		if err := s.readManifest(readOptions.ManifestVersion); err != nil {
+			return nil, err
+		}
+	}
+	if s.manifest.GetSchema().Options().HasVersionColumn() {
+		// Note that this mutates the caller's readOptions: a
+		// "version column <= MaxInt64" filter and the version column itself
+		// are appended to it.
+		f := filter.NewConstantFilter(filter.LessThanOrEqual, s.manifest.GetSchema().Options().VersionColumn, int64(math.MaxInt64))
+		readOptions.AddFilter(f)
+		readOptions.AddColumn(s.manifest.GetSchema().Options().VersionColumn)
+	}
+	log.Debug("read", log.Any("readOption", readOptions))
+
+	return record_reader.MakeRecordReader(s.manifest, s.manifest.GetSchema(), s.fs, s.deleteFragments, readOptions), nil
+}
+
+// WriteBlob runs a one-operation transaction that stores content as the blob
+// called name and commits it immediately. replace is passed through to the
+// blob operation, which rejects an existing name when it is false.
+func (s *Space) WriteBlob(content []byte, name string, replace bool) error {
+	return transaction.NewConcurrentWriteTransaction(s).WriteBlob(content, name, replace).Commit()
+}
+
+// ReadBlob reads the blob registered under name into output with a single Read
+// call on the blob's file and returns that call's byte count and error.
+// It returns -1 and errors.ErrBlobNotExist when the manifest has no such blob,
+// and -1 with the open error when the file cannot be opened.
+func (s *Space) ReadBlob(name string, output []byte) (int, error) {
+	b, ok := s.manifest.GetBlob(name)
+	if !ok {
+		return -1, errors.ErrBlobNotExist
+	}
+
+	f, err := s.fs.OpenFile(b.File)
+	if err != nil {
+		return -1, err
+	}
+	// Close the handle on every return path so repeated reads do not leak it.
+	defer f.Close()
+
+	return f.Read(output)
+}
+
+// GetBlobByteSize returns the size recorded in the manifest for the blob
+// called name, or -1 and errors.ErrBlobNotExist when there is no such blob.
+func (s *Space) GetBlobByteSize(name string) (int64, error) {
+	b, ok := s.manifest.GetBlob(name)
+	if !ok {
+		return -1, errors.ErrBlobNotExist
+	}
+	return b.Size, nil
+}
+
+// GetCurrentVersion returns the version of the manifest currently in use.
+func (s *Space) GetCurrentVersion() int64 {
+	return s.manifest.Version()
+}
+
+// ScanDelete returns a reader built by record_reader.MakeScanDeleteReader from
+// the current manifest and file system. The returned error is always nil.
+func (s *Space) ScanDelete() (array.RecordReader, error) {
+	return record_reader.MakeScanDeleteReader(s.manifest, s.fs), nil
+}
+
+// Path returns the root path of the space.
+func (s *Space) Path() string {
+	return s.path
+}
+
+// Fs returns the file system the space lives on.
+func (s *Space) Fs() fs.Fs {
+	return s.fs
+}
+
+// Manifest returns the manifest currently in use.
+func (s *Space) Manifest() *manifest.Manifest {
+	return s.manifest
+}
+
+// SetManifest replaces the manifest currently in use.
+func (s *Space) SetManifest(manifest *manifest.Manifest) {
+	s.manifest = manifest
+}
+
+// LockManager returns the lock manager of the space.
+func (s *Space) LockManager() lock.LockManager {
+	return s.lockManager
+}
+
+// SetLockManager replaces the lock manager of the space.
+func (s *Space) SetLockManager(lockManager lock.LockManager) {
+	s.lockManager = lockManager
+}
+
+// StatisticsBlobs returns the blobs listed in the current manifest.
+func (s *Space) StatisticsBlobs() []blob.Blob {
+	return s.manifest.GetBlobs()
+}
diff --git
a/internal/storagev2/storage/space_test.go b/internal/storagev2/storage/space_test.go
new file mode 100644
index 0000000000000..28413e38940b2
--- /dev/null
+++ b/internal/storagev2/storage/space_test.go
@@ -0,0 +1,343 @@
+// Copyright 2023 Zilliz
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package storage_test
+
+import (
+	"sync"
+	"testing"
+
+	"github.com/milvus-io/milvus/internal/storagev2/storage/options"
+	"github.com/milvus-io/milvus/internal/storagev2/storage/schema"
+
+	"github.com/apache/arrow/go/v12/arrow"
+	"github.com/apache/arrow/go/v12/arrow/array"
+	"github.com/apache/arrow/go/v12/arrow/memory"
+	"github.com/milvus-io/milvus/internal/storagev2/filter"
+	"github.com/milvus-io/milvus/internal/storagev2/storage"
+	"github.com/milvus-io/milvus/internal/storagev2/storage/lock"
+	"github.com/stretchr/testify/suite"
+)
+
+// SpaceTestSuite groups the tests that exercise storage.Space end to end.
+type SpaceTestSuite struct {
+	suite.Suite
+}
+
+// createSchema returns the schema shared by the tests: non-nullable int64
+// pk_field, int64 vs_field, 10-byte fixed-size binary vec_field and int64
+// column_field, with pk_field, vs_field and vec_field registered as the
+// primary, version and vector columns.
+func createSchema() *schema.Schema {
+	pkField := arrow.Field{
+		Name:     "pk_field",
+		Type:     arrow.DataType(&arrow.Int64Type{}),
+		Nullable: false,
+	}
+	vsField := arrow.Field{
+		Name:     "vs_field",
+		Type:     arrow.DataType(&arrow.Int64Type{}),
+		Nullable: false,
+	}
+	vecField := arrow.Field{
+		Name:     "vec_field",
+		Type:     arrow.DataType(&arrow.FixedSizeBinaryType{ByteWidth: 10}),
+		Nullable: false,
+	}
+	columnField := arrow.Field{
+		Name:     "column_field",
+		Type:     arrow.DataType(&arrow.Int64Type{}),
+		Nullable: false,
+	}
+	fields := []arrow.Field{pkField, vsField, vecField,
columnField}
+
+	as := arrow.NewSchema(fields, nil)
+	schemaOptions := &schema.SchemaOptions{
+		PrimaryColumn: "pk_field",
+		VersionColumn: "vs_field",
+		VectorColumn:  "vec_field",
+	}
+
+	sc := schema.NewSchema(as, schemaOptions)
+	return sc
+}
+
+// recordReader returns a reader over a single three-row record laid out like
+// createSchema: pk_field, vs_field and column_field all hold 1, 2, 3 and every
+// vec_field row holds the same ten bytes. It panics if the reader cannot be
+// built.
+func recordReader() array.RecordReader {
+	pkBuilder := array.NewInt64Builder(memory.DefaultAllocator)
+	pkBuilder.AppendValues([]int64{1, 2, 3}, nil)
+	pkArr := pkBuilder.NewArray()
+
+	vsBuilder := array.NewInt64Builder(memory.DefaultAllocator)
+	vsBuilder.AppendValues([]int64{1, 2, 3}, nil)
+	vsArr := vsBuilder.NewArray()
+
+	vecBuilder := array.NewFixedSizeBinaryBuilder(memory.DefaultAllocator, &arrow.FixedSizeBinaryType{ByteWidth: 10})
+	vecBuilder.AppendValues([][]byte{
+		{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+		{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+		{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+	}, nil)
+	vecArr := vecBuilder.NewArray()
+
+	columnBuilder := array.NewInt64Builder(memory.DefaultAllocator)
+	columnBuilder.AppendValues([]int64{1, 2, 3}, nil)
+	columnArr := columnBuilder.NewArray()
+
+	// Column order must follow the field order used by createSchema.
+	arrs := []arrow.Array{pkArr, vsArr, vecArr, columnArr}
+
+	rec := array.NewRecord(createSchema().Schema(), arrs, 3)
+	recReader, err := array.NewRecordReader(createSchema().Schema(), []arrow.Record{rec})
+	if err != nil {
+		panic(err)
+	}
+	return recReader
+}
+
+// deleteRecordReader returns a reader over a single one-row record with the
+// columns pk_field and vs_field, both holding the value 1. It panics if the
+// reader cannot be built.
+func deleteRecordReader() array.RecordReader {
+	pkField := arrow.Field{
+		Name:     "pk_field",
+		Type:     arrow.DataType(&arrow.Int64Type{}),
+		Nullable: false,
+	}
+	vsField := arrow.Field{
+		Name:     "vs_field",
+		Type:     arrow.DataType(&arrow.Int64Type{}),
+		Nullable: false,
+	}
+
+	deleteArrowSchema := arrow.NewSchema([]arrow.Field{pkField, vsField}, nil)
+
+	deletePkBuilder := array.NewInt64Builder(memory.DefaultAllocator)
+	deletePkBuilder.AppendValues([]int64{1}, nil)
+	deletePkArr := deletePkBuilder.NewArray()
+
+	deleteVsBuilder := array.NewInt64Builder(memory.DefaultAllocator)
+	deleteVsBuilder.AppendValues([]int64{1}, nil)
+	deleteVsArr := deleteVsBuilder.NewArray()
+
+	deleteArray
:= []arrow.Array{deletePkArr, deleteVsArr}
+	rec := array.NewRecord(deleteArrowSchema, deleteArray, 1)
+	recReader, err := array.NewRecordReader(deleteArrowSchema, []arrow.Record{rec})
+	if err != nil {
+		panic(err)
+	}
+	return recReader
+}
+
+// TestSpaceReadWrite opens a fresh space in a temporary directory, writes the
+// three-row fixture, reads pk_field back through a pk_field == 1 filter and
+// expects exactly the value 1.
+func (suite *SpaceTestSuite) TestSpaceReadWrite() {
+	sc := createSchema()
+	err := sc.Validate()
+	suite.NoError(err)
+
+	opts := options.NewSpaceOptionBuilder().SetSchema(sc).SetVersion(0).Build()
+
+	space, err := storage.Open("file:///"+suite.T().TempDir(), opts)
+	suite.NoError(err)
+
+	writeOpt := &options.WriteOptions{MaxRecordPerFile: 1000}
+	err = space.Write(recordReader(), writeOpt)
+	suite.NoError(err)
+
+	f := filter.NewConstantFilter(filter.Equal, "pk_field", int64(1))
+	readOpt := options.NewReadOptions()
+	readOpt.AddFilter(f)
+	readOpt.AddColumn("pk_field")
+	readReader, err := space.Read(readOpt)
+	suite.NoError(err)
+	// Collect the first column of every record the reader yields.
+	var resVals []int64
+	for readReader.Next() {
+		rec := readReader.Record()
+		cols := rec.Columns()
+		values := cols[0].(*array.Int64).Int64Values()
+		resVals = append(resVals, values...)
+	}
+
+	suite.ElementsMatch([]int64{1}, resVals)
+}
+
+// TestSpaceReadWriteConcurrency issues 100 writes to the same space from
+// separate goroutines, using the in-memory lock manager, and expects every one
+// of them to succeed.
+func (suite *SpaceTestSuite) TestSpaceReadWriteConcurrency() {
+	sc := createSchema()
+	err := sc.Validate()
+	suite.NoError(err)
+
+	opts := options.Options{
+		Version:     0,
+		LockManager: lock.NewMemoryLockManager(),
+		Schema:      sc,
+	}
+
+	space, err := storage.Open("file:///"+suite.T().TempDir(), opts)
+	suite.NoError(err)
+
+	writeOpt := &options.WriteOptions{MaxRecordPerFile: 1000}
+
+	const writers = 100
+	// One slot per goroutine: no two goroutines touch the same element, so the
+	// results can be collected without sharing a single error variable.
+	writeErrs := make([]error, writers)
+	var wg sync.WaitGroup
+	for i := 0; i < writers; i++ {
+		wg.Add(1)
+		go func(slot int) {
+			defer wg.Done()
+			writeErrs[slot] = space.Write(recordReader(), writeOpt)
+		}(i)
+	}
+
+	wg.Wait()
+	// Assert on the test goroutine, after all writers have finished.
+	for _, writeErr := range writeErrs {
+		suite.NoError(writeErr)
+	}
+}
+
+// TestSpaceDelete opens a fresh space and checks that deleting the one-row
+// delete fixture commits without error.
+func (suite *SpaceTestSuite) TestSpaceDelete() {
+	sc := createSchema()
+	err := sc.Validate()
+	suite.NoError(err)
+
+	opts := options.NewSpaceOptionBuilder().SetSchema(sc).SetVersion(0).Build()
+
+	space, err := storage.Open("file:///"+suite.T().TempDir(), opts)
+	suite.NoError(err)
+
+	err = space.Delete(deleteRecordReader())
+	suite.NoError(err)
+}
+
+// TestSpaceReadWithFilter writes the three-row fixture once and then reads
+// pk_field back through a series of constant filters on pk_field, checking the
+// surviving values for each comparison operator.
+func (suite *SpaceTestSuite) TestSpaceReadWithFilter() {
+	sc := createSchema()
+	err := sc.Validate()
+	suite.NoError(err)
+
+	opts := options.NewSpaceOptionBuilder().SetSchema(sc).SetVersion(0).Build()
+
+	space, err := storage.Open("file:///"+suite.T().TempDir(), opts)
+	suite.NoError(err)
+
+	writeOpt := &options.WriteOptions{MaxRecordPerFile: 1000}
+	err = space.Write(recordReader(), writeOpt)
+	suite.NoError(err)
+
+	// pk_field == 1 keeps only 1.
+	f := filter.NewConstantFilter(filter.Equal, "pk_field", int64(1))
+	readOpt := options.NewReadOptions()
+	readOpt.AddFilter(f)
+	readOpt.AddColumn("pk_field")
+	readReader, err := space.Read(readOpt)
+	suite.NoError(err)
+	var resValues []int64
+	for readReader.Next() {
+		rec := readReader.Record()
+		cols := rec.Columns()
+		values := cols[0].(*array.Int64).Int64Values()
+		resValues = append(resValues, values...)
+	}
+	suite.ElementsMatch([]int64{1}, resValues)
+
+	// pk_field > 1 keeps 2 and 3.
+	f = filter.NewConstantFilter(filter.GreaterThan, "pk_field", int64(1))
+	readOpt = options.NewReadOptions()
+	readOpt.AddFilter(f)
+	readOpt.AddColumn("pk_field")
+	readReader, err = space.Read(readOpt)
+	suite.NoError(err)
+	resValues = []int64{}
+	for readReader.Next() {
+		rec := readReader.Record()
+		cols := rec.Columns()
+		values := cols[0].(*array.Int64).Int64Values()
+		resValues = append(resValues, values...)
+	}
+	suite.ElementsMatch([]int64{2, 3}, resValues)
+
+	// pk_field != 1 keeps 2 and 3.
+	f = filter.NewConstantFilter(filter.NotEqual, "pk_field", int64(1))
+	readOpt = options.NewReadOptions()
+	readOpt.AddFilter(f)
+	readOpt.AddColumn("pk_field")
+	readReader, err = space.Read(readOpt)
+	suite.NoError(err)
+	resValues = []int64{}
+	for readReader.Next() {
+		rec := readReader.Record()
+		cols := rec.Columns()
+		values := cols[0].(*array.Int64).Int64Values()
+		resValues = append(resValues, values...)
+	}
+	suite.ElementsMatch([]int64{2, 3}, resValues)
+
+	// pk_field < 1 keeps nothing.
+	f = filter.NewConstantFilter(filter.LessThan, "pk_field", int64(1))
+	readOpt = options.NewReadOptions()
+	readOpt.AddFilter(f)
+	readOpt.AddColumn("pk_field")
+	readReader, err = space.Read(readOpt)
+	suite.NoError(err)
+	resValues = []int64{}
+	for readReader.Next() {
+		rec := readReader.Record()
+		cols := rec.Columns()
+		values := cols[0].(*array.Int64).Int64Values()
+		resValues = append(resValues, values...)
+	}
+	suite.ElementsMatch([]int64{}, resValues)
+
+	// NOTE(review): this case repeats the pk_field < 1 check directly above
+	// with the same operator, value and expectation.
+	f = filter.NewConstantFilter(filter.LessThan, "pk_field", int64(1))
+	readOpt = options.NewReadOptions()
+	readOpt.AddFilter(f)
+	readOpt.AddColumn("pk_field")
+	readReader, err = space.Read(readOpt)
+	suite.NoError(err)
+	resValues = []int64{}
+	for readReader.Next() {
+		rec := readReader.Record()
+		cols := rec.Columns()
+		values := cols[0].(*array.Int64).Int64Values()
+		resValues = append(resValues, values...)
+	}
+	suite.ElementsMatch([]int64{}, resValues)
+
+	// pk_field <= 1 keeps only 1.
+	f = filter.NewConstantFilter(filter.LessThanOrEqual, "pk_field", int64(1))
+	readOpt = options.NewReadOptions()
+	readOpt.AddFilter(f)
+	readOpt.AddColumn("pk_field")
+	readReader, err = space.Read(readOpt)
+	suite.NoError(err)
+	resValues = []int64{}
+	for readReader.Next() {
+		rec := readReader.Record()
+		cols := rec.Columns()
+		values := cols[0].(*array.Int64).Int64Values()
+		resValues = append(resValues, values...)
+	}
+	suite.ElementsMatch([]int64{1}, resValues)
+
+	// pk_field >= 1 keeps all three rows.
+	f = filter.NewConstantFilter(filter.GreaterThanOrEqual, "pk_field", int64(1))
+	readOpt = options.NewReadOptions()
+	readOpt.AddFilter(f)
+	readOpt.AddColumn("pk_field")
+	readReader, err = space.Read(readOpt)
+	suite.NoError(err)
+	resValues = []int64{}
+	for readReader.Next() {
+		rec := readReader.Record()
+		cols := rec.Columns()
+		values := cols[0].(*array.Int64).Int64Values()
+		resValues = append(resValues, values...)
+	}
+	suite.ElementsMatch([]int64{1, 2, 3}, resValues)
+
+	// pk_field > 2 keeps only 3.
+	f = filter.NewConstantFilter(filter.GreaterThan, "pk_field", int64(2))
+	readOpt = options.NewReadOptions()
+	readOpt.AddFilter(f)
+	readOpt.AddColumn("pk_field")
+	readReader, err = space.Read(readOpt)
+	suite.NoError(err)
+	resValues = []int64{}
+	for readReader.Next() {
+		rec := readReader.Record()
+		cols := rec.Columns()
+		values := cols[0].(*array.Int64).Int64Values()
+		resValues = append(resValues, values...)
+	}
+	suite.ElementsMatch([]int64{3}, resValues)
+}
+
+// TestSpaceTestSuite runs SpaceTestSuite under go test.
+func TestSpaceTestSuite(t *testing.T) {
+	suite.Run(t, new(SpaceTestSuite))
+}
diff --git a/internal/storagev2/storage/transaction/transaction.go b/internal/storagev2/storage/transaction/transaction.go
new file mode 100644
index 0000000000000..4df7e9343af10
--- /dev/null
+++ b/internal/storagev2/storage/transaction/transaction.go
@@ -0,0 +1,326 @@
+// Copyright 2023 Zilliz
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package transaction
+
+import (
+	"fmt"
+
+	"github.com/apache/arrow/go/v12/arrow"
+	"github.com/apache/arrow/go/v12/arrow/array"
+	"github.com/apache/arrow/go/v12/arrow/memory"
+	"github.com/milvus-io/milvus/internal/storagev2/common/errors"
+	"github.com/milvus-io/milvus/internal/storagev2/common/log"
+	"github.com/milvus-io/milvus/internal/storagev2/common/utils"
+	"github.com/milvus-io/milvus/internal/storagev2/file/blob"
+	"github.com/milvus-io/milvus/internal/storagev2/file/fragment"
+	"github.com/milvus-io/milvus/internal/storagev2/io/format"
+	"github.com/milvus-io/milvus/internal/storagev2/io/format/parquet"
+	"github.com/milvus-io/milvus/internal/storagev2/io/fs"
+	"github.com/milvus-io/milvus/internal/storagev2/storage/lock"
+	"github.com/milvus-io/milvus/internal/storagev2/storage/manifest"
+	"github.com/milvus-io/milvus/internal/storagev2/storage/options"
+)
+
+// SpaceMeta is the part of a space that transactions and their operations
+// depend on: its root path, file system, current manifest and lock manager,
+// plus a way to install the manifest produced by a commit.
+type SpaceMeta interface {
+	Path() string
+	Fs() fs.Fs
+	Manifest() *manifest.Manifest
+	LockManager()
lock.LockManager
+	SetManifest(manifest *manifest.Manifest)
+}
+
+// Transaction collects write, delete and blob operations and applies them
+// together on Commit. The builder methods return the transaction itself so
+// that calls can be chained.
+type Transaction interface {
+	Write(reader array.RecordReader, options *options.WriteOptions) Transaction
+	Delete(reader array.RecordReader) Transaction
+	WriteBlob(content []byte, name string, replace bool) Transaction
+	Commit() error
+}
+
+// ConcurrentWriteTransaction is the Transaction implementation used by spaces.
+// It keeps the queued operations, the manifest commit they register their
+// changes with, and the space they act on.
+type ConcurrentWriteTransaction struct {
+	operations []Operation
+	commit     manifest.ManifestCommit
+	space      SpaceMeta
+}
+
+// Write queues a write of the records from reader using options. Nothing is
+// written until Commit is called.
+func (t *ConcurrentWriteTransaction) Write(reader array.RecordReader, options *options.WriteOptions) Transaction {
+	operation := &WriteOperation{
+		reader:      reader,
+		options:     options,
+		space:       t.space,
+		transaction: t,
+	}
+	t.operations = append(t.operations, operation)
+	return t
+}
+
+// Delete queues a delete fed by the records from reader. Nothing is written
+// until Commit is called.
+func (t *ConcurrentWriteTransaction) Delete(reader array.RecordReader) Transaction {
+	operation := &DeleteOperation{
+		reader:      reader,
+		space:       t.space,
+		transaction: t,
+	}
+	t.operations = append(t.operations, operation)
+	return t
+}
+
+// WriteBlob queues storing content as the blob called name; replace is handed
+// to the blob operation. Nothing is written until Commit is called.
+func (t *ConcurrentWriteTransaction) WriteBlob(content []byte, name string, replace bool) Transaction {
+	operation := &WriteBlobOperation{
+		content:     content,
+		name:        name,
+		replace:     replace,
+		space:       t.space,
+		transaction: t,
+	}
+	t.operations = append(t.operations, operation)
+	return t
+}
+
+// Commit executes the queued operations in the order they were added and then
+// commits the manifest changes they registered. The first operation error
+// aborts the transaction: it is returned and nothing is committed. On success
+// the space is switched to the manifest returned by the commit.
+func (t *ConcurrentWriteTransaction) Commit() error {
+	for _, op := range t.operations {
+		// Stop at the first failure so that a partially executed transaction
+		// is never committed and the caller learns why it failed.
+		if err := op.Execute(); err != nil {
+			return err
+		}
+	}
+	nxtManifest, err := t.commit.Commit()
+	if err != nil {
+		return err
+	}
+	t.space.SetManifest(nxtManifest)
+	return nil
+}
+
+// NewConcurrentWriteTransaction returns an empty transaction on space whose
+// manifest commit uses the space's lock manager and a manifest reader/writer
+// on the space's file system and path.
+func NewConcurrentWriteTransaction(space SpaceMeta) *ConcurrentWriteTransaction {
+	return &ConcurrentWriteTransaction{
+		operations: make([]Operation, 0),
+		commit:     manifest.NewManifestCommit(space.LockManager(), manifest.NewManifestReaderWriter(space.Fs(), space.Path())),
+		space:      space,
+	}
+}
+
+// Operation is one unit of work queued on a transaction.
+type Operation interface {
+	Execute() error
+}
+
+// WriteOperation writes the records of one reader into a space.
+type WriteOperation struct {
+	reader      array.RecordReader
+	options     *options.WriteOptions
+	space       SpaceMeta
+	transaction
*ConcurrentWriteTransaction +} + +func (w *WriteOperation) Execute() error { + if !w.space.Manifest().GetSchema().Schema().Equal(w.reader.Schema()) { + return errors.ErrSchemaNotMatch + } + + scalarSchema, vectorSchema := w.space.Manifest().GetSchema().ScalarSchema(), w.space.Manifest().GetSchema().VectorSchema() + var ( + scalarWriter format.Writer + vectorWriter format.Writer + ) + scalarFragment := fragment.NewFragment() + vectorFragment := fragment.NewFragment() + + isEmpty := true + for w.reader.Next() { + rec := w.reader.Record() + + if rec.NumRows() == 0 { + continue + } + + var err error + scalarWriter, err = w.write(scalarSchema, rec, scalarWriter, &scalarFragment, w.options, true) + if err != nil { + return err + } + vectorWriter, err = w.write(vectorSchema, rec, vectorWriter, &vectorFragment, w.options, false) + if err != nil { + return err + } + isEmpty = false + } + + if scalarWriter != nil { + if err := scalarWriter.Close(); err != nil { + return err + } + } + if vectorWriter != nil { + if err := vectorWriter.Close(); err != nil { + return err + } + } + + if isEmpty { + return nil + } + + op1 := manifest.AddScalarFragmentOp{ScalarFragment: scalarFragment} + op2 := manifest.AddVectorFragmentOp{VectorFragment: vectorFragment} + w.transaction.commit.AddOp(op1, op2) + return nil +} + +func (w *WriteOperation) write( + schema *arrow.Schema, + rec arrow.Record, + writer format.Writer, + fragment *fragment.Fragment, + opt *options.WriteOptions, + isScalar bool, +) (format.Writer, error) { + var columns []arrow.Array + cols := rec.Columns() + for k := range cols { + _, has := schema.FieldsByName(rec.ColumnName(k)) + if has { + columns = append(columns, cols[k]) + } + } + + var rootPath string + if isScalar { + // add offset column for scalar + offsetValues := make([]int64, rec.NumRows()) + for i := 0; i < int(rec.NumRows()); i++ { + offsetValues[i] = int64(i) + } + builder := array.NewInt64Builder(memory.DefaultAllocator) + builder.AppendValues(offsetValues, 
nil)
+		offsetColumn := builder.NewArray()
+		columns = append(columns, offsetColumn)
+		rootPath = utils.GetScalarDataDir(w.space.Path())
+	} else {
+		rootPath = utils.GetVectorDataDir(w.space.Path())
+	}
+
+	var err error
+
+	record := array.NewRecord(schema, columns, rec.NumRows())
+
+	if writer == nil {
+		filePath := utils.GetNewParquetFilePath(rootPath)
+		writer, err = parquet.NewFileWriter(schema, w.space.Fs(), filePath)
+		if err != nil {
+			return nil, err
+		}
+		fragment.AddFile(filePath)
+	}
+
+	err = writer.Write(record)
+	if err != nil {
+		// nil is returned in place of the writer, so this is the last chance
+		// to close it. Best effort: the write error is the one reported.
+		_ = writer.Close()
+		return nil, err
+	}
+
+	if writer.Count() >= opt.MaxRecordPerFile {
+		log.Debug("close writer", log.Any("count", writer.Count()))
+		err = writer.Close()
+		if err != nil {
+			return nil, err
+		}
+		// Returning nil makes the next call start a new file.
+		writer = nil
+	}
+
+	return writer, nil
+}
+
+// DeleteOperation writes the delete records of one reader into a space.
+type DeleteOperation struct {
+	reader      array.RecordReader
+	space       SpaceMeta
+	transaction *ConcurrentWriteTransaction
+}
+
+// Execute writes every non-empty record from the reader into a single new
+// parquet file under the delete data directory, using the space's delete
+// schema. The file is created lazily on the first non-empty record; if one was
+// created, it is closed and registered with the transaction's manifest commit
+// as a delete fragment. The first create, write or close error is returned,
+// and a writer whose Write fails is closed before returning.
+func (o *DeleteOperation) Execute() error {
+	schema := o.space.Manifest().GetSchema().DeleteSchema()
+	fragment := fragment.NewFragment()
+	var (
+		err        error
+		writer     format.Writer
+		deleteFile string
+	)
+
+	for o.reader.Next() {
+		rec := o.reader.Record()
+		if rec.NumRows() == 0 {
+			continue
+		}
+
+		if writer == nil {
+			deleteFile = utils.GetNewParquetFilePath(utils.GetDeleteDataDir(o.space.Path()))
+			writer, err = parquet.NewFileWriter(schema, o.space.Fs(), deleteFile)
+			if err != nil {
+				return err
+			}
+			fragment.AddFile(deleteFile)
+		}
+
+		if err = writer.Write(rec); err != nil {
+			// Do not leave the writer open on the error path. Best effort:
+			// the write error is the one reported.
+			_ = writer.Close()
+			return err
+		}
+	}
+
+	if writer != nil {
+		if err = writer.Close(); err != nil {
+			return err
+		}
+
+		op := manifest.AddDeleteFragmentOp{DeleteFragment: fragment}
+		o.transaction.commit.AddOp(op)
+	}
+	return nil
+}
+
+// WriteBlobOperation stores one named blob in a space.
+type WriteBlobOperation struct {
+	content     []byte
+	name        string
+	replace     bool
+	space       SpaceMeta
+	transaction *ConcurrentWriteTransaction
+}
+
+// Execute writes the content to a new blob file and registers the blob (name,
+// size and file) with the transaction's manifest commit. It fails with
+// errors.ErrBlobAlreadyExist when replace is false and the manifest already
+// has a blob with this name.
+func (o *WriteBlobOperation) Execute() error {
+	if !o.replace && o.space.Manifest().HasBlob(o.name) {
+		return
errors.ErrBlobAlreadyExist
+	}
+
+	blobFile := utils.GetBlobFilePath(o.space.Path())
+	f, err := o.space.Fs().OpenFile(blobFile)
+	if err != nil {
+		return err
+	}
+
+	n, err := f.Write(o.content)
+	if err != nil {
+		// Do not leave the file open on the error path. Best effort: the
+		// write error is the one reported.
+		_ = f.Close()
+		return err
+	}
+
+	if n != len(o.content) {
+		// A short write is reported as an error; close the file first so the
+		// handle is not leaked.
+		_ = f.Close()
+		return fmt.Errorf("blob not writen completely, writen %d but expect %d", n, len(o.content))
+	}
+
+	if err = f.Close(); err != nil {
+		return err
+	}
+
+	// Only a fully written and closed file is registered in the manifest.
+	op := manifest.AddBlobOp{
+		Replace: o.replace,
+		Blob: blob.Blob{
+			Name: o.name,
+			Size: int64(len(o.content)),
+			File: blobFile,
+		},
+	}
+	o.transaction.commit.AddOp(op)
+	return nil
+}
diff --git a/internal/storagev2/storage/transaction/transaction_test.go b/internal/storagev2/storage/transaction/transaction_test.go
new file mode 100644
index 0000000000000..fd0899dd9105a
--- /dev/null
+++ b/internal/storagev2/storage/transaction/transaction_test.go
@@ -0,0 +1,15 @@
+// Copyright 2023 Zilliz
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +package transaction diff --git a/scripts/core_build.sh b/scripts/core_build.sh index 7169ce05c7da6..fecdc3acdf2c5 100755 --- a/scripts/core_build.sh +++ b/scripts/core_build.sh @@ -101,6 +101,7 @@ USE_ASAN="OFF" USE_DYNAMIC_SIMD="ON" USE_OPENDAL="OFF" INDEX_ENGINE="KNOWHERE" +ENABLE_AZURE_FS="OFF" : "${ENABLE_GCP_NATIVE:="OFF"}" while getopts "p:d:t:s:f:n:i:y:a:x:o:ulrcghzmebZ" arg; do @@ -257,7 +258,8 @@ ${CMAKE_EXTRA_ARGS} \ -DCPU_ARCH=${CPU_ARCH} \ -DUSE_OPENDAL=${USE_OPENDAL} \ -DINDEX_ENGINE=${INDEX_ENGINE} \ --DENABLE_GCP_NATIVE=${ENABLE_GCP_NATIVE} " +-DENABLE_GCP_NATIVE=${ENABLE_GCP_NATIVE} \ +-DENABLE_AZURE_FS=${ENABLE_AZURE_FS} " if [ -z "$BUILD_WITHOUT_AZURE" ]; then CMAKE_CMD=${CMAKE_CMD}"-DAZURE_BUILD_DIR=${AZURE_BUILD_DIR} \ -DVCPKG_TARGET_TRIPLET=${VCPKG_TARGET_TRIPLET} "