From 2958c7d5c3d4cc84e1e55d0bed13df326bc492e8 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 15 Aug 2024 09:40:36 -0600 Subject: [PATCH 01/17] Store: Add object_store backend. --- Cargo.lock | 1285 +++++++++++++++++++++++++++++++++++++++++++++--- Cargo.toml | 3 + src/dataset.rs | 188 +++++-- src/lib.rs | 9 +- src/storage.rs | 196 +++++++- 5 files changed, 1578 insertions(+), 103 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ecb2df47..d7741ecb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -40,6 +40,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + [[package]] name = "android-tzdata" version = "0.1.1" @@ -278,7 +293,7 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -290,6 +305,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + [[package]] name = "autocfg" version = "1.3.0" @@ -344,6 +365,36 @@ version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "brotli" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74f7971dbd9326d58187408ab83117d8ac1bb9c17b085fdacd1cf2f598719b6b" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "4.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a45bd2e4095a8b518033b128020dd4a55aab1c0a381ba4404a472630f4bc362" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + [[package]] name = "bumpalo" version = "3.16.0" @@ -367,6 +418,10 @@ name = "cc" version = "1.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26a5c3fd7bfa1ce3897a3a3501d362b2d87b7f2583ebcb4a949ec25911025cbc" +dependencies = [ + "jobserver", + "libc", +] [[package]] name = "cfg-if" @@ -383,7 +438,8 @@ dependencies = [ "android-tzdata", "iana-time-zone", "num-traits", - "windows-targets", + "serde", + "windows-targets 0.52.6", ] [[package]] @@ -417,18 +473,47 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation-sys" version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +[[package]] +name = "crc32fast" +version = 
"1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + [[package]] name = "crunchy" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + [[package]] name = "csv" version = "1.3.0" @@ -456,6 +541,22 @@ version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + [[package]] name = "either" version = "1.13.0" @@ -475,7 +576,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.52.0", ] [[package]] @@ -494,12 +595,31 @@ dependencies = [ "rustc_version", ] +[[package]] +name = "flate2" +version = "1.0.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + [[package]] name = "futures" version = "0.3.30" @@ -556,7 +676,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -589,6 +709,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.15" @@ -606,6 +736,25 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" +[[package]] +name = "h2" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "half" version = "2.4.1" @@ -623,12 +772,128 @@ version = 
"0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "http" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http", +] + +[[package]] +name = "http-body-util" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" +dependencies = [ + "bytes", + "futures-util", + "http", + "http-body", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "hyper" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "itoa", + "pin-project-lite", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155" +dependencies = [ + "futures-util", + "http", + "hyper", + "hyper-util", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9" +dependencies = [ + "bytes", + "futures-channel", + "futures-util", + "http", + "http-body", + "hyper", + "pin-project-lite", + "socket2", + "tokio", + "tower", + "tower-service", + "tracing", +] + [[package]] name = "iana-time-zone" version = "0.1.60" @@ -658,9 +923,12 @@ version = "0.1.0" dependencies = [ "arrow", "async-trait", + "base64", "bytes", "futures", "itertools", + "object_store", + "parquet", "pretty_assertions", "proptest", "rand", @@ -668,6 +936,16 @@ dependencies = [ "tokio", ] +[[package]] +name = "idna" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + [[package]] name = "indexmap" version = "2.2.6" @@ -678,6 +956,18 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "ipnet" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" + [[package]] name = "itertools" version = "0.13.0" @@ -693,6 +983,15 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jobserver" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" +dependencies = [ + "libc", +] + [[package]] name = "js-sys" version = "0.3.69" @@ -790,18 +1089,53 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + [[package]] name = "log" version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +[[package]] +name = "lz4_flex" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5" +dependencies = [ + "twox-hash", +] + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + [[package]] name = "memchr" version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + [[package]] name = "miniz_oxide" version = "0.7.4" @@ -811,6 +1145,18 @@ dependencies = [ "adler", ] +[[package]] +name = "mio" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" +dependencies = [ + "hermit-abi", + "libc", + "wasi", + "windows-sys 0.52.0", +] + [[package]] name = "num" version = "0.4.3" @@ -895,65 +1241,207 @@ dependencies = [ ] [[package]] -name = "once_cell" -version = "1.19.0" +name = "object_store" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +checksum = "e6da452820c715ce78221e8202ccc599b4a52f3e1eb3eedb487b680c81a8e3f3" +dependencies = [ + "async-trait", + "base64", + "bytes", + "chrono", + "futures", + "humantime", + "hyper", + "itertools", + "md-5", + 
"parking_lot", + "percent-encoding", + "quick-xml", + "rand", + "reqwest", + "ring", + "serde", + "serde_json", + "snafu", + "tokio", + "tracing", + "url", + "walkdir", +] [[package]] -name = "pin-project-lite" -version = "0.2.14" +name = "once_cell" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] -name = "pin-utils" -version = "0.1.0" +name = "openssl-probe" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] -name = "ppv-lite86" -version = "0.2.18" +name = "ordered-float" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee4364d9f3b902ef14fab8a1ddffb783a1cb6b4bba3bfc1fa3922732c7de97f" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" dependencies = [ - "zerocopy 0.6.6", + "num-traits", ] [[package]] -name = "pretty_assertions" -version = "1.4.0" +name = "parking_lot" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af7cee1a6c8a5b9208b3cb1061f10c0cb689087b3d8ce85fb9d2dd7a29b6ba66" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" dependencies = [ - "diff", - "yansi", + "lock_api", + "parking_lot_core", ] [[package]] -name = "proc-macro2" -version = "1.0.86" +name = "parking_lot_core" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ - "unicode-ident", + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets 0.52.6", ] [[package]] -name = "proptest" -version = "1.5.0" +name = "parquet" +version = "52.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4c2511913b88df1637da85cc8d96ec8e43a3f8bb8ccb71ee1ac240d6f3df58d" +checksum = "e977b9066b4d3b03555c22bdc442f3fadebd96a39111249113087d0edb2691cd" dependencies = [ - "bit-set", - "bit-vec", - "bitflags 2.6.0", - "lazy_static", - "num-traits", - "rand", - "rand_chacha", - "rand_xorshift", + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "arrow-select", + "base64", + "brotli", + "bytes", + "chrono", + "flate2", + "futures", + "half", + "hashbrown", + "lz4_flex", + "num", + "num-bigint", + "object_store", + "paste", + "seq-macro", + "snap", + "thrift", + "tokio", + "twox-hash", + "zstd", + "zstd-sys", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "pin-project" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = 
"pin-project-internal" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.72", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkg-config" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" + +[[package]] +name = "ppv-lite86" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dee4364d9f3b902ef14fab8a1ddffb783a1cb6b4bba3bfc1fa3922732c7de97f" +dependencies = [ + "zerocopy 0.6.6", +] + +[[package]] +name = "pretty_assertions" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af7cee1a6c8a5b9208b3cb1061f10c0cb689087b3d8ce85fb9d2dd7a29b6ba66" +dependencies = [ + "diff", + "yansi", +] + +[[package]] +name = "proc-macro2" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "proptest" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c2511913b88df1637da85cc8d96ec8e43a3f8bb8ccb71ee1ac240d6f3df58d" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags 2.6.0", + "lazy_static", + "num-traits", + "rand", + "rand_chacha", + "rand_xorshift", "regex-syntax", "rusty-fork", "tempfile", @@ -966,6 +1454,64 @@ version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" +[[package]] +name = "quick-xml" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96a05e2e8efddfa51a84ca47cec303fac86c8541b686d37cac5efc0e094417bc" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quinn" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b22d8e7369034b9a7132bc2008cac12f2013c8132b45e0554e6e20e2617f2156" +dependencies = [ + "bytes", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls", + "socket2", + "thiserror", + "tokio", + "tracing", +] + +[[package]] +name = "quinn-proto" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba92fb39ec7ad06ca2582c0ca834dfeadcaf06ddfc8e635c80aa7e1c05315fdd" +dependencies = [ + "bytes", + "rand", + "ring", + "rustc-hash", + "rustls", + "slab", + "thiserror", + "tinyvec", + "tracing", +] + +[[package]] +name = "quinn-udp" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bffec3605b73c6f1754535084a85229fa8a30f86014e6c81aeec4abb68b0285" +dependencies = [ + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.52.0", +] + [[package]] name = "quote" version = "1.0.36" @@ -1014,6 +1560,15 @@ dependencies = [ "rand_core", ] +[[package]] +name = "redox_syscall" +version = "0.5.3" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a908a6e00f1fdd0dfd9c0eb08ce85126f6d8bbda50017e74bc4a4b7d4a926a4" +dependencies = [ + "bitflags 2.6.0", +] + [[package]] name = "regex" version = "1.10.5" @@ -1043,12 +1598,78 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +[[package]] +name = "reqwest" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" +dependencies = [ + "base64", + "bytes", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-util", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls", + "rustls-native-certs", + "rustls-pemfile", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls", + "tokio-util", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + "winreg", +] + +[[package]] +name = "ring" +version = "0.17.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +dependencies = [ + "cc", + "cfg-if", + "getrandom", + "libc", + "spin", + "untrusted", + "windows-sys 0.52.0", +] + [[package]] name = "rustc-demangle" version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +[[package]] +name = "rustc-hash" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" + [[package]] name = "rustc_version" version = "0.4.0" @@ -1068,7 +1689,61 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls" +version = "0.23.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c58f8c84392efc0a126acce10fa59ff7b3d2ac06ab451a33f2741989b806b044" +dependencies = [ + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-native-certs" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a88d6d420651b496bdd98684116959239430022a115c1240e6c3993be0b15fba" +dependencies = [ + "openssl-probe", + "rustls-pemfile", + "rustls-pki-types", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pemfile" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "196fe16b00e106300d3e45ecfcb764fa292a535d7326a29a5875c579c7417425" +dependencies = [ + "base64", + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0" + +[[package]] +name = "rustls-webpki" +version = "0.102.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e6b52d4fda176fd835fdc55a835d4a89b8499cad995885a21149d5ad62f852e" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", ] [[package]] @@ -1095,12 +1770,65 @@ version = "1.0.18" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags 2.6.0", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75da29fe9b9b08fe9d6b22b5b4bcbc75d8db3aa31e639aa56bb62e9d46bfceaf" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "semver" version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" +[[package]] +name = "seq-macro" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" + [[package]] name = "serde" version = "1.0.204" @@ -1118,7 +1846,7 @@ checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -1133,6 +1861,18 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + [[package]] name = "slab" version = "0.4.9" @@ -1142,6 +1882,56 @@ dependencies = [ "autocfg", ] +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "snafu" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" +dependencies = [ + "doc-comment", + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf" +dependencies = [ + "heck 0.4.1", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + +[[package]] +name = "socket2" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + [[package]] name = "static_assertions" version = "1.1.0" @@ -1160,11 +1950,28 @@ version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck", + "heck 0.5.0", "proc-macro2", "quote", "rustversion", - "syn", + "syn 2.0.72", +] + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", ] [[package]] @@ -1178,6 +1985,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" + [[package]] name = "tempfile" version = "3.11.0" @@ -1188,7 +2001,38 @@ dependencies = [ "fastrand", "once_cell", "rustix", - "windows-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "thiserror" +version = "1.0.63" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.63" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.72", +] + +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "ordered-float", ] [[package]] @@ -1220,6 +2064,21 @@ dependencies = [ "crunchy", ] +[[package]] +name = "tinyvec" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + [[package]] name = "tokio" version = "1.39.2" @@ -1227,8 +2086,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daa4fb1bc778bd6f04cbfc4bb2d06a7396a8f299dc33ea1900cedaa316f467b1" dependencies = [ "backtrace", + "bytes", + "libc", + "mio", "pin-project-lite", + "socket2", "tokio-macros", + "windows-sys 0.52.0", ] [[package]] @@ -1239,27 +2103,163 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] +[[package]] +name = "tokio-rustls" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" +dependencies = [ + "rustls", + "rustls-pki-types", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "pin-project", + "pin-project-lite", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.72", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "static_assertions", +] + +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + [[package]] name = "unarray" version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" +[[package]] +name = "unicode-bidi" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" + [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "unicode-normalization" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +dependencies = [ + "tinyvec", +] + [[package]] name = "unicode-width" version = "0.1.13" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + [[package]] name = "version_check" version = "0.9.5" @@ -1275,6 +2275,25 @@ dependencies = [ "libc", ] +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -1302,10 +2321,22 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn", + "syn 2.0.72", "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.92" @@ -1324,7 +2355,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -1335,13 +2366,54 @@ version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" +[[package]] +name = "wasm-streams" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "web-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "windows-core" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", ] [[package]] @@ -1350,7 +2422,22 @@ version = "0.52.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", ] [[package]] @@ -1359,28 +2446,46 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -1393,30 +2498,64 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "winreg" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + [[package]] name = "yansi" version = "0.5.1" @@ -1450,7 +2589,7 @@ checksum = "125139de3f6b9d625c39e2efdd73d41bdac468ccd556556440e322be0e1bbd91" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", ] [[package]] @@ -1461,5 +2600,39 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.72", +] + +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" + +[[package]] +name = "zstd" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.12+zstd.1.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a4e40c320c3cb459d9a9ff6de98cff88f4751ee9275d140e2be94a2b74e4c13" +dependencies = [ + "cc", + "pkg-config", ] diff --git a/Cargo.toml b/Cargo.toml index 73265b98..4fef9099 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,8 +10,11 @@ publish = false arrow = { version = "52.2.0", features = ["prettyprint"] } async-trait = "0.1.81" bytes = "1.7.1" +base64 = "0.22.1" futures = "0.3.30" itertools = "0.13.0" +object_store = { version = "0.10.2", features = ["aws"] } +parquet = { version = "52.2.0", features = ["object_store"] } rand = "0.8.5" thiserror = "1.0.63" diff --git a/src/dataset.rs b/src/dataset.rs index 243b152a..10bb0ea7 100644 --- a/src/dataset.rs +++ b/src/dataset.rs @@ -596,10 +596,16 @@ pub enum FlushError { #[cfg(test)] mod tests { - use std::{collections::HashSet, error::Error, num::NonZeroU64, path::PathBuf}; + use std::{ + collections::HashSet, + env::temp_dir, + error::Error, + num::NonZeroU64, + path::{Path, PathBuf}, + }; use crate::{ - manifest::mk_manifests_table, storage::InMemoryStorage, + manifest::mk_manifests_table, storage::InMemoryStorage, storage::ObjectStorage, structure::mk_structure_table, ChunkInfo, ChunkKeyEncoding, ChunkRef, ChunkShape, Codecs, DataType, FillValue, Flags, ManifestExtents, StorageTransformers, TableRegion, @@ -610,7 +616,15 @@ mod tests { #[tokio::test(flavor = "multi_thread")] async fn test_dataset_with_updates() -> Result<(), Box> { - let 
storage = InMemoryStorage::new();
+        let temp_dir_name = temp_dir();
+        let storages: [Arc<dyn Storage>; 4] = [
+            Arc::new(InMemoryStorage::new()),
+            Arc::new(ObjectStorage::new_in_memory_store()),
+            Arc::new(ObjectStorage::new_local_store(Path::new(&temp_dir_name)).unwrap()),
+            // Arc::new(ObjectStorage::new_s3_store_from_env("foo".to_string()).unwrap()),
+            Arc::new(ObjectStorage::new_s3_store_with_config("foo".to_string()).unwrap()),
+        ];
+        for storage in storages {

         let array_id = 2;
         let chunk1 = ChunkInfo {
             node: array_id,
             coord: ArrayIndices(vec![0, 0, 0]),
             payload: ChunkPayload::Ref(ChunkRef {
                 id: ObjectId::random(),
                 offset: 0,
                 length: 4,
             }),
         };

         let chunk2 = ChunkInfo {
             node: array_id,
             coord: ArrayIndices(vec![0, 0, 1]),
             payload: ChunkPayload::Inline(vec![0, 0, 0, 42]),
         };

         let manifest = Arc::new(
             mk_manifests_table(futures::stream::iter(vec![
                 chunk1.clone(),
                 chunk2.clone(),
             ]))
             .await,
         );
         let manifest_id = ObjectId::random();
         storage.write_manifests(manifest_id.clone(), manifest).await?;

         let zarr_meta1 = ZarrArrayMetadata {
             shape: vec![2, 2, 2],
             data_type: DataType::Int32,
             chunk_shape: ChunkShape(vec![
                 NonZeroU64::new(1).unwrap(),
                 NonZeroU64::new(1).unwrap(),
                 NonZeroU64::new(1).unwrap(),
             ]),
             chunk_key_encoding: ChunkKeyEncoding::Slash,
             fill_value: FillValue::Int32(0),
             codecs: Codecs("codec".to_string()),
             storage_transformers: Some(StorageTransformers("tranformers".to_string())),
             dimension_names: Some(vec![
                 Some("x".to_string()),
                 Some("y".to_string()),
                 Some("t".to_string()),
             ]),
         };
         let manifest_ref = ManifestRef {
             object_id: manifest_id,
             location: TableRegion(0, 2),
             flags: Flags(),
             extents: ManifestExtents(vec![]),
         };
         let array1_path: PathBuf = "/array1".to_string().into();
         let nodes = vec![
             NodeStructure {
                 path: "/".into(),
                 id: 1,
                 user_attributes: None,
                 node_data: NodeData::Group,
             },
             NodeStructure {
                 path: array1_path.clone(),
                 id: array_id,
                 user_attributes: Some(UserAttributesStructure::Inline(
                     "{foo:1}".to_string(),
                 )),
                 node_data: NodeData::Array(zarr_meta1.clone(), vec![manifest_ref]),
             },
         ];
+        for storage in storages {
+            let array_id = 2;
+            let chunk1 = ChunkInfo {
+                node: array_id,
+                coord: ArrayIndices(vec![0, 0, 0]),
+                payload: ChunkPayload::Ref(ChunkRef {
+                    id: ObjectId::random(),
+                    offset: 0,
+                    length: 4,
+                }),
+            };

         let structure = Arc::new(mk_structure_table(nodes.clone()));
         let structure_id = ObjectId::random();
         storage.write_structure(structure_id.clone(), structure).await?;

         let mut ds = Dataset::update(Arc::new(storage), structure_id);

-        // retrieve the old array node
-        let node = ds.get_node(&array1_path).await;
-        assert_eq!(nodes.get(1), node.as_ref());
+            let manifest = Arc::new(
+                mk_manifests_table(futures::stream::iter(vec![
+                    chunk1.clone(),
+                    chunk2.clone(),
+                ]))
+                .await,
+            );
+            let manifest_id = ObjectId::random();
+            storage
+                .write_manifests(manifest_id.clone(), manifest)
+                .await
+                .map_err(|err| format!("{err:#?}"))?;

         // add a new array and retrieve its node
         ds.add_group("/group".to_string().into()).await?;

@@ -730,9 +764,35 @@
             user_attributes: Some(UserAttributesStructure::Inline(
                 "{n:42}".to_string(),
             )),
-            node_data: NodeData::Array(zarr_meta2.clone(), vec![]),
-        })
-    );
+            dimension_names: Some(vec![
+                Some("x".to_string()),
+                Some("y".to_string()),
+                Some("t".to_string()),
+            ]),
+        };
+        let manifest_ref = ManifestRef {
+            object_id: manifest_id,
+            location: TableRegion(0, 2),
+            flags: Flags(),
+            extents: ManifestExtents(vec![]),
+        };
+        let array1_path: PathBuf = "/array1".to_string().into();
+        let nodes = vec![
+            NodeStructure {
+                path: "/".into(),
+                id: 1,
+                user_attributes: None,
+                node_data: NodeData::Group,
+            },
+            NodeStructure {
+                path: array1_path.clone(),
+                id: array_id,
+                user_attributes: Some(UserAttributesStructure::Inline(
+                    "{foo:1}".to_string(),
+                )),
+                node_data: NodeData::Array(zarr_meta1.clone(), vec![manifest_ref]),
+            },
+        ];

         // set a chunk for the new array and retrieve it
         ds.set_chunk(
             new_array_path.clone(),
             ArrayIndices(vec![0]),
             Some(ChunkPayload::Inline(vec![0, 0, 0, 7])),
         )
         .await?;

-        let chunk = ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0])).await;
-        assert_eq!(chunk, Some(ChunkPayload::Inline(vec![0, 0, 0, 7])));
+        // retrieve the old array node
+        let node = ds.get_node(&array1_path).await;
+        assert_eq!(nodes.get(1), node.as_ref());

-        // retrieve a non initialized chunk of the new array
-        let non_chunk = ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![1])).await;
-        assert_eq!(non_chunk, None);
+        // add a new array and retrieve its node
+        ds.add_group("/group".to_string().into())
+            .await
+            .map_err(|err| format!("{err:#?}"))?;

         // update old array user attributes and check them
         ds.set_user_attributes(array1_path.clone(), Some("{updated: true}".to_string()))
@@ -772,6 +834,7 @@
             panic!("Failed to update zarr metadata");
         }

         // set old array chunk and check them
         ds.set_chunk(
             array1_path.clone(),
             ArrayIndices(vec![0, 0, 0]),
             Some(ChunkPayload::Inline(vec![0, 0, 0, 99])),
         )
         .await?;

         let chunk = ds.get_chunk_ref(&array1_path, &ArrayIndices(vec![0, 0, 0])).await;
         assert_eq!(chunk, Some(ChunkPayload::Inline(vec![0, 0, 0, 99])));

         Ok(())
     }

@@ -869,8 +947,16 @@

     #[tokio::test(flavor = "multi_thread")]
     async fn test_dataset_with_updates_and_writes() -> Result<(), Box<dyn Error>> {
-        let storage: Arc<dyn Storage> = Arc::new(InMemoryStorage::new());
-        let mut ds = Dataset::create(Arc::clone(&storage));
+        let temp_dir_name = temp_dir();
+        let storages: [Arc<dyn Storage>; 4] = [
+            Arc::new(InMemoryStorage::new()),
+            Arc::new(ObjectStorage::new_in_memory_store()),
+            Arc::new(ObjectStorage::new_local_store(Path::new(&temp_dir_name)).unwrap()),
+            // Arc::new(ObjectStorage::new_s3_store_from_env("foo".to_string()).unwrap()),
+            Arc::new(ObjectStorage::new_s3_store_with_config("testbucket".to_string()).unwrap()),
+        ];
+        for storage in storages {
+            let mut ds = Dataset::create(Arc::clone(&storage));

         // add a new array and retrieve its node
         ds.add_group("/".into()).await?;

@@ -1001,35 +1087,49 @@
         let structure_id = ds.flush().await?;
         let ds = Dataset::update(Arc::clone(&storage), structure_id);

-        assert_eq!(
-            ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await,
-            Some(ChunkPayload::Inline(b"bye".into()))
-        );
-        assert_eq!(
-            ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 1])).await,
-            None
-        );
-        assert!(matches!(
-            ds.get_node(&new_array_path).await,
-            Some(NodeStructure {
-                id: 3,
-                path,
-                user_attributes: Some(atts),
-                node_data: NodeData::Array(meta, manifests)
-            }) if path == new_array_path && meta == new_meta.clone() && manifests.len() == 1 && atts == UserAttributesStructure::Inline("{foo:42}".to_string())
-        ));
-
-        //test the previous version is still alive
-        let ds = Dataset::update(Arc::clone(&storage), previous_structure_id);
-        assert_eq!(
-            ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await,
-            Some(ChunkPayload::Inline(b"bye".into()))
-        );
-        assert_eq!(
-            ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 1])).await,
-            Some(ChunkPayload::Inline(b"new chunk".into()))
-        );
+        let new_meta = ZarrArrayMetadata { shape: vec![1, 1, 1], ..zarr_meta };
+        // we change zarr metadata
+        ds.update_array(new_array_path.clone(), new_meta.clone())
+            .await
+            .map_err(|err| format!("{err:#?}"))?;
+        // we change user attributes metadata
+        ds.set_user_attributes(new_array_path.clone(), Some("{foo:42}".to_string()))
+            .await
+            .map_err(|err| format!("{err:#?}"))?;
+
+        let structure_id = ds.flush().await.map_err(|err| format!("{err:#?}"))?;
+        let ds = Dataset::update(Arc::clone(&storage), structure_id);
+
+        assert_eq!(
+            ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await,
+            Some(ChunkPayload::Inline(b"bye".into()))
+        );
+        assert_eq!(
+            ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 1])).await,
+            None
+        );
+        assert!(matches!(
+            ds.get_node(&new_array_path).await,
+            Some(NodeStructure {
+                id: 3,
+                path,
+                user_attributes: Some(atts),
+                node_data: NodeData::Array(meta, manifests)
+            }) if path == new_array_path && meta == new_meta.clone() && manifests.len() == 1 && atts == UserAttributesStructure::Inline("{foo:42}".to_string())
+        ));
+
+        //test the previous version is still alive
+        let ds = Dataset::update(Arc::clone(&storage), previous_structure_id);
+        assert_eq!(
+            ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await,
+            Some(ChunkPayload::Inline(b"bye".into()))
+        );
+        assert_eq!(
+            ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 1])).await,
+            Some(ChunkPayload::Inline(b"new chunk".into()))
+        );
+        }
         Ok(())
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index d6cc10ae..54babaea 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -30,8 +30,8 @@ use async_trait::async_trait;
 use bytes::Bytes;
 use manifest::ManifestsTable;
 use std::{
-    collections::HashMap, fmt::Display, num::NonZeroU64, ops::Range, path::PathBuf,
-    sync::Arc,
+    collections::HashMap, error::Error, fmt::Display, num::NonZeroU64, ops::Range,
+    path::PathBuf, sync::Arc,
 };
 use structure::StructureTable;
 use thiserror::Error;
@@ -552,6 +552,13 @@ pub enum StorageError {
     NotFound(ObjectId),
     #[error("synchronization error on the Storage instance")]
     Deadlock,
+    #[error("error parsing URL or object store config: {0}")]
+    UrlParseError(Box<dyn Error>),
+    // TODO: distinguish between Read and Write errors?
+    #[error("error in the storage layer: {0}")]
+    StorageLayerError(Box<dyn Error>),
+    #[error("error reading a Parquet file: {0}")]
+    ParquetReadError(Box<dyn Error>),
 }
 
 /// Fetch and write the parquet files that represent the dataset in object store
diff --git a/src/storage.rs b/src/storage.rs
index c2e318aa..39eef205 100644
--- a/src/storage.rs
+++ b/src/storage.rs
@@ -1,16 +1,210 @@
+use base64::{engine::general_purpose::URL_SAFE as BASE64_URL_SAFE, Engine as _};
 use std::{
     collections::HashMap,
     ops::Range,
     sync::{Arc, RwLock},
 };
 
+use arrow::array::RecordBatch;
 use async_trait::async_trait;
 use bytes::Bytes;
+use futures::StreamExt;
+use parquet::arrow::{
+    async_reader::ParquetObjectReader, AsyncArrowWriter, ParquetRecordBatchStreamBuilder,
+};
 
 use crate::{
     AttributesTable, ChunkOffset, ManifestsTable, ObjectId, Storage, StorageError,
-    StructureTable,
+    StructureTable, StorageError::StorageLayerError,
 };
+use object_store::{local::LocalFileSystem, memory::InMemory, path::Path, ObjectStore};
+
+#[allow(dead_code)]
+enum FileType {
+    Structure,
+    Manifest,
+    Attributes,
+}
+impl FileType {
+    pub fn get_prefix(&self) -> &str {
+        match self {
+            FileType::Structure => "s/",
+            FileType::Manifest => "m/",
+            FileType::Attributes => "a/",
+        }
+    }
+}
+
+// #[derive(Default)]
+pub struct ObjectStorage {
+    store: Arc<dyn ObjectStore>,
+}
+
+impl ObjectStorage {
+    pub fn new_in_memory_store() -> ObjectStorage {
+        ObjectStorage { store: Arc::new(InMemory::new()) }
+    }
+    pub fn new_local_store(
+        prefix: &std::path::Path,
+    ) -> Result<ObjectStorage, StorageError> {
+        Ok(ObjectStorage {
+            store: Arc::new(
+                LocalFileSystem::new_with_prefix(prefix)
+                    .map_err(|err| StorageLayerError(Box::new(err)))?,
+            ),
+        })
+    }
+    pub fn new_s3_store_from_env(
+        bucket_name: String,
+    ) -> Result<ObjectStorage, StorageError> {
+        use object_store::aws::AmazonS3Builder;
+        let store = AmazonS3Builder::from_env()
+            .with_bucket_name(bucket_name)
+            .build()
+            .map_err(|err| StorageError::UrlParseError(Box::new(err)))?;
+        Ok(ObjectStorage { store: Arc::new(store) })
+    }
+
+    pub fn new_s3_store_with_config(
+        bucket_name: String,
+    ) -> Result<ObjectStorage, StorageError> {
+        use object_store::aws::AmazonS3Builder;
+        let store = AmazonS3Builder::new()
+            // TODO: Generalize the auth config
+            .with_access_key_id("minio123")
+            .with_secret_access_key("minio123")
+            .with_endpoint("http://localhost:9000")
+            .with_allow_http(true)
+            .with_bucket_name(bucket_name)
+            .build()
+            .map_err(|err| StorageError::UrlParseError(Box::new(err)))?;
+        Ok(ObjectStorage { store: Arc::new(store) })
+    }
+
+    fn get_path(filetype: FileType, id: &ObjectId) -> Path {
+        let ObjectId(asu8) = id;
+        let prefix = filetype.get_prefix();
+        // TODO: be careful about allocation here
+        let path = format!("{}/{}", prefix, BASE64_URL_SAFE.encode(asu8));
+        Path::from(path)
+    }
+
+    async fn read_parquet(&self, path: &Path) -> Result<RecordBatch, StorageError> {
+        let meta = self
+            .store
+            .head(path)
+            .await
+            .map_err(|err| StorageError::ParquetReadError(Box::new(err)))?;
+        let reader = ParquetObjectReader::new(Arc::clone(&self.store), meta);
+        let mut builder = ParquetRecordBatchStreamBuilder::new(reader)
+            .await
+            .map_err(|err| StorageError::ParquetReadError(Box::new(err)))?
+            .build()
+            .map_err(|err| StorageError::ParquetReadError(Box::new(err)))?;
+
+        // only one batch ever? Assert that
+        // Use `if let`;
+        let batch = builder.next().await.unwrap().unwrap();
+        Ok(batch)
+    }
+
+    async fn write_parquet(
+        &self,
+        path: &Path,
+        batch: &RecordBatch,
+    ) -> Result<(), StorageError> {
+        let mut buffer = Vec::new();
+        let mut writer = AsyncArrowWriter::try_new(&mut buffer, batch.schema(), None)
+            .map_err(|err| StorageLayerError(Box::new(err)))?;
+        writer.write(batch).await.map_err(|err| StorageLayerError(Box::new(err)))?;
+        writer.close().await.map_err(|err| StorageLayerError(Box::new(err)))?;
+
+        // TODO: find object_store streaming interface
+        let payload = object_store::PutPayload::from(buffer);
+        self.store
+            .put(path, payload)
+            .await
+            .map_err(|err| StorageLayerError(Box::new(err)))?;
+        Ok(())
+    }
+}
+
+#[async_trait]
+impl Storage for ObjectStorage {
+    async fn fetch_structure(
+        &self,
+        id: &ObjectId,
+    ) -> Result<Arc<StructureTable>, StorageError> {
+        let path = ObjectStorage::get_path(FileType::Structure, id);
+        let batch = self.read_parquet(&path).await?;
+        Ok(Arc::new(StructureTable { batch }))
+    }
+
+    async fn fetch_attributes(
+        &self,
+        _id: &ObjectId,
+    ) -> Result<Arc<AttributesTable>, StorageError> {
+        todo!();
+    }
+
+    async fn fetch_manifests(
+        &self,
+        id: &ObjectId,
+    ) -> Result<Arc<ManifestsTable>, StorageError> {
+        let path = ObjectStorage::get_path(FileType::Manifest, id);
+        let batch = self.read_parquet(&path).await?;
+        Ok(Arc::new(ManifestsTable { batch }))
+    }
+
+    async fn write_structure(
+        &self,
+        id: ObjectId,
+        table: Arc<StructureTable>,
+    ) -> Result<(), StorageError> {
+        let path = ObjectStorage::get_path(FileType::Structure, &id);
+        self.write_parquet(&path, &table.batch).await?;
+        Ok(())
+    }
+
+    async fn write_attributes(
+        &self,
+        _id: ObjectId,
+        _table: Arc<AttributesTable>,
+    ) -> Result<(), StorageError> {
+        todo!()
+        // let path = ObjectStorage::get_path(FileType::Structure, &id);
+        // self.write_parquet(&path, &table.batch).await?;
+        // Ok(())
+    }
+
+    async fn write_manifests(
+        &self,
+        id: ObjectId,
+        table: Arc<ManifestsTable>,
+    ) -> Result<(), StorageError> {
+        let path = ObjectStorage::get_path(FileType::Manifest, &id);
+        self.write_parquet(&path, &table.batch).await?;
+        Ok(())
+    }
+
+    async fn fetch_chunk(
+        &self,
+        _x_id: &ObjectId,
+        _range: &Option<Range<ChunkOffset>>,
+    ) -> Result<Arc<Bytes>, StorageError> {
+        todo!()
+    }
+
+    async fn write_chunk(
+        &self,
+        _id: ObjectId,
+        _bytes: bytes::Bytes,
+    ) -> Result<(), StorageError> {
+        todo!()
+    }
+}
+
 #[derive(Default)]
 pub struct InMemoryStorage {
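
The commit above is the substance of the series so far: `ObjectStorage` adapts the `object_store` crate to the existing `Storage` trait, writing structure and manifest tables as single-batch Parquet files whose keys are a type prefix (`s/`, `m/`, `a/`) plus the URL-safe base64 encoding of the `ObjectId`. A minimal sketch of how the new backend is meant to be driven — assuming the minio test credentials hard-coded above (`minio123` against `http://localhost:9000`) and a pre-created `testbucket`; every other name comes from this crate:

    use std::sync::Arc;

    // Sketch only, not part of the patch: round-trips an empty structure
    // table through the S3-compatible backend added above.
    async fn roundtrip_structure() -> Result<(), StorageError> {
        let storage: Arc<dyn Storage> =
            Arc::new(ObjectStorage::new_s3_store_with_config("testbucket".to_string())?);
        let id = ObjectId::random();
        let nodes: Vec<NodeStructure> = vec![];
        let table = Arc::new(mk_structure_table(nodes));
        // Lands at the key "s/<base64-url(id)>" as one Parquet file.
        storage.write_structure(id.clone(), table).await?;
        let _table = storage.fetch_structure(&id).await?;
        Ok(())
    }
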
Arc::new(ObjectStorage::new_in_memory_store()), Arc::new(ObjectStorage::new_local_store(Path::new(&temp_dir_name)).unwrap()), // Arc::new(ObjectStorage::new_s3_store_from_env("foo".to_string()).unwrap()), - Arc::new(ObjectStorage::new_s3_store_with_config("foo".to_string()).unwrap()), + Arc::new( + ObjectStorage::new_s3_store_with_config("testbucket".to_string()) + .unwrap(), + ), ]; for storage in storages { @@ -953,7 +956,10 @@ mod tests { Arc::new(ObjectStorage::new_in_memory_store()), Arc::new(ObjectStorage::new_local_store(Path::new(&temp_dir_name)).unwrap()), // Arc::new(ObjectStorage::new_s3_store_from_env("foo".to_string()).unwrap()), - Arc::new(ObjectStorage::new_s3_store_with_config("testbucket".to_string()).unwrap()), + Arc::new( + ObjectStorage::new_s3_store_with_config("testbucket".to_string()) + .unwrap(), + ), ]; for storage in storages { let mut ds = Dataset::create(Arc::clone(&storage)); diff --git a/src/storage.rs b/src/storage.rs index 39eef205..ae83e05d 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -55,18 +55,18 @@ impl ObjectStorage { }) } pub fn new_s3_store_from_env( - bucket_name: String, + bucket_name: impl Into, ) -> Result { use object_store::aws::AmazonS3Builder; let store = AmazonS3Builder::from_env() - .with_bucket_name(bucket_name) + .with_bucket_name(bucket_name.into()) .build() .map_err(|err| StorageError::UrlParseError(Box::new(err)))?; Ok(ObjectStorage { store: Arc::new(store) }) } pub fn new_s3_store_with_config( - bucket_name: String, + bucket_name: impl Into, ) -> Result { use object_store::aws::AmazonS3Builder; let store = AmazonS3Builder::new() @@ -75,14 +75,13 @@ impl ObjectStorage { .with_secret_access_key("minio123") .with_endpoint("http://localhost:9000") .with_allow_http(true) - .with_bucket_name(bucket_name) + .with_bucket_name(bucket_name.into()) .build() .map_err(|err| StorageError::UrlParseError(Box::new(err)))?; Ok(ObjectStorage { store: Arc::new(store) }) } - fn get_path(filetype: FileType, id: &ObjectId) -> Path { - let ObjectId(asu8) = id; + fn get_path(filetype: FileType, ObjectId(asu8): &ObjectId) -> Path { let prefix = filetype.get_prefix(); // TODO: be careful about allocation here let path = format!("{}/{}", prefix, BASE64_URL_SAFE.encode(asu8)); From a9737272acded879e57a0601fe2a1b9f4c0a9d53 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 15 Aug 2024 14:51:18 -0600 Subject: [PATCH 03/17] wip --- Cargo.lock | 20 -- src/dataset.rs | 576 ++++++++++++++++++++++--------------------------- src/lib.rs | 15 +- src/storage.rs | 64 +++--- 4 files changed, 305 insertions(+), 370 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d7741ecb..2d59fc52 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2035,26 +2035,6 @@ dependencies = [ "ordered-float", ] -[[package]] -name = "thiserror" -version = "1.0.63" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724" -dependencies = [ - "thiserror-impl", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.63" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "tiny-keccak" version = "2.0.2" diff --git a/src/dataset.rs b/src/dataset.rs index ab64a1fa..60014581 100644 --- a/src/dataset.rs +++ b/src/dataset.rs @@ -597,106 +597,40 @@ pub enum FlushError { #[cfg(test)] mod tests { use std::{ - collections::HashSet, - 
env::temp_dir, - error::Error, - num::NonZeroU64, - path::{Path, PathBuf}, + collections::HashSet, env::temp_dir, error::Error, num::NonZeroU64, path::PathBuf, }; use crate::{ - manifest::mk_manifests_table, storage::InMemoryStorage, storage::ObjectStorage, - structure::mk_structure_table, ChunkInfo, ChunkKeyEncoding, ChunkRef, ChunkShape, - Codecs, DataType, FillValue, Flags, ManifestExtents, StorageTransformers, - TableRegion, + manifest::mk_manifests_table, + storage::{InMemoryStorage, ObjectStorage}, + structure::mk_structure_table, + ChunkInfo, ChunkKeyEncoding, ChunkRef, ChunkShape, Codecs, DataType, FillValue, + Flags, ManifestExtents, StorageTransformers, TableRegion, }; use super::*; use pretty_assertions::assert_eq; + use rand; + use rand::{distributions::Alphanumeric, Rng}; // 0.8 #[tokio::test(flavor = "multi_thread")] async fn test_dataset_with_updates() -> Result<(), Box> { let temp_dir_name = temp_dir(); + let prefix: String = rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(7) + .map(char::from) + .collect(); let storages: [Arc; 4] = [ Arc::new(InMemoryStorage::new()), Arc::new(ObjectStorage::new_in_memory_store()), - Arc::new(ObjectStorage::new_local_store(Path::new(&temp_dir_name)).unwrap()), - // Arc::new(ObjectStorage::new_s3_store_from_env("foo".to_string()).unwrap()), + Arc::new(ObjectStorage::new_local_store(temp_dir_name).unwrap()), + // Arc::new(ObjectStorage::new_s3_store_from_env("testbucket".to_string()).unwrap()), Arc::new( - ObjectStorage::new_s3_store_with_config("testbucket".to_string()) + ObjectStorage::new_s3_store_with_config("testbucket".to_string(), prefix) .unwrap(), ), ]; - for storage in storages { - - let array_id = 2; - let chunk1 = ChunkInfo { - node: array_id, - coord: ArrayIndices(vec![0, 0, 0]), - payload: ChunkPayload::Ref(ChunkRef { - id: ObjectId::random(), - offset: 0, - length: 4, - }), - }; - - let chunk2 = ChunkInfo { - node: array_id, - coord: ArrayIndices(vec![0, 0, 1]), - payload: ChunkPayload::Inline(vec![0, 0, 0, 42]), - }; - - let manifest = Arc::new( - mk_manifests_table(futures::stream::iter(vec![ - chunk1.clone(), - chunk2.clone(), - ])) - .await, - ); - let manifest_id = ObjectId::random(); - storage.write_manifests(manifest_id.clone(), manifest).await?; - - let zarr_meta1 = ZarrArrayMetadata { - shape: vec![2, 2, 2], - data_type: DataType::Int32, - chunk_shape: ChunkShape(vec![ - NonZeroU64::new(1).unwrap(), - NonZeroU64::new(1).unwrap(), - NonZeroU64::new(1).unwrap(), - ]), - chunk_key_encoding: ChunkKeyEncoding::Slash, - fill_value: FillValue::Int32(0), - codecs: Codecs("codec".to_string()), - storage_transformers: Some(StorageTransformers("tranformers".to_string())), - dimension_names: Some(vec![ - Some("x".to_string()), - Some("y".to_string()), - Some("t".to_string()), - ]), - }; - let manifest_ref = ManifestRef { - object_id: manifest_id, - location: TableRegion(0, 2), - flags: Flags(), - extents: ManifestExtents(vec![]), - }; - let array1_path: PathBuf = "/array1".to_string().into(); - let nodes = vec![ - NodeStructure { - path: "/".into(), - id: 1, - user_attributes: None, - node_data: NodeData::Group, - }, - NodeStructure { - path: array1_path.clone(), - id: array_id, - user_attributes: Some(UserAttributesStructure::Inline( - "{foo:1}".to_string(), - )), - node_data: NodeData::Array(zarr_meta1.clone(), vec![manifest_ref]), - }, - ]; for storage in storages { let array_id = 2; let chunk1 = ChunkInfo { @@ -709,10 +643,11 @@ mod tests { }), }; - let structure = Arc::new(mk_structure_table(nodes.clone())); - let 
structure_id = ObjectId::random(); - storage.write_structure(structure_id.clone(), structure).await?; - let mut ds = Dataset::update(Arc::new(storage), structure_id); + let chunk2 = ChunkInfo { + node: array_id, + coord: ArrayIndices(vec![0, 0, 1]), + payload: ChunkPayload::Inline(vec![0, 0, 0, 42]), + }; let manifest = Arc::new( mk_manifests_table(futures::stream::iter(vec![ @@ -722,50 +657,21 @@ mod tests { .await, ); let manifest_id = ObjectId::random(); - storage - .write_manifests(manifest_id.clone(), manifest) - .await - .map_err(|err| format!("{err:#?}"))?; - - // add a new array and retrieve its node - ds.add_group("/group".to_string().into()).await?; - - let zarr_meta2 = ZarrArrayMetadata { - shape: vec![3], - data_type: DataType::Int32, - chunk_shape: ChunkShape(vec![NonZeroU64::new(2).unwrap()]), - chunk_key_encoding: ChunkKeyEncoding::Slash, - fill_value: FillValue::Int32(0), - codecs: Codecs("codec".to_string()), - storage_transformers: Some(StorageTransformers("tranformers".to_string())), - dimension_names: Some(vec![Some("t".to_string())]), - }; - - let new_array_path: PathBuf = "/group/array2".to_string().into(); - ds.add_array(new_array_path.clone(), zarr_meta2.clone()).await?; - - let node = ds.get_node(&new_array_path).await; - assert_eq!( - node, - Some(NodeStructure { - path: new_array_path.clone(), - id: 4, - user_attributes: None, - node_data: NodeData::Array(zarr_meta2.clone(), vec![]), - }) - ); - - // set user attributes for the new array and retrieve them - ds.set_user_attributes(new_array_path.clone(), Some("{n:42}".to_string())) - .await?; - let node = ds.get_node(&new_array_path).await; - assert_eq!( - node, - Some(NodeStructure { - path: "/group/array2".into(), - id: 4, - user_attributes: Some(UserAttributesStructure::Inline( - "{n:42}".to_string(), + storage.write_manifests(manifest_id.clone(), manifest).await?; + + let zarr_meta1 = ZarrArrayMetadata { + shape: vec![2, 2, 2], + data_type: DataType::Int32, + chunk_shape: ChunkShape(vec![ + NonZeroU64::new(1).unwrap(), + NonZeroU64::new(1).unwrap(), + NonZeroU64::new(1).unwrap(), + ]), + chunk_key_encoding: ChunkKeyEncoding::Slash, + fill_value: FillValue::Int32(0), + codecs: Codecs("codec".to_string()), + storage_transformers: Some(StorageTransformers( + "tranformers".to_string(), )), dimension_names: Some(vec![ Some("x".to_string()), @@ -797,73 +703,115 @@ mod tests { }, ]; - // set a chunk for the new array and retrieve it - ds.set_chunk( - new_array_path.clone(), - ArrayIndices(vec![0]), - Some(ChunkPayload::Inline(vec![0, 0, 0, 7])), - ) - .await?; + let structure = Arc::new(mk_structure_table(nodes.clone())); + let structure_id = ObjectId::random(); + storage.write_structure(structure_id.clone(), structure).await?; + let mut ds = Dataset::update(storage, structure_id); // retrieve the old array node let node = ds.get_node(&array1_path).await; assert_eq!(nodes.get(1), node.as_ref()); // add a new array and retrieve its node - ds.add_group("/group".to_string().into()) - .await - .map_err(|err| format!("{err:#?}"))?; + ds.add_group("/group".to_string().into()).await?; + + let zarr_meta2 = ZarrArrayMetadata { + shape: vec![3], + data_type: DataType::Int32, + chunk_shape: ChunkShape(vec![NonZeroU64::new(2).unwrap()]), + chunk_key_encoding: ChunkKeyEncoding::Slash, + fill_value: FillValue::Int32(0), + codecs: Codecs("codec".to_string()), + storage_transformers: Some(StorageTransformers( + "tranformers".to_string(), + )), + dimension_names: Some(vec![Some("t".to_string())]), + }; + + let new_array_path: PathBuf 
= "/group/array2".to_string().into(); + ds.add_array(new_array_path.clone(), zarr_meta2.clone()).await?; + + let node = ds.get_node(&new_array_path).await; + assert_eq!( + node, + Some(NodeStructure { + path: new_array_path.clone(), + id: 4, + user_attributes: None, + node_data: NodeData::Array(zarr_meta2.clone(), vec![]), + }) + ); + + // set user attributes for the new array and retrieve them + ds.set_user_attributes(new_array_path.clone(), Some("{n:42}".to_string())) + .await?; + let node = ds.get_node(&new_array_path).await; + assert_eq!( + node, + Some(NodeStructure { + path: "/group/array2".into(), + id: 4, + user_attributes: Some(UserAttributesStructure::Inline( + "{n:42}".to_string(), + )), + node_data: NodeData::Array(zarr_meta2.clone(), vec![]), + }) + ); - // update old array use attriutes and check them - ds.set_user_attributes(array1_path.clone(), Some("{updated: true}".to_string())) + // set a chunk for the new array and retrieve it + ds.set_chunk( + new_array_path.clone(), + ArrayIndices(vec![0]), + Some(ChunkPayload::Inline(vec![0, 0, 0, 7])), + ) .await?; - let node = ds.get_node(&array1_path).await.unwrap(); - assert_eq!( - node.user_attributes, - Some(UserAttributesStructure::Inline("{updated: true}".to_string())) - ); - // update old array zarr metadata and check it - let new_zarr_meta1 = ZarrArrayMetadata { shape: vec![2, 2, 3], ..zarr_meta1 }; - ds.update_array(array1_path.clone(), new_zarr_meta1).await?; - let node = ds.get_node(&array1_path).await; - if let Some(NodeStructure { - node_data: NodeData::Array(ZarrArrayMetadata { shape, .. }, _), - .. - }) = node - { - assert_eq!(shape, vec![2, 2, 3]); - } else { - panic!("Failed to update zarr metadata"); - } + let chunk = ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0])).await; + assert_eq!(chunk, Some(ChunkPayload::Inline(vec![0, 0, 0, 7]))); + + // retrieve a non initialized chunk of the new array + let non_chunk = + ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![1])).await; + assert_eq!(non_chunk, None); + + // update old array use attriutes and check them + ds.set_user_attributes( + array1_path.clone(), + Some("{updated: true}".to_string()), + ) + .await?; + let node = ds.get_node(&array1_path).await.unwrap(); + assert_eq!( + node.user_attributes, + Some(UserAttributesStructure::Inline("{updated: true}".to_string())) + ); + + // update old array zarr metadata and check it + let new_zarr_meta1 = ZarrArrayMetadata { shape: vec![2, 2, 3], ..zarr_meta1 }; + ds.update_array(array1_path.clone(), new_zarr_meta1).await?; + let node = ds.get_node(&array1_path).await; + if let Some(NodeStructure { + node_data: NodeData::Array(ZarrArrayMetadata { shape, .. }, _), + .. + }) = node + { + assert_eq!(shape, vec![2, 2, 3]); + } else { + panic!("Failed to update zarr metadata"); + } + + // set old array chunk and check them + ds.set_chunk( + array1_path.clone(), + ArrayIndices(vec![0, 0, 0]), + Some(ChunkPayload::Inline(vec![0, 0, 0, 99])), + ) + .await?; -<<<<<<< HEAD - // set old array chunk and check them - ds.set_chunk( - array1_path.clone(), - ArrayIndices(vec![0, 0, 0]), - Some(ChunkPayload::Inline(vec![0, 0, 0, 99])), - ) - .await?; - - let chunk = ds.get_chunk_ref(&array1_path, &ArrayIndices(vec![0, 0, 0])).await; - assert_eq!(chunk, Some(ChunkPayload::Inline(vec![0, 0, 0, 99]))); - -||||||| parent of ef51b3a (Store: Add object_store backend.) 
- // set old array chunk and check them - ds.set_chunk( - array1_path.clone(), - ArrayIndices(vec![0, 0, 0]), - Some(ChunkPayload::Inline(vec![0, 0, 0, 99])), - ) - .await - .map_err(|err| format!("{err:#?}"))?; - - let chunk = ds.get_chunk_ref(&array1_path, &ArrayIndices(vec![0, 0, 0])).await; - assert_eq!(chunk, Some(ChunkPayload::Inline(vec![0, 0, 0, 99]))); - -======= ->>>>>>> ef51b3a (Store: Add object_store backend.) + let chunk = + ds.get_chunk_ref(&array1_path, &ArrayIndices(vec![0, 0, 0])).await; + assert_eq!(chunk, Some(ChunkPayload::Inline(vec![0, 0, 0, 99]))); + } Ok(()) } @@ -951,160 +899,154 @@ mod tests { #[tokio::test(flavor = "multi_thread")] async fn test_dataset_with_updates_and_writes() -> Result<(), Box> { let temp_dir_name = temp_dir(); + let prefix: String = rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(7) + .map(char::from) + .collect(); let storages: [Arc; 4] = [ Arc::new(InMemoryStorage::new()), Arc::new(ObjectStorage::new_in_memory_store()), - Arc::new(ObjectStorage::new_local_store(Path::new(&temp_dir_name)).unwrap()), - // Arc::new(ObjectStorage::new_s3_store_from_env("foo".to_string()).unwrap()), + Arc::new(ObjectStorage::new_local_store(temp_dir_name).unwrap()), + // Arc::new(ObjectStorage::new_s3_store_from_env("testbucket".to_string()).unwrap()), Arc::new( - ObjectStorage::new_s3_store_with_config("testbucket".to_string()) + ObjectStorage::new_s3_store_with_config("testbucket".to_string(), prefix) .unwrap(), ), ]; for storage in storages { let mut ds = Dataset::create(Arc::clone(&storage)); - // add a new array and retrieve its node - ds.add_group("/".into()).await?; - let structure_id = ds.flush().await?; - - assert_eq!(Some(structure_id), ds.structure_id); - assert_eq!( - ds.get_node(&"/".into()).await, - Some(NodeStructure { - id: 1, - path: "/".into(), - user_attributes: None, - node_data: NodeData::Group - }) - ); - ds.add_group("/group".into()).await?; - let _structure_id = ds.flush().await?; - assert_eq!( - ds.get_node(&"/".into()).await, - Some(NodeStructure { - id: 1, - path: "/".into(), - user_attributes: None, - node_data: NodeData::Group - }) - ); - assert_eq!( - ds.get_node(&"/group".into()).await, - Some(NodeStructure { - id: 2, - path: "/group".into(), - user_attributes: None, - node_data: NodeData::Group - }) - ); - let zarr_meta = ZarrArrayMetadata { - shape: vec![1, 1, 2], - data_type: DataType::Int32, - chunk_shape: ChunkShape(vec![NonZeroU64::new(2).unwrap()]), - chunk_key_encoding: ChunkKeyEncoding::Slash, - fill_value: FillValue::Int32(0), - codecs: Codecs("codec".to_string()), - storage_transformers: Some(StorageTransformers("tranformers".to_string())), - dimension_names: Some(vec![Some("t".to_string())]), - }; + // add a new array and retrieve its node + ds.add_group("/".into()).await?; + let structure_id = ds.flush().await?; - let new_array_path: PathBuf = "/group/array1".to_string().into(); - ds.add_array(new_array_path.clone(), zarr_meta.clone()).await?; - - // we set a chunk in a new array - ds.set_chunk( - new_array_path.clone(), - ArrayIndices(vec![0, 0, 0]), - Some(ChunkPayload::Inline(b"hello".into())), - ) - .await?; - - let _structure_id = ds.flush().await?; - assert_eq!( - ds.get_node(&"/".into()).await, - Some(NodeStructure { - id: 1, - path: "/".into(), - user_attributes: None, - node_data: NodeData::Group - }) - ); - assert_eq!( - ds.get_node(&"/group".into()).await, - Some(NodeStructure { - id: 2, - path: "/group".into(), - user_attributes: None, - node_data: NodeData::Group - }) - ); - assert!(matches!( - 
ds.get_node(&new_array_path).await, - Some(NodeStructure { - id: 3, - path, - user_attributes: None, - node_data: NodeData::Array(meta, manifests) - }) if path == new_array_path && meta == zarr_meta.clone() && manifests.len() == 1 - )); - assert_eq!( - ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await, - Some(ChunkPayload::Inline(b"hello".into())) - ); + assert_eq!(Some(structure_id), ds.structure_id); + assert_eq!( + ds.get_node(&"/".into()).await, + Some(NodeStructure { + id: 1, + path: "/".into(), + user_attributes: None, + node_data: NodeData::Group + }) + ); + ds.add_group("/group".into()).await?; + let _structure_id = ds.flush().await?; + assert_eq!( + ds.get_node(&"/".into()).await, + Some(NodeStructure { + id: 1, + path: "/".into(), + user_attributes: None, + node_data: NodeData::Group + }) + ); + assert_eq!( + ds.get_node(&"/group".into()).await, + Some(NodeStructure { + id: 2, + path: "/group".into(), + user_attributes: None, + node_data: NodeData::Group + }) + ); + let zarr_meta = ZarrArrayMetadata { + shape: vec![1, 1, 2], + data_type: DataType::Int32, + chunk_shape: ChunkShape(vec![NonZeroU64::new(2).unwrap()]), + chunk_key_encoding: ChunkKeyEncoding::Slash, + fill_value: FillValue::Int32(0), + codecs: Codecs("codec".to_string()), + storage_transformers: Some(StorageTransformers( + "tranformers".to_string(), + )), + dimension_names: Some(vec![Some("t".to_string())]), + }; - // we modify a chunk in an existing array - ds.set_chunk( - new_array_path.clone(), - ArrayIndices(vec![0, 0, 0]), - Some(ChunkPayload::Inline(b"bye".into())), - ) - .await?; - - // we add a new chunk in an existing array - ds.set_chunk( - new_array_path.clone(), - ArrayIndices(vec![0, 0, 1]), - Some(ChunkPayload::Inline(b"new chunk".into())), - ) - .await?; - - let previous_structure_id = ds.flush().await?; - assert_eq!( - ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await, - Some(ChunkPayload::Inline(b"bye".into())) - ); - assert_eq!( - ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 1])).await, - Some(ChunkPayload::Inline(b"new chunk".into())) - ); + let new_array_path: PathBuf = "/group/array1".to_string().into(); + ds.add_array(new_array_path.clone(), zarr_meta.clone()).await?; + + // we set a chunk in a new array + ds.set_chunk( + new_array_path.clone(), + ArrayIndices(vec![0, 0, 0]), + Some(ChunkPayload::Inline(b"hello".into())), + ) + .await?; - // we delete a chunk - ds.set_chunk(new_array_path.clone(), ArrayIndices(vec![0, 0, 1]), None).await?; + let _structure_id = ds.flush().await?; + assert_eq!( + ds.get_node(&"/".into()).await, + Some(NodeStructure { + id: 1, + path: "/".into(), + user_attributes: None, + node_data: NodeData::Group + }) + ); + assert_eq!( + ds.get_node(&"/group".into()).await, + Some(NodeStructure { + id: 2, + path: "/group".into(), + user_attributes: None, + node_data: NodeData::Group + }) + ); + assert!(matches!( + ds.get_node(&new_array_path).await, + Some(NodeStructure { + id: 3, + path, + user_attributes: None, + node_data: NodeData::Array(meta, manifests) + }) if path == new_array_path && meta == zarr_meta.clone() && manifests.len() == 1 + )); + assert_eq!( + ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await, + Some(ChunkPayload::Inline(b"hello".into())) + ); - let new_meta = ZarrArrayMetadata { shape: vec![1, 1, 1], ..zarr_meta }; - // we change zarr metadata - ds.update_array(new_array_path.clone(), new_meta.clone()).await?; + // we modify a chunk in an existing array + ds.set_chunk( + 
new_array_path.clone(), + ArrayIndices(vec![0, 0, 0]), + Some(ChunkPayload::Inline(b"bye".into())), + ) + .await?; - // we change user attributes metadata - ds.set_user_attributes(new_array_path.clone(), Some("{foo:42}".to_string())) + // we add a new chunk in an existing array + ds.set_chunk( + new_array_path.clone(), + ArrayIndices(vec![0, 0, 1]), + Some(ChunkPayload::Inline(b"new chunk".into())), + ) .await?; - let structure_id = ds.flush().await?; - let ds = Dataset::update(Arc::clone(&storage), structure_id); + let previous_structure_id = ds.flush().await?; + assert_eq!( + ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await, + Some(ChunkPayload::Inline(b"bye".into())) + ); + assert_eq!( + ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 1])).await, + Some(ChunkPayload::Inline(b"new chunk".into())) + ); + + // we delete a chunk + ds.set_chunk(new_array_path.clone(), ArrayIndices(vec![0, 0, 1]), None) + .await?; let new_meta = ZarrArrayMetadata { shape: vec![1, 1, 1], ..zarr_meta }; // we change zarr metadata - ds.update_array(new_array_path.clone(), new_meta.clone()) - .await - .map_err(|err| format!("{err:#?}"))?; + ds.update_array(new_array_path.clone(), new_meta.clone()).await?; // we change user attributes metadata ds.set_user_attributes(new_array_path.clone(), Some("{foo:42}".to_string())) - .await - .map_err(|err| format!("{err:#?}"))?; + .await?; - let structure_id = ds.flush().await.map_err(|err| format!("{err:#?}"))?; + let structure_id = ds.flush().await?; let ds = Dataset::update(Arc::clone(&storage), structure_id); assert_eq!( diff --git a/src/lib.rs b/src/lib.rs index 54babaea..e059a5c5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,8 +30,8 @@ use async_trait::async_trait; use bytes::Bytes; use manifest::ManifestsTable; use std::{ - collections::HashMap, error::Error, fmt::Display, num::NonZeroU64, ops::Range, - path::PathBuf, sync::Arc, + collections::HashMap, fmt::Display, num::NonZeroU64, ops::Range, path::PathBuf, + sync::Arc, }; use structure::StructureTable; use thiserror::Error; @@ -553,10 +553,13 @@ pub enum StorageError { #[error("synchronization error on the Storage instance")] Deadlock, #[error("")] - UrlParseError(Box), - // TODO: distinguish between Read and Write errors? 
- StorageLayerError(Box), - ParquetReadError(Box), + ParseError(Path), + #[error("Error contacting object store `{0:?}`")] + ObjectStoreError(String), + #[error("Storage layer error: {0}")] + StorageLayerError(String), + #[error("Error reading or writing to/from parquet files: `{0:?}`")] + ParquetError(String), } /// Fetch and write the parquet files that represent the dataset in object store diff --git a/src/storage.rs b/src/storage.rs index ae83e05d..763c9b74 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -15,7 +15,7 @@ use parquet::arrow::{ use crate::{ AttributesTable, ChunkOffset, ManifestsTable, ObjectId, Storage, StorageError, - StructureTable, StorageError::StorageLayerError, + StorageError::StorageLayerError, StructureTable, }; use object_store::{local::LocalFileSystem, memory::InMemory, path::Path, ObjectStore}; @@ -38,35 +38,43 @@ impl FileType { // #[derive(Default)] pub struct ObjectStorage { store: Arc, + prefix: String, } impl ObjectStorage { pub fn new_in_memory_store() -> ObjectStorage { - ObjectStorage { store: Arc::new(InMemory::new()) } + ObjectStorage { store: Arc::new(InMemory::new()), prefix: "".into() } } pub fn new_local_store( - prefix: &std::path::Path, + prefix: std::path::PathBuf, ) -> Result { Ok(ObjectStorage { store: Arc::new( - LocalFileSystem::new_with_prefix(prefix) - .map_err(|err| StorageLayerError(Box::new(err)))?, + LocalFileSystem::new_with_prefix(&prefix) + .map_err(|err| StorageLayerError(err.to_string()))?, ), + prefix: prefix + .to_str() + .ok_or("Couldn't convert prefix to string") + .map_err(|err| StorageError::StorageLayerError(err.to_owned()))? + .to_owned(), }) } pub fn new_s3_store_from_env( bucket_name: impl Into, + prefix: impl Into, ) -> Result { use object_store::aws::AmazonS3Builder; let store = AmazonS3Builder::from_env() .with_bucket_name(bucket_name.into()) .build() - .map_err(|err| StorageError::UrlParseError(Box::new(err)))?; - Ok(ObjectStorage { store: Arc::new(store) }) + .map_err(|err| StorageError::ObjectStoreError(err.to_string()))?; + Ok(ObjectStorage { store: Arc::new(store), prefix: prefix.into() }) } pub fn new_s3_store_with_config( bucket_name: impl Into, + prefix: impl Into, ) -> Result { use object_store::aws::AmazonS3Builder; let store = AmazonS3Builder::new() @@ -77,34 +85,35 @@ impl ObjectStorage { .with_allow_http(true) .with_bucket_name(bucket_name.into()) .build() - .map_err(|err| StorageError::UrlParseError(Box::new(err)))?; - Ok(ObjectStorage { store: Arc::new(store) }) + .map_err(|err| StorageError::ObjectStoreError(err.to_string()))?; + Ok(ObjectStorage { store: Arc::new(store), prefix: prefix.into() }) } - fn get_path(filetype: FileType, ObjectId(asu8): &ObjectId) -> Path { - let prefix = filetype.get_prefix(); + fn get_path(&self, filetype: FileType, ObjectId(asu8): &ObjectId) -> Path { + let type_prefix = filetype.get_prefix(); // TODO: be careful about allocation here - let path = format!("{}/{}", prefix, BASE64_URL_SAFE.encode(asu8)); + let path = + format!("{}/{}/{}", self.prefix, type_prefix, BASE64_URL_SAFE.encode(asu8)); Path::from(path) } async fn read_parquet(&self, path: &Path) -> Result { + // TODO: avoid this read since we are always reading the whole thing. 
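+        // One option (not verified here): a single `get` of the full object
+        // would avoid the extra `head` round trip below, since we read the
+        // whole file anyway and only need `head` so the reader knows the size.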
let meta = self .store .head(path) .await - .map_err(|err| StorageError::ParquetReadError(Box::new(err)))?; + .map_err(|err| StorageError::ParquetError(err.to_string()))?; let reader = ParquetObjectReader::new(Arc::clone(&self.store), meta); let mut builder = ParquetRecordBatchStreamBuilder::new(reader) .await - .map_err(|err| StorageError::ParquetReadError(Box::new(err)))? + .map_err(|err| StorageError::ParquetError(err.to_string()))? .build() - .map_err(|err| StorageError::ParquetReadError(Box::new(err)))?; + .map_err(|err| StorageError::ParquetError(err.to_string()))?; - // only one batch ever? Assert that - // Use `if let`; - let batch = builder.next().await.unwrap().unwrap(); - Ok(batch) + // TODO: do we always have only one batch ever? Assert that + // TODO: Use `if let`; + Ok(builder.next().await.unwrap().unwrap()) } async fn write_parquet( @@ -112,18 +121,19 @@ impl ObjectStorage { path: &Path, batch: &RecordBatch, ) -> Result<(), StorageError> { + use crate::StorageError::ParquetError; let mut buffer = Vec::new(); let mut writer = AsyncArrowWriter::try_new(&mut buffer, batch.schema(), None) - .map_err(|err| StorageLayerError(Box::new(err)))?; - writer.write(batch).await.map_err(|err| StorageLayerError(Box::new(err)))?; - writer.close().await.map_err(|err| StorageLayerError(Box::new(err)))?; + .map_err(|err| ParquetError(err.to_string()))?; + writer.write(batch).await.map_err(|err| ParquetError(err.to_string()))?; + writer.close().await.map_err(|err| ParquetError(err.to_string()))?; // TODO: find object_store streaming interface let payload = object_store::PutPayload::from(buffer); self.store .put(path, payload) .await - .map_err(|err| StorageLayerError(Box::new(err)))?; + .map_err(|err| StorageLayerError(err.to_string()))?; Ok(()) } } @@ -134,7 +144,7 @@ impl Storage for ObjectStorage { &self, id: &ObjectId, ) -> Result, StorageError> { - let path = ObjectStorage::get_path(FileType::Structure, id); + let path = self.get_path(FileType::Structure, id); let batch = self.read_parquet(&path).await?; Ok(Arc::new(StructureTable { batch })) } @@ -150,7 +160,7 @@ impl Storage for ObjectStorage { &self, id: &ObjectId, ) -> Result, StorageError> { - let path = ObjectStorage::get_path(FileType::Manifest, id); + let path = self.get_path(FileType::Manifest, id); let batch = self.read_parquet(&path).await?; Ok(Arc::new(ManifestsTable { batch })) } @@ -160,7 +170,7 @@ impl Storage for ObjectStorage { id: ObjectId, table: Arc, ) -> Result<(), StorageError> { - let path = ObjectStorage::get_path(FileType::Structure, &id); + let path = self.get_path(FileType::Structure, &id); self.write_parquet(&path, &table.batch).await?; Ok(()) } @@ -181,7 +191,7 @@ impl Storage for ObjectStorage { id: ObjectId, table: Arc, ) -> Result<(), StorageError> { - let path = ObjectStorage::get_path(FileType::Manifest, &id); + let path = self.get_path(FileType::Manifest, &id); self.write_parquet(&path, &table.batch).await?; Ok(()) } From 3c29319d666dee2411bc39628c8806b8b489e40b Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 15 Aug 2024 16:01:27 -0600 Subject: [PATCH 04/17] Fix a little more --- src/dataset.rs | 2 +- src/lib.rs | 12 ++++++------ src/storage.rs | 50 ++++++++++++++++++++++++++++---------------------- 3 files changed, 35 insertions(+), 29 deletions(-) diff --git a/src/dataset.rs b/src/dataset.rs index 60014581..1a567801 100644 --- a/src/dataset.rs +++ b/src/dataset.rs @@ -586,7 +586,7 @@ impl TableRegionTracker { } } -#[derive(Debug, Clone, PartialEq, Eq, Error)] +#[derive(Debug, Clone, Error)] pub 
enum FlushError { #[error("no changes made to the dataset")] NoChangesToFlush, diff --git a/src/lib.rs b/src/lib.rs index e059a5c5..98a44040 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -546,20 +546,20 @@ pub enum UpdateNodeError { NotAnArray(Path), } -#[derive(Clone, Debug, PartialEq, Eq, Error)] +#[derive(Clone, Debug, Error)] pub enum StorageError { #[error("object not found `{0:?}`")] NotFound(ObjectId), #[error("synchronization error on the Storage instance")] Deadlock, - #[error("")] - ParseError(Path), #[error("Error contacting object store `{0:?}`")] - ObjectStoreError(String), + ObjectStoreError(Arc), #[error("Storage layer error: {0}")] - StorageLayerError(String), + StorageLayerError(Arc), #[error("Error reading or writing to/from parquet files: `{0:?}`")] - ParquetError(String), + ParquetError(Arc), + #[error("Storage layer error: {0}")] + MiscError(String), } /// Fetch and write the parquet files that represent the dataset in object store diff --git a/src/storage.rs b/src/storage.rs index 763c9b74..9cf9f932 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -5,18 +5,14 @@ use std::{ sync::{Arc, RwLock}, }; -use arrow::array::RecordBatch; -use async_trait::async_trait; -use bytes::Bytes; -use futures::StreamExt; -use parquet::arrow::{ - async_reader::ParquetObjectReader, AsyncArrowWriter, ParquetRecordBatchStreamBuilder, -}; - use crate::{ AttributesTable, ChunkOffset, ManifestsTable, ObjectId, Storage, StorageError, StorageError::StorageLayerError, StructureTable, }; +use arrow::array::RecordBatch; +use async_trait::async_trait; +use bytes::Bytes; +use futures::StreamExt; use object_store::{local::LocalFileSystem, memory::InMemory, path::Path, ObjectStore}; #[allow(dead_code)] @@ -51,12 +47,12 @@ impl ObjectStorage { Ok(ObjectStorage { store: Arc::new( LocalFileSystem::new_with_prefix(&prefix) - .map_err(|err| StorageLayerError(err.to_string()))?, + .map_err(|err| StorageLayerError(Arc::new(err)))?, ), prefix: prefix .to_str() - .ok_or("Couldn't convert prefix to string") - .map_err(|err| StorageError::StorageLayerError(err.to_owned()))? + //TODO: + .ok_or(StorageError::MiscError("Couldn't convert prefix to string".to_string()))? .to_owned(), }) } @@ -68,7 +64,7 @@ impl ObjectStorage { let store = AmazonS3Builder::from_env() .with_bucket_name(bucket_name.into()) .build() - .map_err(|err| StorageError::ObjectStoreError(err.to_string()))?; + .map_err(|err| StorageError::ObjectStoreError(Arc::new(err)))?; Ok(ObjectStorage { store: Arc::new(store), prefix: prefix.into() }) } @@ -85,7 +81,7 @@ impl ObjectStorage { .with_allow_http(true) .with_bucket_name(bucket_name.into()) .build() - .map_err(|err| StorageError::ObjectStoreError(err.to_string()))?; + .map_err(|err| StorageError::ObjectStoreError(Arc::new(err)))?; Ok(ObjectStorage { store: Arc::new(store), prefix: prefix.into() }) } @@ -98,22 +94,31 @@ impl ObjectStorage { } async fn read_parquet(&self, path: &Path) -> Result { + use crate::StorageError::ParquetError; + use parquet::arrow::{ + async_reader::ParquetObjectReader, ParquetRecordBatchStreamBuilder, + }; + // TODO: avoid this read since we are always reading the whole thing. let meta = self .store .head(path) .await - .map_err(|err| StorageError::ParquetError(err.to_string()))?; + .map_err(|err| StorageError::ParquetError(Arc::new(err)))?; let reader = ParquetObjectReader::new(Arc::clone(&self.store), meta); let mut builder = ParquetRecordBatchStreamBuilder::new(reader) .await - .map_err(|err| StorageError::ParquetError(err.to_string()))? 
+ .map_err(|err| StorageError::ParquetError(Arc::new(err)))? .build() - .map_err(|err| StorageError::ParquetError(err.to_string()))?; + .map_err(|err| StorageError::ParquetError(Arc::new(err)))?; // TODO: do we always have only one batch ever? Assert that - // TODO: Use `if let`; - Ok(builder.next().await.unwrap().unwrap()) + let maybe_batch = builder.next().await; + if let Some(batch) = maybe_batch { + batch.map_err(|err| ParquetError(Arc::new(err))) + } else { + Err(StorageError::MiscError("ParquetError:No more record batches".to_string())) + } } async fn write_parquet( @@ -122,18 +127,19 @@ impl ObjectStorage { batch: &RecordBatch, ) -> Result<(), StorageError> { use crate::StorageError::ParquetError; + use parquet::arrow::async_writer::AsyncArrowWriter; let mut buffer = Vec::new(); let mut writer = AsyncArrowWriter::try_new(&mut buffer, batch.schema(), None) - .map_err(|err| ParquetError(err.to_string()))?; - writer.write(batch).await.map_err(|err| ParquetError(err.to_string()))?; - writer.close().await.map_err(|err| ParquetError(err.to_string()))?; + .map_err(|err| ParquetError(Arc::new(err)))?; + writer.write(batch).await.map_err(|err| ParquetError(Arc::new(err)))?; + writer.close().await.map_err(|err| ParquetError(Arc::new(err)))?; // TODO: find object_store streaming interface let payload = object_store::PutPayload::from(buffer); self.store .put(path, payload) .await - .map_err(|err| StorageLayerError(err.to_string()))?; + .map_err(|err| StorageLayerError(Arc::new(err)))?; Ok(()) } } From 34a544e6c9245cce98ff500ef5df1c62d440661d Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 16 Aug 2024 09:00:47 -0600 Subject: [PATCH 05/17] Add .parquet suffix, fix local store prefix --- src/storage.rs | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/storage.rs b/src/storage.rs index 9cf9f932..22a34954 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -49,11 +49,10 @@ impl ObjectStorage { LocalFileSystem::new_with_prefix(&prefix) .map_err(|err| StorageLayerError(Arc::new(err)))?, ), - prefix: prefix - .to_str() - //TODO: - .ok_or(StorageError::MiscError("Couldn't convert prefix to string".to_string()))? - .to_owned(), + // We rely on `new_with_prefix` to create the `prefix` directory + // if it doesn't exist. It will also add the prefix to any path + // so we set ObjectStorate::prefix to an empty string. + prefix: "".to_string(), }) } pub fn new_s3_store_from_env( @@ -88,8 +87,12 @@ impl ObjectStorage { fn get_path(&self, filetype: FileType, ObjectId(asu8): &ObjectId) -> Path { let type_prefix = filetype.get_prefix(); // TODO: be careful about allocation here - let path = - format!("{}/{}/{}", self.prefix, type_prefix, BASE64_URL_SAFE.encode(asu8)); + let path = format!( + "{}/{}/{}.parquet", + self.prefix, + type_prefix, + BASE64_URL_SAFE.encode(asu8) + ); Path::from(path) } @@ -117,7 +120,9 @@ impl ObjectStorage { if let Some(batch) = maybe_batch { batch.map_err(|err| ParquetError(Arc::new(err))) } else { - Err(StorageError::MiscError("ParquetError:No more record batches".to_string())) + Err(StorageError::MiscError( + "ParquetError:No more record batches".to_string(), + )) } } From 32a032487270194081763b7ae38baf2bd5167786 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 16 Aug 2024 11:29:18 -0600 Subject: [PATCH 06/17] Clean up tests 1. Test parquet read/write for all ObjectStorage types 2. 
Test datasets only with ObjectStorage::InMemoryStore --- src/dataset.rs | 34 +++------------------------------- src/storage.rs | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 31 deletions(-) diff --git a/src/dataset.rs b/src/dataset.rs index 1a567801..d07ddaef 100644 --- a/src/dataset.rs +++ b/src/dataset.rs @@ -596,9 +596,7 @@ pub enum FlushError { #[cfg(test)] mod tests { - use std::{ - collections::HashSet, env::temp_dir, error::Error, num::NonZeroU64, path::PathBuf, - }; + use std::{collections::HashSet, error::Error, num::NonZeroU64, path::PathBuf}; use crate::{ manifest::mk_manifests_table, @@ -610,26 +608,12 @@ mod tests { use super::*; use pretty_assertions::assert_eq; - use rand; - use rand::{distributions::Alphanumeric, Rng}; // 0.8 #[tokio::test(flavor = "multi_thread")] async fn test_dataset_with_updates() -> Result<(), Box> { - let temp_dir_name = temp_dir(); - let prefix: String = rand::thread_rng() - .sample_iter(&Alphanumeric) - .take(7) - .map(char::from) - .collect(); - let storages: [Arc; 4] = [ + let storages: [Arc; 2] = [ Arc::new(InMemoryStorage::new()), Arc::new(ObjectStorage::new_in_memory_store()), - Arc::new(ObjectStorage::new_local_store(temp_dir_name).unwrap()), - // Arc::new(ObjectStorage::new_s3_store_from_env("testbucket".to_string()).unwrap()), - Arc::new( - ObjectStorage::new_s3_store_with_config("testbucket".to_string(), prefix) - .unwrap(), - ), ]; for storage in storages { let array_id = 2; @@ -898,21 +882,9 @@ mod tests { #[tokio::test(flavor = "multi_thread")] async fn test_dataset_with_updates_and_writes() -> Result<(), Box> { - let temp_dir_name = temp_dir(); - let prefix: String = rand::thread_rng() - .sample_iter(&Alphanumeric) - .take(7) - .map(char::from) - .collect(); - let storages: [Arc; 4] = [ + let storages: [Arc; 2] = [ Arc::new(InMemoryStorage::new()), Arc::new(ObjectStorage::new_in_memory_store()), - Arc::new(ObjectStorage::new_local_store(temp_dir_name).unwrap()), - // Arc::new(ObjectStorage::new_s3_store_from_env("testbucket".to_string()).unwrap()), - Arc::new( - ObjectStorage::new_s3_store_with_config("testbucket".to_string(), prefix) - .unwrap(), - ), ]; for storage in storages { let mut ds = Dataset::create(Arc::clone(&storage)); diff --git a/src/storage.rs b/src/storage.rs index 22a34954..b58159a2 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -1,6 +1,7 @@ use base64::{engine::general_purpose::URL_SAFE as BASE64_URL_SAFE, Engine as _}; use std::{ collections::HashMap, + fs::create_dir_all, ops::Range, sync::{Arc, RwLock}, }; @@ -44,6 +45,8 @@ impl ObjectStorage { pub fn new_local_store( prefix: std::path::PathBuf, ) -> Result { + create_dir_all(prefix.as_path()) + .map_err(|err| StorageLayerError(Arc::new(err)))?; Ok(ObjectStorage { store: Arc::new( LocalFileSystem::new_with_prefix(&prefix) @@ -351,3 +354,50 @@ impl Storage for InMemoryStorage { Ok(()) } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use crate::ObjectId; + use arrow::array::Int32Array; + use arrow::datatypes::{DataType, Field, Schema}; + use arrow::record_batch::RecordBatch; + use rand; + use rand::distributions::Alphanumeric; + use rand::Rng; + + use super::{FileType, ObjectStorage}; + + fn make_record_batch() -> RecordBatch { + let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]); + let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]); + + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array)]).unwrap() + } + + #[tokio::test(flavor = "multi_thread")] + async fn 
test_read_write_parquet_object_storage() {
+        // simple test to make sure we can speak to all stores
+        let batch = make_record_batch();
+        let prefix: String = rand::thread_rng()
+            .sample_iter(&Alphanumeric)
+            .take(7)
+            .map(char::from)
+            .collect();
+
+        for store in [
+            ObjectStorage::new_in_memory_store(),
+            ObjectStorage::new_local_store(prefix.clone().into()).unwrap(),
+            // Arc::new(ObjectStorage::new_s3_store_from_env("testbucket".to_string()).unwrap()),
+            ObjectStorage::new_s3_store_with_config("testbucket".to_string(), prefix)
+                .unwrap(),
+        ] {
+            let id = ObjectId::random();
+            let path = store.get_path(FileType::Manifest, &id);
+            store.write_parquet(&path, &batch).await.unwrap();
+            let actual = store.read_parquet(&path).await.unwrap();
+            assert_eq!(actual, batch)
+        }
+    }
+}

From 12e855b75fd78d23ee4779c0cc6106ce7832a0c3 Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Fri, 16 Aug 2024 11:32:59 -0600
Subject: [PATCH 07/17] lint

---
 src/storage.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/storage.rs b/src/storage.rs
index b58159a2..cfecbea0 100644
--- a/src/storage.rs
+++ b/src/storage.rs
@@ -23,7 +23,7 @@ enum FileType {
     Attributes,
 }
 impl FileType {
-    pub fn get_prefix(&self) -> &str {
+    pub(crate) fn get_prefix(&self) -> &str {
         match self {
             FileType::Structure => "s/",
             FileType::Manifest => "m/",

From 10b4b25741ac06f5f96851931e7bcb5c50e73144 Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Fri, 16 Aug 2024 11:57:23 -0600
Subject: [PATCH 08/17] Disable S3 test for now.

---
 src/storage.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/storage.rs b/src/storage.rs
index cfecbea0..9095c797 100644
--- a/src/storage.rs
+++ b/src/storage.rs
@@ -389,9 +389,9 @@ mod tests {
         for store in [
             ObjectStorage::new_in_memory_store(),
             ObjectStorage::new_local_store(prefix.clone().into()).unwrap(),
-            // Arc::new(ObjectStorage::new_s3_store_from_env("testbucket".to_string()).unwrap()),
-            ObjectStorage::new_s3_store_with_config("testbucket".to_string(), prefix)
-                .unwrap(),
+            // ObjectStorage::new_s3_store_from_env("testbucket".to_string()).unwrap(),
+            // ObjectStorage::new_s3_store_with_config("testbucket".to_string(), prefix)
+            //     .unwrap(),
         ] {
             let id = ObjectId::random();
             let path = store.get_path(FileType::Manifest, &id);
             store.write_parquet(&path, &batch).await.unwrap();
             let actual = store.read_parquet(&path).await.unwrap();
             assert_eq!(actual, batch)
         }
     }

From cca2b92a827683d721097b25b5129116973034e2 Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Fri, 16 Aug 2024 13:25:54 -0600
Subject: [PATCH 09/17] Better errors!
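
Rework StorageError so each variant wraps its source error and the
conversions are derived with thiserror's `#[from]`. With that, `?`
turns `object_store::Error`, `parquet::errors::ParquetError`, and
`io::Error` into StorageError automatically, and the `map_err(...)`
chains in storage.rs go away. A minimal sketch of the pattern,
abridged from the diff below:

    #[derive(Debug, Error)]
    pub enum StorageError {
        #[error("Error contacting object store {0}")]
        ObjectStore(#[from] object_store::Error),
    }

    // `?` now performs the conversion for us:
    let meta = self.store.head(path).await?;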
--- src/dataset.rs | 2 +- src/lib.rs | 20 ++++++++++-------- src/storage.rs | 56 ++++++++++++++++---------------------------------- 3 files changed, 31 insertions(+), 47 deletions(-) diff --git a/src/dataset.rs b/src/dataset.rs index d07ddaef..6610c980 100644 --- a/src/dataset.rs +++ b/src/dataset.rs @@ -586,7 +586,7 @@ impl TableRegionTracker { } } -#[derive(Debug, Clone, Error)] +#[derive(Debug, Error)] pub enum FlushError { #[error("no changes made to the dataset")] NoChangesToFlush, diff --git a/src/lib.rs b/src/lib.rs index 98a44040..5a9669d0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,8 +29,10 @@ pub mod structure; use async_trait::async_trait; use bytes::Bytes; use manifest::ManifestsTable; +use object_store; +use parquet::errors as parquet_errors; use std::{ - collections::HashMap, fmt::Display, num::NonZeroU64, ops::Range, path::PathBuf, + collections::HashMap, fmt::Display, io, num::NonZeroU64, ops::Range, path::PathBuf, sync::Arc, }; use structure::StructureTable; @@ -546,20 +548,22 @@ pub enum UpdateNodeError { NotAnArray(Path), } -#[derive(Clone, Debug, Error)] +#[derive(Debug, Error)] pub enum StorageError { #[error("object not found `{0:?}`")] NotFound(ObjectId), #[error("synchronization error on the Storage instance")] Deadlock, + // TODO: pattern match on ObjectStore error + // combine with StorageLayerError #[error("Error contacting object store `{0:?}`")] - ObjectStoreError(Arc), - #[error("Storage layer error: {0}")] - StorageLayerError(Arc), + ObjectStore(#[from] object_store::Error), #[error("Error reading or writing to/from parquet files: `{0:?}`")] - ParquetError(Arc), - #[error("Storage layer error: {0}")] - MiscError(String), + ParquetError(#[from] parquet_errors::ParquetError), + #[error("Error reading RecordBatch from parquet files.")] + BadRecordBatchRead, + #[error("I/O error: `{0:?}`")] + IOError(#[from] io::Error), } /// Fetch and write the parquet files that represent the dataset in object store diff --git a/src/storage.rs b/src/storage.rs index 9095c797..f346ec56 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -8,7 +8,7 @@ use std::{ use crate::{ AttributesTable, ChunkOffset, ManifestsTable, ObjectId, Storage, StorageError, - StorageError::StorageLayerError, StructureTable, + StructureTable, }; use arrow::array::RecordBatch; use async_trait::async_trait; @@ -16,11 +16,14 @@ use bytes::Bytes; use futures::StreamExt; use object_store::{local::LocalFileSystem, memory::InMemory, path::Path, ObjectStore}; +// TODO: constant +const STRUCTURE_PREFIX: &str = "s/"; #[allow(dead_code)] enum FileType { Structure, Manifest, Attributes, + Chunk, } impl FileType { pub(crate) fn get_prefix(&self) -> &str { @@ -28,6 +31,7 @@ impl FileType { FileType::Structure => "s/", FileType::Manifest => "m/", FileType::Attributes => "a/", + FileType::Chunk => "c", } } } @@ -45,13 +49,9 @@ impl ObjectStorage { pub fn new_local_store( prefix: std::path::PathBuf, ) -> Result { - create_dir_all(prefix.as_path()) - .map_err(|err| StorageLayerError(Arc::new(err)))?; + create_dir_all(prefix.as_path())?; Ok(ObjectStorage { - store: Arc::new( - LocalFileSystem::new_with_prefix(&prefix) - .map_err(|err| StorageLayerError(Arc::new(err)))?, - ), + store: Arc::new(LocalFileSystem::new_with_prefix(&prefix)?), // We rely on `new_with_prefix` to create the `prefix` directory // if it doesn't exist. It will also add the prefix to any path // so we set ObjectStorate::prefix to an empty string. 
@@ -63,10 +63,8 @@ impl ObjectStorage { prefix: impl Into, ) -> Result { use object_store::aws::AmazonS3Builder; - let store = AmazonS3Builder::from_env() - .with_bucket_name(bucket_name.into()) - .build() - .map_err(|err| StorageError::ObjectStoreError(Arc::new(err)))?; + let store = + AmazonS3Builder::from_env().with_bucket_name(bucket_name.into()).build()?; Ok(ObjectStorage { store: Arc::new(store), prefix: prefix.into() }) } @@ -82,8 +80,7 @@ impl ObjectStorage { .with_endpoint("http://localhost:9000") .with_allow_http(true) .with_bucket_name(bucket_name.into()) - .build() - .map_err(|err| StorageError::ObjectStoreError(Arc::new(err)))?; + .build()?; Ok(ObjectStorage { store: Arc::new(store), prefix: prefix.into() }) } @@ -100,32 +97,20 @@ impl ObjectStorage { } async fn read_parquet(&self, path: &Path) -> Result { - use crate::StorageError::ParquetError; use parquet::arrow::{ async_reader::ParquetObjectReader, ParquetRecordBatchStreamBuilder, }; // TODO: avoid this read since we are always reading the whole thing. - let meta = self - .store - .head(path) - .await - .map_err(|err| StorageError::ParquetError(Arc::new(err)))?; + let meta = self.store.head(path).await?; let reader = ParquetObjectReader::new(Arc::clone(&self.store), meta); - let mut builder = ParquetRecordBatchStreamBuilder::new(reader) - .await - .map_err(|err| StorageError::ParquetError(Arc::new(err)))? - .build() - .map_err(|err| StorageError::ParquetError(Arc::new(err)))?; - + let mut builder = ParquetRecordBatchStreamBuilder::new(reader).await?.build()?; // TODO: do we always have only one batch ever? Assert that let maybe_batch = builder.next().await; if let Some(batch) = maybe_batch { - batch.map_err(|err| ParquetError(Arc::new(err))) + Ok(batch?) } else { - Err(StorageError::MiscError( - "ParquetError:No more record batches".to_string(), - )) + Err(StorageError::BadRecordBatchRead) } } @@ -134,20 +119,15 @@ impl ObjectStorage { path: &Path, batch: &RecordBatch, ) -> Result<(), StorageError> { - use crate::StorageError::ParquetError; use parquet::arrow::async_writer::AsyncArrowWriter; let mut buffer = Vec::new(); - let mut writer = AsyncArrowWriter::try_new(&mut buffer, batch.schema(), None) - .map_err(|err| ParquetError(Arc::new(err)))?; - writer.write(batch).await.map_err(|err| ParquetError(Arc::new(err)))?; - writer.close().await.map_err(|err| ParquetError(Arc::new(err)))?; + let mut writer = AsyncArrowWriter::try_new(&mut buffer, batch.schema(), None)?; + writer.write(batch).await?; + writer.close().await?; // TODO: find object_store streaming interface let payload = object_store::PutPayload::from(buffer); - self.store - .put(path, payload) - .await - .map_err(|err| StorageLayerError(Arc::new(err)))?; + self.store.put(path, payload).await?; Ok(()) } } From af909d529d314b1056396a721dfe33d94c11e967 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 16 Aug 2024 13:53:02 -0600 Subject: [PATCH 10/17] Constant prefixes --- src/storage.rs | 40 ++++++++++++---------------------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/src/storage.rs b/src/storage.rs index f346ec56..5d15dc2c 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -16,25 +16,10 @@ use bytes::Bytes; use futures::StreamExt; use object_store::{local::LocalFileSystem, memory::InMemory, path::Path, ObjectStore}; -// TODO: constant const STRUCTURE_PREFIX: &str = "s/"; -#[allow(dead_code)] -enum FileType { - Structure, - Manifest, - Attributes, - Chunk, -} -impl FileType { - pub(crate) fn get_prefix(&self) -> &str { - match self { 
- FileType::Structure => "s/", - FileType::Manifest => "m/", - FileType::Attributes => "a/", - FileType::Chunk => "c", - } - } -} +const MANIFEST_PREFIX: &str = "m/"; +const ATTRIBUTES_PREFIX: &str = "a/"; +const CHUNK_PREFIX: &str = "c/"; // #[derive(Default)] pub struct ObjectStorage { @@ -84,13 +69,12 @@ impl ObjectStorage { Ok(ObjectStorage { store: Arc::new(store), prefix: prefix.into() }) } - fn get_path(&self, filetype: FileType, ObjectId(asu8): &ObjectId) -> Path { - let type_prefix = filetype.get_prefix(); + fn get_path(&self, file_prefix: &str, ObjectId(asu8): &ObjectId) -> Path { // TODO: be careful about allocation here let path = format!( "{}/{}/{}.parquet", self.prefix, - type_prefix, + file_prefix, BASE64_URL_SAFE.encode(asu8) ); Path::from(path) @@ -138,7 +122,7 @@ impl Storage for ObjectStorage { &self, id: &ObjectId, ) -> Result, StorageError> { - let path = self.get_path(FileType::Structure, id); + let path = self.get_path(STRUCTURE_PREFIX, id); let batch = self.read_parquet(&path).await?; Ok(Arc::new(StructureTable { batch })) } @@ -154,7 +138,7 @@ impl Storage for ObjectStorage { &self, id: &ObjectId, ) -> Result, StorageError> { - let path = self.get_path(FileType::Manifest, id); + let path = self.get_path(MANIFEST_PREFIX, id); let batch = self.read_parquet(&path).await?; Ok(Arc::new(ManifestsTable { batch })) } @@ -164,7 +148,7 @@ impl Storage for ObjectStorage { id: ObjectId, table: Arc, ) -> Result<(), StorageError> { - let path = self.get_path(FileType::Structure, &id); + let path = self.get_path(STRUCTURE_PREFIX, &id); self.write_parquet(&path, &table.batch).await?; Ok(()) } @@ -175,7 +159,7 @@ impl Storage for ObjectStorage { _table: Arc, ) -> Result<(), StorageError> { todo!() - // let path = ObjectStorage::get_path(FileType::Structure, &id); + // let path = ObjectStorage::get_path(ATTRIBUTES_PREFIX, &id); // self.write_parquet(&path, &table.batch).await?; // Ok(()) } @@ -185,7 +169,7 @@ impl Storage for ObjectStorage { id: ObjectId, table: Arc, ) -> Result<(), StorageError> { - let path = self.get_path(FileType::Manifest, &id); + let path = self.get_path(MANIFEST_PREFIX, &id); self.write_parquet(&path, &table.batch).await?; Ok(()) } @@ -347,7 +331,7 @@ mod tests { use rand::distributions::Alphanumeric; use rand::Rng; - use super::{FileType, ObjectStorage}; + use super::ObjectStorage; fn make_record_batch() -> RecordBatch { let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]); @@ -374,7 +358,7 @@ mod tests { // .unwrap(), ] { let id = ObjectId::random(); - let path = store.get_path(FileType::Manifest, &id); + let path = store.get_path("foo_prefix/", &id); store.write_parquet(&path, &batch).await.unwrap(); let actual = store.read_parquet(&path).await.unwrap(); assert_eq!(actual, batch) From 7f7ef6f951626b0ac7b07557272e17ec75187ac6 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 16 Aug 2024 14:34:14 -0600 Subject: [PATCH 11/17] cleanup --- src/lib.rs | 4 ++-- src/storage.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 4a26ac0c..ca06f43f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -556,9 +556,9 @@ pub enum StorageError { Deadlock, // TODO: pattern match on ObjectStore error // combine with StorageLayerError - #[error("Error contacting object store `{0:?}`")] + #[error("Error contacting object store {0}")] ObjectStore(#[from] object_store::Error), - #[error("Error reading or writing to/from parquet files: `{0:?}`")] + #[error("Error reading or writing to/from parquet files: {0}")] ParquetError(#[from] 
parquet_errors::ParquetError), #[error("Error reading RecordBatch from parquet files.")] BadRecordBatchRead, diff --git a/src/storage.rs b/src/storage.rs index 7b86bde8..b8fc95da 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -85,7 +85,7 @@ impl ObjectStorage { async_reader::ParquetObjectReader, ParquetRecordBatchStreamBuilder, }; - // TODO: avoid this read since we are always reading the whole thing. + // TODO: avoid this metadata read since we are always reading the whole thing. let meta = self.store.head(path).await?; let reader = ParquetObjectReader::new(Arc::clone(&self.store), meta); let mut builder = ParquetRecordBatchStreamBuilder::new(reader).await?.build()?; From 4dc0f2c2e73c8613ec5c52408159f11e48a035cf Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 16 Aug 2024 14:34:49 -0600 Subject: [PATCH 12/17] Read/write chunks --- src/lib.rs | 2 ++ src/storage.rs | 29 +++++++++++++++++++++++------ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index ca06f43f..39496a4e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -562,6 +562,8 @@ pub enum StorageError { ParquetError(#[from] parquet_errors::ParquetError), #[error("Error reading RecordBatch from parquet files.")] BadRecordBatchRead, + #[error("Bad byte range for chunk read `{0:?}`.")] + BadByteRange(Option>), #[error("I/O error: `{0:?}`")] IOError(#[from] io::Error), } diff --git a/src/storage.rs b/src/storage.rs index b8fc95da..2423f80f 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -176,18 +176,35 @@ impl Storage for ObjectStorage { async fn fetch_chunk( &self, - _x_id: &ObjectId, - _range: &Option>, + id: &ObjectId, + range: &Option>, ) -> Result { - todo!() + let path = self.get_path(CHUNK_PREFIX, &id); + // TODO: shall we split `range` into multiple ranges and use get_ranges? + // I can't tell that `get_range` does splitting + if let Some(range) = range { + Ok(self + .store + .get_range(&path, (range.start as usize)..(range.end as usize)) + .await?) + } else { + // TODO: Can't figure out if `get` is the most efficient way to get the whole object. + Ok(self.store.get(&path).await?.bytes().await?) + } } async fn write_chunk( &self, - _id: ObjectId, - _bytes: bytes::Bytes, + id: ObjectId, + bytes: bytes::Bytes, ) -> Result<(), StorageError> { - todo!() + let path = self.get_path(CHUNK_PREFIX, &id); + let upload = self.store.put_multipart(&path).await?; + // TODO: new_with_chunk_size? + let mut write = object_store::WriteMultipart::new(upload); + write.write(&bytes); + write.finish().await?; + Ok(()) } } From 03f7ec6d1b7f0f8e77cb77a267f2da03d1ad3046 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 16 Aug 2024 14:46:55 -0600 Subject: [PATCH 13/17] Fix merge. 
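
An earlier merge left a duplicated block in
test_dataset_with_updates_and_writes: the delete-chunk /
update-metadata / flush sequence and its assertions appeared twice.
Drop the duplicate so the test runs the sequence once.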
--- src/dataset.rs | 75 +------------------------------------------------- 1 file changed, 1 insertion(+), 74 deletions(-) diff --git a/src/dataset.rs b/src/dataset.rs index 91d1b621..db313590 100644 --- a/src/dataset.rs +++ b/src/dataset.rs @@ -1024,80 +1024,7 @@ mod tests { ds.set_chunk(new_array_path.clone(), ArrayIndices(vec![0, 0, 1]), None) .await?; - let new_meta = - ZarrArrayMetadata { shape: vec![1, 1, 1], ..zarr_meta.clone() }; - // we change zarr metadata - ds.update_array(new_array_path.clone(), new_meta.clone()).await?; - - // we change user attributes metadata - ds.set_user_attributes(new_array_path.clone(), Some("{foo:42}".to_string())) - .await?; - - let _structure_id = ds.flush().await?; - assert_eq!( - ds.get_node(&"/".into()).await, - Some(NodeStructure { - id: 1, - path: "/".into(), - user_attributes: None, - node_data: NodeData::Group - }) - ); - assert_eq!( - ds.get_node(&"/group".into()).await, - Some(NodeStructure { - id: 2, - path: "/group".into(), - user_attributes: None, - node_data: NodeData::Group - }) - ); - assert!(matches!( - ds.get_node(&new_array_path).await, - Some(NodeStructure { - id: 3, - path, - user_attributes: None, - node_data: NodeData::Array(meta, manifests) - }) if path == new_array_path && meta == zarr_meta.clone() && manifests.len() == 1 - )); - assert_eq!( - ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await, - Some(ChunkPayload::Inline(b"hello".into())) - ); - - // we modify a chunk in an existing array - ds.set_chunk( - new_array_path.clone(), - ArrayIndices(vec![0, 0, 0]), - Some(ChunkPayload::Inline(b"bye".into())), - ) - .await?; - - // we add a new chunk in an existing array - ds.set_chunk( - new_array_path.clone(), - ArrayIndices(vec![0, 0, 1]), - Some(ChunkPayload::Inline(b"new chunk".into())), - ) - .await?; - - let previous_structure_id = ds.flush().await?; - assert_eq!( - ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await, - Some(ChunkPayload::Inline(b"bye".into())) - ); - assert_eq!( - ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 1])).await, - Some(ChunkPayload::Inline(b"new chunk".into())) - ); - - // we delete a chunk - ds.set_chunk(new_array_path.clone(), ArrayIndices(vec![0, 0, 1]), None) - .await?; - - let new_meta = - ZarrArrayMetadata { shape: vec![1, 1, 1], ..zarr_meta.clone() }; + let new_meta = ZarrArrayMetadata { shape: vec![1, 1, 1], ..zarr_meta }; // we change zarr metadata ds.update_array(new_array_path.clone(), new_meta.clone()).await?; From f5532c9c648c7a92c9c31b408d3a646f46279a51 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 16 Aug 2024 14:53:23 -0600 Subject: [PATCH 14/17] lint --- src/lib.rs | 1 - src/storage.rs | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index b76b4145..db5b9584 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -30,7 +30,6 @@ use async_trait::async_trait; use bytes::Bytes; use itertools::Itertools; use manifest::ManifestsTable; -use object_store; use parquet::errors as parquet_errors; use std::{ collections::HashMap, diff --git a/src/storage.rs b/src/storage.rs index 2423f80f..c0dadb59 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -18,7 +18,7 @@ use object_store::{local::LocalFileSystem, memory::InMemory, path::Path, ObjectS const STRUCTURE_PREFIX: &str = "s/"; const MANIFEST_PREFIX: &str = "m/"; -const ATTRIBUTES_PREFIX: &str = "a/"; +// const ATTRIBUTES_PREFIX: &str = "a/"; const CHUNK_PREFIX: &str = "c/"; // #[derive(Default)] @@ -179,7 +179,7 @@ impl Storage for ObjectStorage { 
id: &ObjectId, range: &Option>, ) -> Result { - let path = self.get_path(CHUNK_PREFIX, &id); + let path = self.get_path(CHUNK_PREFIX, id); // TODO: shall we split `range` into multiple ranges and use get_ranges? // I can't tell that `get_range` does splitting if let Some(range) = range { From 02d6ab27dd453d2125bc700cbc6cf3333ab77beb Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 16 Aug 2024 15:03:31 -0600 Subject: [PATCH 15/17] Revert changes to test --- src/dataset.rs | 662 ++++++++++++++++++++++++------------------------- 1 file changed, 321 insertions(+), 341 deletions(-) diff --git a/src/dataset.rs b/src/dataset.rs index db313590..e6d44853 100644 --- a/src/dataset.rs +++ b/src/dataset.rs @@ -610,11 +610,10 @@ mod tests { use std::{collections::HashSet, error::Error, num::NonZeroU64, path::PathBuf}; use crate::{ - manifest::mk_manifests_table, - storage::{InMemoryStorage, ObjectStorage}, - structure::mk_structure_table, - ChunkInfo, ChunkKeyEncoding, ChunkRef, ChunkShape, Codecs, DataType, FillValue, - Flags, ManifestExtents, StorageTransformers, TableRegion, + manifest::mk_manifests_table, storage::InMemoryStorage, + structure::mk_structure_table, ChunkInfo, ChunkKeyEncoding, ChunkRef, ChunkShape, + Codecs, DataType, FillValue, Flags, ManifestExtents, StorageTransformers, + TableRegion, }; use super::*; @@ -622,191 +621,179 @@ mod tests { #[tokio::test(flavor = "multi_thread")] async fn test_dataset_with_updates() -> Result<(), Box> { - let storages: [Arc; 2] = [ - Arc::new(InMemoryStorage::new()), - Arc::new(ObjectStorage::new_in_memory_store()), - ]; - for storage in storages { - let array_id = 2; - let chunk1 = ChunkInfo { - node: array_id, - coord: ArrayIndices(vec![0, 0, 0]), - payload: ChunkPayload::Ref(ChunkRef { - id: ObjectId::random(), - offset: 0, - length: 4, - }), - }; - - let chunk2 = ChunkInfo { - node: array_id, - coord: ArrayIndices(vec![0, 0, 1]), - payload: ChunkPayload::Inline(vec![0, 0, 0, 42]), - }; - - let manifest = Arc::new( - mk_manifests_table(futures::stream::iter(vec![ - chunk1.clone(), - chunk2.clone(), - ])) - .await, - ); - let manifest_id = ObjectId::random(); - storage.write_manifests(manifest_id.clone(), manifest).await?; - - let zarr_meta1 = ZarrArrayMetadata { - shape: vec![2, 2, 2], - data_type: DataType::Int32, - chunk_shape: ChunkShape(vec![ - NonZeroU64::new(1).unwrap(), - NonZeroU64::new(1).unwrap(), - NonZeroU64::new(1).unwrap(), - ]), - chunk_key_encoding: ChunkKeyEncoding::Slash, - fill_value: FillValue::Int32(0), - codecs: Codecs("codec".to_string()), - storage_transformers: Some(StorageTransformers( - "tranformers".to_string(), - )), - dimension_names: Some(vec![ - Some("x".to_string()), - Some("y".to_string()), - Some("t".to_string()), - ]), - }; - let manifest_ref = ManifestRef { - object_id: manifest_id, - location: TableRegion(0, 2), - flags: Flags(), - extents: ManifestExtents(vec![]), - }; - let array1_path: PathBuf = "/array1".to_string().into(); - let nodes = vec![ - NodeStructure { - path: "/".into(), - id: 1, - user_attributes: None, - node_data: NodeData::Group, - }, - NodeStructure { - path: array1_path.clone(), - id: array_id, - user_attributes: Some(UserAttributesStructure::Inline( - "{foo:1}".to_string(), - )), - node_data: NodeData::Array(zarr_meta1.clone(), vec![manifest_ref]), - }, - ]; - - let structure = Arc::new(mk_structure_table(nodes.clone())); - let structure_id = ObjectId::random(); - storage.write_structure(structure_id.clone(), structure).await?; - let mut ds = Dataset::update(storage, structure_id); - 
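Aside: both the removed and the re-added versions of this test exercise the same snapshot workflow, which is easy to lose in the diff noise. Every flush() writes an immutable structure table and returns its ObjectId, and Dataset::update reopens the dataset at any id obtained earlier. A minimal sketch of that workflow in Rust, assuming bindings (array_path, zarr_meta, payload) that are not part of the patch:

    // Sketch only: create, commit, reopen, commit again. Both snapshots
    // remain readable because nothing is rewritten in place.
    let mut ds = Dataset::create(Arc::clone(&storage));
    ds.add_group("/".into()).await?;
    ds.add_array(array_path.clone(), zarr_meta.clone()).await?;
    let v1 = ds.flush().await?;

    let mut ds = Dataset::update(Arc::clone(&storage), v1.clone());
    ds.set_chunk(array_path.clone(), ArrayIndices(vec![0]), Some(payload)).await?;
    let v2 = ds.flush().await?;

    let at_v1 = Dataset::update(Arc::clone(&storage), v1); // old snapshot
    let at_v2 = Dataset::update(Arc::clone(&storage), v2); // new snapshot

The assertions against previous_structure_id later in this patch are exactly this property: a flushed version stays addressable after further commits.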
- // retrieve the old array node - let node = ds.get_node(&array1_path).await; - assert_eq!(nodes.get(1), node.as_ref()); - - // add a new array and retrieve its node - ds.add_group("/group".to_string().into()).await?; - - let zarr_meta2 = ZarrArrayMetadata { - shape: vec![3], - data_type: DataType::Int32, - chunk_shape: ChunkShape(vec![NonZeroU64::new(2).unwrap()]), - chunk_key_encoding: ChunkKeyEncoding::Slash, - fill_value: FillValue::Int32(0), - codecs: Codecs("codec".to_string()), - storage_transformers: Some(StorageTransformers( - "tranformers".to_string(), + let storage = InMemoryStorage::new(); + + let array_id = 2; + let chunk1 = ChunkInfo { + node: array_id, + coord: ArrayIndices(vec![0, 0, 0]), + payload: ChunkPayload::Ref(ChunkRef { + id: ObjectId::random(), + offset: 0, + length: 4, + }), + }; + + let chunk2 = ChunkInfo { + node: array_id, + coord: ArrayIndices(vec![0, 0, 1]), + payload: ChunkPayload::Inline(vec![0, 0, 0, 42]), + }; + + let manifest = Arc::new( + mk_manifests_table(futures::stream::iter(vec![ + chunk1.clone(), + chunk2.clone(), + ])) + .await, + ); + let manifest_id = ObjectId::random(); + storage.write_manifests(manifest_id.clone(), manifest).await?; + + let zarr_meta1 = ZarrArrayMetadata { + shape: vec![2, 2, 2], + data_type: DataType::Int32, + chunk_shape: ChunkShape(vec![ + NonZeroU64::new(1).unwrap(), + NonZeroU64::new(1).unwrap(), + NonZeroU64::new(1).unwrap(), + ]), + chunk_key_encoding: ChunkKeyEncoding::Slash, + fill_value: FillValue::Int32(0), + codecs: Codecs("codec".to_string()), + storage_transformers: Some(StorageTransformers("tranformers".to_string())), + dimension_names: Some(vec![ + Some("x".to_string()), + Some("y".to_string()), + Some("t".to_string()), + ]), + }; + let manifest_ref = ManifestRef { + object_id: manifest_id, + location: TableRegion(0, 2), + flags: Flags(), + extents: ManifestExtents(vec![]), + }; + let array1_path: PathBuf = "/array1".to_string().into(); + let nodes = vec![ + NodeStructure { + path: "/".into(), + id: 1, + user_attributes: None, + node_data: NodeData::Group, + }, + NodeStructure { + path: array1_path.clone(), + id: array_id, + user_attributes: Some(UserAttributesStructure::Inline( + "{foo:1}".to_string(), )), - dimension_names: Some(vec![Some("t".to_string())]), - }; + node_data: NodeData::Array(zarr_meta1.clone(), vec![manifest_ref]), + }, + ]; - let new_array_path: PathBuf = "/group/array2".to_string().into(); - ds.add_array(new_array_path.clone(), zarr_meta2.clone()).await?; + let structure = Arc::new(mk_structure_table(nodes.clone())); + let structure_id = ObjectId::random(); + storage.write_structure(structure_id.clone(), structure).await?; + let mut ds = Dataset::update(Arc::new(storage), structure_id); - let node = ds.get_node(&new_array_path).await; - assert_eq!( - node, - Some(NodeStructure { - path: new_array_path.clone(), - id: 4, - user_attributes: None, - node_data: NodeData::Array(zarr_meta2.clone(), vec![]), - }) - ); - - // set user attributes for the new array and retrieve them - ds.set_user_attributes(new_array_path.clone(), Some("{n:42}".to_string())) - .await?; - let node = ds.get_node(&new_array_path).await; - assert_eq!( - node, - Some(NodeStructure { - path: "/group/array2".into(), - id: 4, - user_attributes: Some(UserAttributesStructure::Inline( - "{n:42}".to_string(), - )), - node_data: NodeData::Array(zarr_meta2.clone(), vec![]), - }) - ); - - // set a chunk for the new array and retrieve it - ds.set_chunk( - new_array_path.clone(), - ArrayIndices(vec![0]), - 
Some(ChunkPayload::Inline(vec![0, 0, 0, 7])), - ) - .await?; + // retrieve the old array node + let node = ds.get_node(&array1_path).await; + assert_eq!(nodes.get(1), node.as_ref()); + + // add a new array and retrieve its node + ds.add_group("/group".to_string().into()).await?; - let chunk = ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0])).await; - assert_eq!(chunk, Some(ChunkPayload::Inline(vec![0, 0, 0, 7]))); + let zarr_meta2 = ZarrArrayMetadata { + shape: vec![3], + data_type: DataType::Int32, + chunk_shape: ChunkShape(vec![NonZeroU64::new(2).unwrap()]), + chunk_key_encoding: ChunkKeyEncoding::Slash, + fill_value: FillValue::Int32(0), + codecs: Codecs("codec".to_string()), + storage_transformers: Some(StorageTransformers("tranformers".to_string())), + dimension_names: Some(vec![Some("t".to_string())]), + }; - // retrieve a non initialized chunk of the new array - let non_chunk = - ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![1])).await; - assert_eq!(non_chunk, None); + let new_array_path: PathBuf = "/group/array2".to_string().into(); + ds.add_array(new_array_path.clone(), zarr_meta2.clone()).await?; + + let node = ds.get_node(&new_array_path).await; + assert_eq!( + node, + Some(NodeStructure { + path: new_array_path.clone(), + id: 4, + user_attributes: None, + node_data: NodeData::Array(zarr_meta2.clone(), vec![]), + }) + ); - // update old array use attriutes and check them - ds.set_user_attributes( - array1_path.clone(), - Some("{updated: true}".to_string()), - ) + // set user attributes for the new array and retrieve them + ds.set_user_attributes(new_array_path.clone(), Some("{n:42}".to_string())) .await?; - let node = ds.get_node(&array1_path).await.unwrap(); - assert_eq!( - node.user_attributes, - Some(UserAttributesStructure::Inline("{updated: true}".to_string())) - ); - - // update old array zarr metadata and check it - let new_zarr_meta1 = ZarrArrayMetadata { shape: vec![2, 2, 3], ..zarr_meta1 }; - ds.update_array(array1_path.clone(), new_zarr_meta1).await?; - let node = ds.get_node(&array1_path).await; - if let Some(NodeStructure { - node_data: NodeData::Array(ZarrArrayMetadata { shape, .. }, _), - .. 
-            }) = node
-            {
-                assert_eq!(shape, vec![2, 2, 3]);
-            } else {
-                panic!("Failed to update zarr metadata");
-            }
+        let node = ds.get_node(&new_array_path).await;
+        assert_eq!(
+            node,
+            Some(NodeStructure {
+                path: "/group/array2".into(),
+                id: 4,
+                user_attributes: Some(UserAttributesStructure::Inline(
+                    "{n:42}".to_string(),
+                )),
+                node_data: NodeData::Array(zarr_meta2.clone(), vec![]),
+            })
+        );
+
+        // set a chunk for the new array and retrieve it
+        ds.set_chunk(
+            new_array_path.clone(),
+            ArrayIndices(vec![0]),
+            Some(ChunkPayload::Inline(vec![0, 0, 0, 7])),
+        )
+        .await?;
+
+        let chunk = ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0])).await;
+        assert_eq!(chunk, Some(ChunkPayload::Inline(vec![0, 0, 0, 7])));
 
-            // set old array chunk and check them
-            ds.set_chunk(
-                array1_path.clone(),
-                ArrayIndices(vec![0, 0, 0]),
-                Some(ChunkPayload::Inline(vec![0, 0, 0, 99])),
-            )
+        // retrieve an uninitialized chunk of the new array
+        let non_chunk = ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![1])).await;
+        assert_eq!(non_chunk, None);
+
+        // update the old array's user attributes and check them
+        ds.set_user_attributes(array1_path.clone(), Some("{updated: true}".to_string()))
             .await?;
+        let node = ds.get_node(&array1_path).await.unwrap();
+        assert_eq!(
+            node.user_attributes,
+            Some(UserAttributesStructure::Inline("{updated: true}".to_string()))
+        );
 
-            let chunk =
-                ds.get_chunk_ref(&array1_path, &ArrayIndices(vec![0, 0, 0])).await;
-            assert_eq!(chunk, Some(ChunkPayload::Inline(vec![0, 0, 0, 99])));
+        // update old array zarr metadata and check it
+        let new_zarr_meta1 = ZarrArrayMetadata { shape: vec![2, 2, 3], ..zarr_meta1 };
+        ds.update_array(array1_path.clone(), new_zarr_meta1).await?;
+        let node = ds.get_node(&array1_path).await;
+        if let Some(NodeStructure {
+            node_data: NodeData::Array(ZarrArrayMetadata { shape, .. }, _),
+            ..
+ }) = node + { + assert_eq!(shape, vec![2, 2, 3]); + } else { + panic!("Failed to update zarr metadata"); } + + // set old array chunk and check them + ds.set_chunk( + array1_path.clone(), + ArrayIndices(vec![0, 0, 0]), + Some(ChunkPayload::Inline(vec![0, 0, 0, 99])), + ) + .await?; + + let chunk = ds.get_chunk_ref(&array1_path, &ArrayIndices(vec![0, 0, 0])).await; + assert_eq!(chunk, Some(ChunkPayload::Inline(vec![0, 0, 0, 99]))); + Ok(()) } @@ -893,177 +880,170 @@ mod tests { #[tokio::test(flavor = "multi_thread")] async fn test_dataset_with_updates_and_writes() -> Result<(), Box> { - let storages: [Arc; 2] = [ - Arc::new(InMemoryStorage::new()), - Arc::new(ObjectStorage::new_in_memory_store()), - ]; - for storage in storages { - let mut ds = Dataset::create(Arc::clone(&storage)); + let storage: Arc = Arc::new(InMemoryStorage::new()); + let mut ds = Dataset::create(Arc::clone(&storage)); + + // add a new array and retrieve its node + ds.add_group("/".into()).await?; + let structure_id = ds.flush().await?; + + assert_eq!(Some(structure_id), ds.structure_id); + assert_eq!( + ds.get_node(&"/".into()).await, + Some(NodeStructure { + id: 1, + path: "/".into(), + user_attributes: None, + node_data: NodeData::Group + }) + ); + ds.add_group("/group".into()).await?; + let _structure_id = ds.flush().await?; + assert_eq!( + ds.get_node(&"/".into()).await, + Some(NodeStructure { + id: 1, + path: "/".into(), + user_attributes: None, + node_data: NodeData::Group + }) + ); + assert_eq!( + ds.get_node(&"/group".into()).await, + Some(NodeStructure { + id: 2, + path: "/group".into(), + user_attributes: None, + node_data: NodeData::Group + }) + ); + let zarr_meta = ZarrArrayMetadata { + shape: vec![1, 1, 2], + data_type: DataType::Int32, + chunk_shape: ChunkShape(vec![NonZeroU64::new(2).unwrap()]), + chunk_key_encoding: ChunkKeyEncoding::Slash, + fill_value: FillValue::Int32(0), + codecs: Codecs("codec".to_string()), + storage_transformers: Some(StorageTransformers("tranformers".to_string())), + dimension_names: Some(vec![Some("t".to_string())]), + }; - // add a new array and retrieve its node - ds.add_group("/".into()).await?; - let structure_id = ds.flush().await?; + let new_array_path: PathBuf = "/group/array1".to_string().into(); + ds.add_array(new_array_path.clone(), zarr_meta.clone()).await?; + + // wo commit to test the case of a chunkless array + let _structure_id = ds.flush().await?; + + // we set a chunk in a new array + ds.set_chunk( + new_array_path.clone(), + ArrayIndices(vec![0, 0, 0]), + Some(ChunkPayload::Inline(b"hello".into())), + ) + .await?; + + let _structure_id = ds.flush().await?; + assert_eq!( + ds.get_node(&"/".into()).await, + Some(NodeStructure { + id: 1, + path: "/".into(), + user_attributes: None, + node_data: NodeData::Group + }) + ); + assert_eq!( + ds.get_node(&"/group".into()).await, + Some(NodeStructure { + id: 2, + path: "/group".into(), + user_attributes: None, + node_data: NodeData::Group + }) + ); + assert!(matches!( + ds.get_node(&new_array_path).await, + Some(NodeStructure { + id: 3, + path, + user_attributes: None, + node_data: NodeData::Array(meta, manifests) + }) if path == new_array_path && meta == zarr_meta.clone() && manifests.len() == 1 + )); + assert_eq!( + ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await, + Some(ChunkPayload::Inline(b"hello".into())) + ); - assert_eq!(Some(structure_id), ds.structure_id); - assert_eq!( - ds.get_node(&"/".into()).await, - Some(NodeStructure { - id: 1, - path: "/".into(), - user_attributes: None, - 
node_data: NodeData::Group - }) - ); - ds.add_group("/group".into()).await?; - let _structure_id = ds.flush().await?; - assert_eq!( - ds.get_node(&"/".into()).await, - Some(NodeStructure { - id: 1, - path: "/".into(), - user_attributes: None, - node_data: NodeData::Group - }) - ); - assert_eq!( - ds.get_node(&"/group".into()).await, - Some(NodeStructure { - id: 2, - path: "/group".into(), - user_attributes: None, - node_data: NodeData::Group - }) - ); - let zarr_meta = ZarrArrayMetadata { - shape: vec![1, 1, 2], - data_type: DataType::Int32, - chunk_shape: ChunkShape(vec![NonZeroU64::new(2).unwrap()]), - chunk_key_encoding: ChunkKeyEncoding::Slash, - fill_value: FillValue::Int32(0), - codecs: Codecs("codec".to_string()), - storage_transformers: Some(StorageTransformers( - "tranformers".to_string(), - )), - dimension_names: Some(vec![Some("t".to_string())]), - }; + // we modify a chunk in an existing array + ds.set_chunk( + new_array_path.clone(), + ArrayIndices(vec![0, 0, 0]), + Some(ChunkPayload::Inline(b"bye".into())), + ) + .await?; + + // we add a new chunk in an existing array + ds.set_chunk( + new_array_path.clone(), + ArrayIndices(vec![0, 0, 1]), + Some(ChunkPayload::Inline(b"new chunk".into())), + ) + .await?; + + let previous_structure_id = ds.flush().await?; + assert_eq!( + ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await, + Some(ChunkPayload::Inline(b"bye".into())) + ); + assert_eq!( + ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 1])).await, + Some(ChunkPayload::Inline(b"new chunk".into())) + ); - let new_array_path: PathBuf = "/group/array1".to_string().into(); - ds.add_array(new_array_path.clone(), zarr_meta.clone()).await?; + // we delete a chunk + ds.set_chunk(new_array_path.clone(), ArrayIndices(vec![0, 0, 1]), None).await?; - // wo commit to test the case of a chunkless array - let _structure_id = ds.flush().await?; + let new_meta = ZarrArrayMetadata { shape: vec![1, 1, 1], ..zarr_meta }; + // we change zarr metadata + ds.update_array(new_array_path.clone(), new_meta.clone()).await?; - // we set a chunk in a new array - ds.set_chunk( - new_array_path.clone(), - ArrayIndices(vec![0, 0, 0]), - Some(ChunkPayload::Inline(b"hello".into())), - ) + // we change user attributes metadata + ds.set_user_attributes(new_array_path.clone(), Some("{foo:42}".to_string())) .await?; - let _structure_id = ds.flush().await?; - assert_eq!( - ds.get_node(&"/".into()).await, - Some(NodeStructure { - id: 1, - path: "/".into(), - user_attributes: None, - node_data: NodeData::Group - }) - ); - assert_eq!( - ds.get_node(&"/group".into()).await, - Some(NodeStructure { - id: 2, - path: "/group".into(), - user_attributes: None, - node_data: NodeData::Group - }) - ); - assert!(matches!( - ds.get_node(&new_array_path).await, - Some(NodeStructure { - id: 3, - path, - user_attributes: None, - node_data: NodeData::Array(meta, manifests) - }) if path == new_array_path && meta == zarr_meta.clone() && manifests.len() == 1 - )); - assert_eq!( - ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await, - Some(ChunkPayload::Inline(b"hello".into())) - ); - - // we modify a chunk in an existing array - ds.set_chunk( - new_array_path.clone(), - ArrayIndices(vec![0, 0, 0]), - Some(ChunkPayload::Inline(b"bye".into())), - ) - .await?; + let structure_id = ds.flush().await?; + let ds = Dataset::update(Arc::clone(&storage), structure_id); - // we add a new chunk in an existing array - ds.set_chunk( - new_array_path.clone(), - ArrayIndices(vec![0, 0, 1]), - 
Some(ChunkPayload::Inline(b"new chunk".into())), - ) - .await?; + assert_eq!( + ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await, + Some(ChunkPayload::Inline(b"bye".into())) + ); + assert_eq!( + ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 1])).await, + None + ); + assert!(matches!( + ds.get_node(&new_array_path).await, + Some(NodeStructure { + id: 3, + path, + user_attributes: Some(atts), + node_data: NodeData::Array(meta, manifests) + }) if path == new_array_path && meta == new_meta.clone() && manifests.len() == 1 && atts == UserAttributesStructure::Inline("{foo:42}".to_string()) + )); + + //test the previous version is still alive + let ds = Dataset::update(Arc::clone(&storage), previous_structure_id); + assert_eq!( + ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await, + Some(ChunkPayload::Inline(b"bye".into())) + ); + assert_eq!( + ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 1])).await, + Some(ChunkPayload::Inline(b"new chunk".into())) + ); - let previous_structure_id = ds.flush().await?; - assert_eq!( - ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await, - Some(ChunkPayload::Inline(b"bye".into())) - ); - assert_eq!( - ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 1])).await, - Some(ChunkPayload::Inline(b"new chunk".into())) - ); - - // we delete a chunk - ds.set_chunk(new_array_path.clone(), ArrayIndices(vec![0, 0, 1]), None) - .await?; - - let new_meta = ZarrArrayMetadata { shape: vec![1, 1, 1], ..zarr_meta }; - // we change zarr metadata - ds.update_array(new_array_path.clone(), new_meta.clone()).await?; - - // we change user attributes metadata - ds.set_user_attributes(new_array_path.clone(), Some("{foo:42}".to_string())) - .await?; - - let structure_id = ds.flush().await?; - let ds = Dataset::update(Arc::clone(&storage), structure_id); - - assert_eq!( - ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await, - Some(ChunkPayload::Inline(b"bye".into())) - ); - assert_eq!( - ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 1])).await, - None - ); - assert!(matches!( - ds.get_node(&new_array_path).await, - Some(NodeStructure { - id: 3, - path, - user_attributes: Some(atts), - node_data: NodeData::Array(meta, manifests) - }) if path == new_array_path && meta == new_meta.clone() && manifests.len() == 1 && atts == UserAttributesStructure::Inline("{foo:42}".to_string()) - )); - - //test the previous version is still alive - let ds = Dataset::update(Arc::clone(&storage), previous_structure_id); - assert_eq!( - ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 0])).await, - Some(ChunkPayload::Inline(b"bye".into())) - ); - assert_eq!( - ds.get_chunk_ref(&new_array_path, &ArrayIndices(vec![0, 0, 1])).await, - Some(ChunkPayload::Inline(b"new chunk".into())) - ); - } Ok(()) } } From ffff1ce95a11157a57c7973e975387986be446af Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 16 Aug 2024 15:05:15 -0600 Subject: [PATCH 16/17] remove todo --- src/lib.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index db5b9584..e2a804d6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -567,8 +567,6 @@ pub enum StorageError { NotFound(ObjectId), #[error("synchronization error on the Storage instance")] Deadlock, - // TODO: pattern match on ObjectStore error - // combine with StorageLayerError #[error("Error contacting object store {0}")] ObjectStore(#[from] object_store::Error), #[error("Error reading or writing to/from parquet files: {0}")] From 
65838935f46083dedc737c78b2138111edb19299 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 16 Aug 2024 15:54:51 -0600 Subject: [PATCH 17/17] Address Comments --- src/lib.rs | 14 +++++++------- src/storage.rs | 52 +++++++++++++++++++++++--------------------------- 2 files changed, 31 insertions(+), 35 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index e2a804d6..777b5ecc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -567,16 +567,16 @@ pub enum StorageError { NotFound(ObjectId), #[error("synchronization error on the Storage instance")] Deadlock, - #[error("Error contacting object store {0}")] + #[error("error contacting object store {0}")] ObjectStore(#[from] object_store::Error), - #[error("Error reading or writing to/from parquet files: {0}")] + #[error("error reading or writing to/from parquet files: {0}")] ParquetError(#[from] parquet_errors::ParquetError), - #[error("Error reading RecordBatch from parquet files.")] - BadRecordBatchRead, - #[error("Bad byte range for chunk read `{0:?}`.")] - BadByteRange(Option>), - #[error("I/O error: `{0:?}`")] + #[error("error reading RecordBatch from parquet file {0}.")] + BadRecordBatchRead(String), + #[error("i/o error: `{0:?}`")] IOError(#[from] io::Error), + #[error("bad path: {0}")] + BadPath(Path), } /// Fetch and write the parquet files that represent the dataset in object store diff --git a/src/storage.rs b/src/storage.rs index c0dadb59..130773d8 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -7,21 +7,26 @@ use std::{ }; use crate::{ - AttributesTable, ChunkOffset, ManifestsTable, ObjectId, Storage, StorageError, + AttributesTable, ChunkOffset, ManifestsTable, ObjectId, Path, Storage, StorageError, StructureTable, }; use arrow::array::RecordBatch; use async_trait::async_trait; use bytes::Bytes; use futures::StreamExt; -use object_store::{local::LocalFileSystem, memory::InMemory, path::Path, ObjectStore}; +use object_store::{ + local::LocalFileSystem, memory::InMemory, path::Path as ObjectPath, ObjectStore, +}; +use parquet::arrow::async_writer::AsyncArrowWriter; +use parquet::arrow::{ + async_reader::ParquetObjectReader, ParquetRecordBatchStreamBuilder, +}; const STRUCTURE_PREFIX: &str = "s/"; const MANIFEST_PREFIX: &str = "m/"; // const ATTRIBUTES_PREFIX: &str = "a/"; const CHUNK_PREFIX: &str = "c/"; -// #[derive(Default)] pub struct ObjectStorage { store: Arc, prefix: String, @@ -31,16 +36,12 @@ impl ObjectStorage { pub fn new_in_memory_store() -> ObjectStorage { ObjectStorage { store: Arc::new(InMemory::new()), prefix: "".into() } } - pub fn new_local_store( - prefix: std::path::PathBuf, - ) -> Result { + pub fn new_local_store(prefix: &Path) -> Result { create_dir_all(prefix.as_path())?; + let prefix = prefix.to_str().ok_or(StorageError::BadPath(prefix.to_owned()))?; Ok(ObjectStorage { - store: Arc::new(LocalFileSystem::new_with_prefix(&prefix)?), - // We rely on `new_with_prefix` to create the `prefix` directory - // if it doesn't exist. It will also add the prefix to any path - // so we set ObjectStorate::prefix to an empty string. 
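Aside on the new_local_store rewrite in this hunk: with new_with_prefix the store itself was rooted at the directory, so ObjectStorage::prefix had to stay empty; the rewritten version keeps the store unrooted and folds the directory into the key prefix instead, the same way the S3 constructors use the prefix field. A rough sketch of the two styles (the /data/repo path is illustrative, not from the patch):

    use object_store::{local::LocalFileSystem, path::Path as ObjectPath, ObjectStore};

    // (a) rooted store: keys are resolved relative to the directory
    let rooted = LocalFileSystem::new_with_prefix("/data/repo")?;
    let _ = rooted.get(&ObjectPath::from("m/abc.parquet")).await?; // /data/repo/m/abc.parquet

    // (b) unrooted store, as after this patch: the directory lives inside
    // every key that get_path assembles from ObjectStorage::prefix
    let unrooted = LocalFileSystem::new();
    let _ = unrooted.get(&ObjectPath::from("data/repo/m/abc.parquet")).await?; // same file, on Unix

The payoff is uniformity: one prefix mechanism covers memory, local, and S3 stores instead of a special case for the local backend.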
-            prefix: "".to_string(),
+            store: Arc::new(LocalFileSystem::new()),
+            prefix: prefix.to_owned().to_string(),
         })
     }
 
     pub fn new_s3_store_from_env(
@@ -69,7 +70,7 @@ impl ObjectStorage {
         Ok(ObjectStorage { store: Arc::new(store), prefix: prefix.into() })
     }
 
-    fn get_path(&self, file_prefix: &str, ObjectId(asu8): &ObjectId) -> Path {
+    fn get_path(&self, file_prefix: &str, ObjectId(asu8): &ObjectId) -> ObjectPath {
         // TODO: be careful about allocation here
         let path = format!(
             "{}/{}/{}.parquet",
@@ -77,33 +78,24 @@ impl ObjectStorage {
             file_prefix,
             BASE64_URL_SAFE.encode(asu8)
         );
-        Path::from(path)
+        ObjectPath::from(path)
     }
 
-    async fn read_parquet(&self, path: &Path) -> Result<RecordBatch, StorageError> {
-        use parquet::arrow::{
-            async_reader::ParquetObjectReader, ParquetRecordBatchStreamBuilder,
-        };
-
-        // TODO: avoid this metadata read since we are always reading the whole thing.
+    async fn read_parquet(&self, path: &ObjectPath) -> Result<RecordBatch, StorageError> {
+        // FIXME: avoid this metadata read since we are always reading the whole thing.
         let meta = self.store.head(path).await?;
         let reader = ParquetObjectReader::new(Arc::clone(&self.store), meta);
         let mut builder = ParquetRecordBatchStreamBuilder::new(reader).await?.build()?;
 
         // TODO: do we always have only one batch ever? Assert that
        let maybe_batch = builder.next().await;
-        if let Some(batch) = maybe_batch {
-            Ok(batch?)
-        } else {
-            Err(StorageError::BadRecordBatchRead)
-        }
+        Ok(maybe_batch.ok_or(StorageError::BadRecordBatchRead(path.to_string()))??)
     }
 
     async fn write_parquet(
         &self,
-        path: &Path,
+        path: &ObjectPath,
         batch: &RecordBatch,
     ) -> Result<(), StorageError> {
-        use parquet::arrow::async_writer::AsyncArrowWriter;
         let mut buffer = Vec::new();
         let mut writer = AsyncArrowWriter::try_new(&mut buffer, batch.schema(), None)?;
         writer.write(batch).await?;
@@ -335,6 +327,7 @@ impl Storage for InMemoryStorage {
 
 #[cfg(test)]
 mod tests {
+    use std::env::temp_dir;
     use std::sync::Arc;
 
     use crate::ObjectId;
@@ -358,15 +351,18 @@ mod tests {
     async fn test_read_write_parquet_object_storage() {
         // simple test to make sure we can speak to all stores
         let batch = make_record_batch();
-        let prefix: String = rand::thread_rng()
+        let mut prefix = format!("{}/", temp_dir().to_str().unwrap());
+        let rand_suffix: String = rand::thread_rng()
             .sample_iter(&Alphanumeric)
             .take(7)
             .map(char::from)
             .collect();
+        prefix.push_str(rand_suffix.as_str());
 
         for store in [
             ObjectStorage::new_in_memory_store(),
-            ObjectStorage::new_local_store(prefix.clone().into()).unwrap(),
+            // FIXME: figure out local and minio tests on CI
+            // ObjectStorage::new_local_store(&prefix.into()).unwrap(),
             // ObjectStorage::new_s3_store_from_env("testbucket".to_string()).unwrap(),
             // ObjectStorage::new_s3_store_with_config("testbucket".to_string(), prefix)
             //     .unwrap(),
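A final aside on the key layout: get_path maps every ObjectId to one object key under a per-table prefix (s/ for structure tables, m/ for manifests, c/ for chunks). A sketch of the resulting keys, assuming a repository prefix "repo" and a 16-byte all-zero id (the id width is an assumption; the patch never fixes it):

    use base64::prelude::*;
    use object_store::path::Path as ObjectPath;

    // Mirrors get_path: "<prefix>/<table prefix><base64(id)>.parquet".
    // ObjectPath::from drops the empty segment that the trailing slash in
    // constants like MANIFEST_PREFIX ("m/") would otherwise leave behind.
    fn key_for(prefix: &str, table_prefix: &str, id: &[u8]) -> ObjectPath {
        ObjectPath::from(format!(
            "{}/{}/{}.parquet",
            prefix,
            table_prefix,
            BASE64_URL_SAFE.encode(id)
        ))
    }

    // key_for("repo", "m/", &[0u8; 16])
    //   => "repo/m/AAAAAAAAAAAAAAAAAAAAAA==.parquet"

Because URL-safe base64 never emits '/' or '+', ids cannot introduce extra path segments, and each table remains a single flat listing under its prefix.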