diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..725ff9c --- /dev/null +++ b/.editorconfig @@ -0,0 +1,25 @@ +root = true + +[*.md] +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +indent_style = space +indent_size = 2 +insert_final_newline = true + +[*.rs] +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +indent_style = space +indent_size = 4 +insert_final_newline = true + +[*.sql] +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +indent_style = space +indent_size = 2 +insert_final_newline = true diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a821aa9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ + +/target +**/*.rs.bk +Cargo.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..a2668ef --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "n5" +version = "0.1.0" +license = "MIT/Apache-2.0" +authors = ["Andrew Champion "] +description = "Rust implementation of the N5 tensor file system format" + +[features] +default = ["bzip", "gzip", "lz", "xz"] + +bzip = ["bzip2"] +gzip = ["flate2"] +lz = ["lz4"] +xz = ["xz2"] + +[dependencies] +byteorder = "1" +fs2 = "0.4" +lazy_static = "1.0" +serde = "1.0" +serde_derive = "1.0" +serde_json = "1.0" +regex = "0.2" + +bzip2 = { version = "0.3", optional = true } +# Use pure-rust Gzip for convenience. TODO: evaluate libflate as alternative. +flate2 = { version = "1.0", features = ["rust_backend"], default-features = false, optional = true } +lz4 = { version = "1.0", optional = true } +xz2 = { version = "0.1", optional = true } + +[dev-dependencies] +tempdir = "0.3" diff --git a/LICENSE-APACHE b/LICENSE-APACHE new file mode 100644 index 0000000..8f71f43 --- /dev/null +++ b/LICENSE-APACHE @@ -0,0 +1,202 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/LICENSE-MIT b/LICENSE-MIT new file mode 100644 index 0000000..e2d6880 --- /dev/null +++ b/LICENSE-MIT @@ -0,0 +1,19 @@ +Copyright (c) 2018 Andrew S. Champion + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..d577a87 --- /dev/null +++ b/README.md @@ -0,0 +1,28 @@ +# N5 + +A (mostly pure) Rust implementation of the [N5 "Not HDF5" n-dimensional tensor file system storage format](https://github.com/saalfeldlab/n5) created by the Saalfeld lab at Janelia Research Campus. + +**NOTE: THIS IMPLEMENTATION IS NOT YET FUNCTIONAL. THIS NOTICE WILL BE REMOVED ONCE IT IS.** + +## Differences from Java N5 +- Dataset paths are relative. The root path in a dataset is `""`, not `"/"`. +- Dataset paths are more strict. Calling methods with paths outside the dataset, e.g., `".."`, will return a `Result::Err`. + +## Major TODOs +- No writing +- Mixture of `T` parameter as scalar type versus `Vec<T>` +- Kludge `Foo` type +- Generally, direct translation from Java is unidiomatic and a mess of boxes + +## License + +Licensed under either of + +- Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) +- MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) + +at your option. + +### Contribution + +Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions. diff --git a/src/compression/bzip.rs b/src/compression/bzip.rs new file mode 100644 index 0000000..aa54635 --- /dev/null +++ b/src/compression/bzip.rs @@ -0,0 +1,56 @@ +use std::io::Read; + +use bzip2::read::BzDecoder; + +use super::{ + Compression, + Bzip2Parameters, +}; + + +pub struct Bzip2Compression; + +impl Bzip2Compression { + pub fn new(_params: &Bzip2Parameters) -> Bzip2Compression { + Bzip2Compression + } +} + +impl<'a, R: Read + 'a> Compression<'a, R> for Bzip2Compression { + fn decoder(&self, r: R) -> Box<Read + 'a> { + Box::new(BzDecoder::new(r)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use compression::CompressionType; + + // Example from the n5 documentation spec.
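+ // Layout: a header of mode (i16, `0` = default), number of dimensions (i16), and each
+ // dimension size as a big-endian i32 (here 1, 2, 3), followed by the bzip2-compressed
+ // payload of big-endian i16 values 1 through 6 (see `DefaultBlockReader::read_block`).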
+ const TEST_BLOCK_I16_BZIP2: [u8; 59] = [ + 0x00, 0x00, + 0x00, 0x03, + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x03, + 0x42, 0x5a, 0x68, 0x39, + 0x31, 0x41, 0x59, 0x26, + 0x53, 0x59, 0x02, 0x3e, + 0x0d, 0xd2, 0x00, 0x00, + 0x00, 0x40, 0x00, 0x7f, + 0x00, 0x20, 0x00, 0x31, + 0x0c, 0x01, 0x0d, 0x31, + 0xa8, 0x73, 0x94, 0x33, + 0x7c, 0x5d, 0xc9, 0x14, + 0xe1, 0x42, 0x40, 0x08, + 0xf8, 0x37, 0x48, + ]; + + #[test] + fn test_read_doc_spec_block() { + ::tests::test_read_doc_spec_block( + &TEST_BLOCK_I16_BZIP2[..], + CompressionType::Bzip2(Bzip2Parameters::default())); + } +} diff --git a/src/compression/gzip.rs b/src/compression/gzip.rs new file mode 100644 index 0000000..37f4394 --- /dev/null +++ b/src/compression/gzip.rs @@ -0,0 +1,53 @@ +use std::io::Read; + +use flate2::read::GzDecoder; + +use super::{ + Compression, + GzipParameters, +}; + + +pub struct GzipCompression; + +impl GzipCompression { + pub fn new(_params: &GzipParameters) -> GzipCompression { + GzipCompression + } +} + +impl<'a, R: Read + 'a> Compression<'a, R> for GzipCompression { + fn decoder(&self, r: R) -> Box { + Box::new(GzDecoder::new(r)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use compression::CompressionType; + + // Example from the n5 documentation spec. + const TEST_BLOCK_I16_GZIP: [u8; 48] = [ + 0x00, 0x00, + 0x00, 0x03, + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x03, + 0x1f, 0x8b, 0x08, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x63, 0x60, + 0x64, 0x60, 0x62, 0x60, + 0x66, 0x60, 0x61, 0x60, + 0x65, 0x60, 0x03, 0x00, + 0xaa, 0xea, 0x6d, 0xbf, + 0x0c, 0x00, 0x00, 0x00, + ]; + + #[test] + fn test_read_doc_spec_block() { + ::tests::test_read_doc_spec_block( + &TEST_BLOCK_I16_GZIP[..], + CompressionType::Gzip(GzipParameters::default())); + } +} diff --git a/src/compression/lz.rs b/src/compression/lz.rs new file mode 100644 index 0000000..1216f61 --- /dev/null +++ b/src/compression/lz.rs @@ -0,0 +1,52 @@ +use std::io::Read; + +use lz4::Decoder; + +use super::{ + Compression, + Lz4Parameters, +}; + + +pub struct Lz4Compression; + +impl Lz4Compression { + pub fn new(_params: &Lz4Parameters) -> Lz4Compression { + Lz4Compression + } +} + +impl<'a, R: Read + 'a> Compression<'a, R> for Lz4Compression { + fn decoder(&self, r: R) -> Box { + Box::new(Decoder::new(r).expect("TODO: LZ4 returns a result here")) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use compression::CompressionType; + + const TEST_BLOCK_I16_LZ4: [u8; 47] = [ + 0x00, 0x00, + 0x00, 0x03, + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x03, + 0x04, 0x22, 0x4d, 0x18, + 0x64, 0x40, 0xa7, 0x0c, + 0x00, 0x00, 0x80, 0x00, + 0x01, 0x00, 0x02, 0x00, + 0x03, 0x00, 0x04, 0x00, + 0x05, 0x00, 0x06, 0x00, + 0x00, 0x00, 0x00, 0x41, + 0x37, 0x33, 0x08, + ]; + + #[test] + fn test_read_doc_spec_block() { + ::tests::test_read_doc_spec_block( + &TEST_BLOCK_I16_LZ4[..], + CompressionType::Lz4(Lz4Parameters::default())); + } +} diff --git a/src/compression/mod.rs b/src/compression/mod.rs new file mode 100644 index 0000000..ccd0037 --- /dev/null +++ b/src/compression/mod.rs @@ -0,0 +1,126 @@ +use std::io::Read; + + +pub mod raw; +#[cfg(feature = "bzip")] +pub mod bzip; +#[cfg(feature = "gzip")] +pub mod gzip; +#[cfg(feature = "lz")] +pub mod lz; +#[cfg(feature = "xz")] +pub mod xz; + + +pub trait Compression<'a, R: Read + 'a> { + fn decoder(&self, r: R) -> Box; +} + +#[derive(Serialize, Deserialize, PartialEq, Debug)] +#[serde(rename_all = "lowercase")] +#[serde(tag = 
"type")] +pub enum CompressionType { + Raw, + Bzip2(Bzip2Parameters), + Gzip(GzipParameters), + Lz4(Lz4Parameters), + Xz(XzParameters), +} + +impl CompressionType { + pub fn get_reader<'a, R: Read + 'a>(&self) -> Box> { + #[allow(unreachable_patterns)] // Ignore the default case. + match *self { + CompressionType::Raw => Box::new(raw::RawCompression), + + #[cfg(feature = "bzip")] + CompressionType::Bzip2(ref params) => + Box::new(bzip::Bzip2Compression::new(params)), + + #[cfg(feature = "gzip")] + CompressionType::Gzip(ref params) => + Box::new(gzip::GzipCompression::new(params)), + + #[cfg(feature = "xz")] + CompressionType::Xz(ref params) => + Box::new(xz::XzCompression::new(params)), + + #[cfg(feature = "lz")] + CompressionType::Lz4(ref params) => + Box::new(lz::Lz4Compression::new(params)), + + // Default case to panic if the requested compression feature is not + // enabled. + _ => unimplemented!(), + } + } +} + +#[derive(Serialize, Deserialize, PartialEq, Debug)] +#[serde(rename_all = "camelCase")] +pub struct Bzip2Parameters { + #[serde(default = "default_bzip_block_size")] + block_size: u8, +} + +// Will never understand why serde decided against $expr defaults. Ugh. +fn default_bzip_block_size() -> u8 {8} + +impl Default for Bzip2Parameters { + fn default() -> Bzip2Parameters { + Bzip2Parameters { + block_size: default_bzip_block_size(), + } + } +} + +#[derive(Serialize, Deserialize, PartialEq, Debug)] +#[serde(rename_all = "camelCase")] +pub struct GzipParameters { + #[serde(default = "default_gzip_level")] + level: i32, +} + +fn default_gzip_level() -> i32 {-1} + +impl Default for GzipParameters { + fn default() -> GzipParameters { + GzipParameters { + level: default_gzip_level(), + } + } +} + +#[derive(Serialize, Deserialize, PartialEq, Debug)] +#[serde(rename_all = "camelCase")] +pub struct Lz4Parameters { + #[serde(default = "default_lz4_block_size")] + block_size: i32, +} + +fn default_lz4_block_size() -> i32 {65_536} + +impl Default for Lz4Parameters { + fn default() -> Lz4Parameters { + Lz4Parameters { + block_size: default_lz4_block_size(), + } + } +} + +#[derive(Serialize, Deserialize, PartialEq, Debug)] +#[serde(rename_all = "camelCase")] +pub struct XzParameters { + #[serde(default = "default_xz_preset")] + preset: i32, +} + +fn default_xz_preset() -> i32 {6} + +impl Default for XzParameters { + fn default() -> XzParameters { + XzParameters { + preset: default_xz_preset(), + } + } +} diff --git a/src/compression/raw.rs b/src/compression/raw.rs new file mode 100644 index 0000000..13f5aa9 --- /dev/null +++ b/src/compression/raw.rs @@ -0,0 +1,37 @@ +use std::io::Read; + +use super::Compression; + + +pub struct RawCompression; + +impl<'a, R: Read + 'a> Compression<'a, R> for RawCompression { + fn decoder(&self, r: R) -> Box { + Box::new(r) + } +} + +#[cfg(test)] +mod tests { + use compression::CompressionType; + + // Example from the n5 documentation spec. 
+ const TEST_BLOCK_I16_RAW: [u8; 28] = [ + 0x00, 0x00, + 0x00, 0x03, + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x03, + 0x00, 0x01, + 0x00, 0x02, + 0x00, 0x03, + 0x00, 0x04, + 0x00, 0x05, + 0x00, 0x06, + ]; + + #[test] + fn test_read_doc_spec_block() { + ::tests::test_read_doc_spec_block(&TEST_BLOCK_I16_RAW[..], CompressionType::Raw); + } +} diff --git a/src/compression/xz.rs b/src/compression/xz.rs new file mode 100644 index 0000000..5ab8ce4 --- /dev/null +++ b/src/compression/xz.rs @@ -0,0 +1,62 @@ +use std::io::Read; + +use xz2::read::XzDecoder; + +use super::{ + Compression, + XzParameters, +}; + + +pub struct XzCompression; + +impl XzCompression { + pub fn new(_params: &XzParameters) -> XzCompression { + XzCompression + } +} + +impl<'a, R: Read + 'a> Compression<'a, R> for XzCompression { + fn decoder(&self, r: R) -> Box { + Box::new(XzDecoder::new(r)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use compression::CompressionType; + + // Example from the n5 documentation spec. + const TEST_BLOCK_I16_XZ: [u8; 84] = [ + 0x00, 0x00, + 0x00, 0x03, + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x03, + 0xfd, 0x37, 0x7a, 0x58, + 0x5a, 0x00, 0x00, 0x04, + 0xe6, 0xd6, 0xb4, 0x46, + 0x02, 0x00, 0x21, 0x01, + 0x16, 0x00, 0x00, 0x00, + 0x74, 0x2f, 0xe5, 0xa3, + 0x01, 0x00, 0x0b, 0x00, + 0x01, 0x00, 0x02, 0x00, + 0x03, 0x00, 0x04, 0x00, + 0x05, 0x00, 0x06, 0x00, + 0x0d, 0x03, 0x09, 0xca, + 0x34, 0xec, 0x15, 0xa7, + 0x00, 0x01, 0x24, 0x0c, + 0xa6, 0x18, 0xd8, 0xd8, + 0x1f, 0xb6, 0xf3, 0x7d, + 0x01, 0x00, 0x00, 0x00, + 0x00, 0x04, 0x59, 0x5a, + ]; + + #[test] + fn test_read_doc_spec_block() { + ::tests::test_read_doc_spec_block( + &TEST_BLOCK_I16_XZ[..], + CompressionType::Xz(XzParameters::default())); + } +} diff --git a/src/filesystem.rs b/src/filesystem.rs new file mode 100644 index 0000000..69185cd --- /dev/null +++ b/src/filesystem.rs @@ -0,0 +1,325 @@ +use std::fs::{ + self, + File, +}; +use std::io::{ + Error, + ErrorKind, + BufReader, + BufWriter, + Read, + Result, + Seek, + SeekFrom, +}; +use std::path::{ + PathBuf, +}; +use std::str::FromStr; + +use fs2::FileExt; +use serde_json::{ + self, + Value, +}; + +use ::{ + DataBlock, + DataBlockCreator, + DataType, + DatasetAttributes, + DefaultBlockReader, + N5Reader, + N5Writer, + Version, +}; + + +const ATTRIBUTES_FILE: &str = "attributes.json"; + + +pub struct N5Filesystem { + base_path: PathBuf, +} + +impl N5Filesystem { + pub fn open(base_path: &str) -> Result { + let reader = N5Filesystem { + base_path: PathBuf::from(base_path), + }; + + if reader.exists("") { + let version = reader.get_version()?; + + if !::VERSION.is_compatible(&version) { + return Err(Error::new(ErrorKind::Other, "TODO: Incompatible version")) + } + } + + Ok(reader) + } + + pub fn open_or_create(base_path: &str) -> Result { + let reader = N5Filesystem { + base_path: PathBuf::from(base_path), + }; + + fs::create_dir_all(base_path)?; + + if reader.get_version().map(|v| !v.is_compatible(&::VERSION)).unwrap_or(false) { + return Err(Error::new(ErrorKind::Other, "TODO: Incompatible version")) + } else { + reader.set_attribute("", ::VERSION_ATTRIBUTE_KEY.to_owned(), ::VERSION.to_string())?; + } + + Ok(reader) + } + + pub fn get_attributes(&self, path_name: &str) -> Result { + if self.exists(path_name) { + let attr_path = self.base_path.join(path_name).join(ATTRIBUTES_FILE); + + if attr_path.exists() && attr_path.is_file() { + let file = File::open(attr_path)?; + file.lock_shared()?; + let reader = 
BufReader::new(file); + Ok(serde_json::from_reader(reader)?) + } else { + Ok(json!({})) + } + } else { + Err(Error::new(ErrorKind::NotFound, "Path does not exist")) + } + } + + fn get_path(&self, path_name: &str) -> Result { + // Note: cannot use `canonicalize` on both the constructed dataset path + // and `base_path` and check `starts_with`, because `canonicalize` also + // requires the path exist. + use std::path::Component; + + // TODO: cleanup? + let data_path = PathBuf::from(path_name); + if data_path.is_relative() { + let mut nest: i32 = 0; + let mut interior = true; + for component in data_path.components() { + match component { + Component::Prefix(_) => unreachable!(), // Not an absolute path. + Component::RootDir => unreachable!(), // Not an absolute path. + Component::CurDir => continue, + Component::ParentDir => nest -= 1, + Component::Normal(_) => nest += 1, + }; + + if nest < 0 { + interior = false + } + } + + if interior { + return Ok(self.base_path.join(path_name)) + } + } + + Err(Error::new(ErrorKind::NotFound, "Path name is outside this N5 filesystem")) + } + + fn get_data_block_path(&self, path_name: &str, grid_position: &[i64]) -> Result { + let mut path = self.get_path(path_name)?; + for coord in grid_position { + path.push(coord.to_string()); + } + Ok(path) + } + + fn get_attributes_path(&self, path_name: &str) -> Result { + let mut path = self.get_path(path_name)?; + path.push(ATTRIBUTES_FILE); + Ok(path) + } +} + +impl N5Reader for N5Filesystem { + fn get_version(&self) -> Result { + // TODO: dedicated error type should clean this up. + Ok(Version::from_str(self + .get_attributes("")? + .get(::VERSION_ATTRIBUTE_KEY) + .ok_or_else(|| Error::new(ErrorKind::NotFound, "Version attribute not present"))? + .as_str().unwrap_or("") + ).unwrap()) + } + + fn get_dataset_attributes(&self, path_name: &str) -> Result { + let attr_path = self.get_attributes_path(path_name)?; + let reader = BufReader::new(File::open(attr_path)?); + Ok(serde_json::from_reader(reader)?) + } + + fn exists(&self, path_name: &str) -> bool { + let target = self.base_path.join(path_name); + target.is_dir() + } + + fn read_block( + &self, + path_name: &str, + data_attrs: &DatasetAttributes, + grid_position: Vec + ) -> Result>>>> + where DataType: DataBlockCreator> { + let block_file = self.get_data_block_path(path_name, &grid_position)?; + if block_file.is_file() { + let file = File::open(block_file)?; + file.lock_shared()?; + let reader = BufReader::new(file); + Ok(Some(<::Foo as DefaultBlockReader>::read_block( + reader, + data_attrs, + grid_position).expect("read_block failed"))) + } else { + Ok(None) + } + } + + fn list(&self, path_name: &str) -> Result> { + // TODO: shouldn't do this in a closure to not equivocate errors with Nones. + Ok(fs::read_dir(self.get_path(path_name)?)? + .filter_map(|e| { + if let Ok(file) = e { + if file.file_type().map(|f| f.is_dir()).ok() == Some(true) { + file.file_name().into_string().ok() + } else { + None + } + } else { + None + } + }) + .collect()) + } + + // TODO: dupe with get_attributes w/ different empty behaviors + fn list_attributes(&self, path_name: &str) -> Result { + let attr_path = self.get_attributes_path(path_name)?; + let file = File::open(attr_path)?; + file.lock_shared()?; + let reader = BufReader::new(file); + Ok(serde_json::from_reader(reader)?) + } +} + +// From: https://github.com/serde-rs/json/issues/377 +// TODO: Could be much better. 
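+// Recursively merges `b` into `a`: when both values are JSON objects their entries are
+// merged key-by-key, otherwise the value from `b` overwrites the value in `a`.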
+fn merge(a: &mut Value, b: &Value) { + match (a, b) { + (&mut Value::Object(ref mut a), &Value::Object(ref b)) => { + for (k, v) in b { + merge(a.entry(k.clone()).or_insert(Value::Null), v); + } + } + (a, b) => { + *a = b.clone(); + } + } +} + +impl N5Writer for N5Filesystem { + fn set_attributes( + &self, + path_name: &str, + attributes: serde_json::Map, + ) -> Result<()> { + let mut file = fs::OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(self.get_attributes_path(path_name)?)?; + file.lock_exclusive()?; + + let mut existing_buf = String::new(); + file.read_to_string(&mut existing_buf)?; + file.seek(SeekFrom::Start(0))?; + let existing = serde_json::from_str(&existing_buf).unwrap_or_else(|_| json!({})); + let mut merged = existing.clone(); + + let new: Value = attributes.into(); + + merge(&mut merged, &new); + + if new != existing { + let writer = BufWriter::new(file); + serde_json::to_writer(writer, &merged)?; + } + + Ok(()) + } + + fn create_group(&self, path_name: &str) -> Result<()> { + let path = self.get_path(path_name)?; + println!("{:?}", path); + fs::create_dir_all(path) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempdir::TempDir; + + #[test] + fn create_filesystem() { + let dir = TempDir::new("rust_n5_tests").unwrap(); + let path_str = dir.path().to_str().unwrap(); + // let path_str = "tmp"; + + let create = N5Filesystem::open_or_create(path_str) + .expect("Failed to create N5 filesystem"); + create.set_attribute("", "foo".to_owned(), "bar") + .expect("Failed to set attribute"); + + let read = N5Filesystem::open(path_str) + .expect("Failed to open N5 filesystem"); + + assert_eq!(read.get_version().expect("Cannot read version"), *::VERSION); + assert_eq!(read.list_attributes("").unwrap()["foo"], "bar"); + } + + #[test] + fn create_dataset() { + let dir = TempDir::new("rust_n5_tests").unwrap(); + let path_str = dir.path().to_str().unwrap(); + // let path_str = "tmp"; + + let create = N5Filesystem::open_or_create(path_str) + .expect("Failed to create N5 filesystem"); + let data_attrs = DatasetAttributes::new( + vec![10, 10, 10], + vec![5, 5, 5], + DataType::INT32, + ::compression::CompressionType::Raw, + ); + create.create_dataset("foo/bar", &data_attrs) + .expect("Failed to create dataset"); + + let read = N5Filesystem::open(path_str) + .expect("Failed to open N5 filesystem"); + + assert_eq!(read.get_dataset_attributes("foo/bar").unwrap(), data_attrs); + } + + #[test] + fn reject_exterior_paths() { + let dir = TempDir::new("rust_n5_tests").unwrap(); + let path_str = dir.path().to_str().unwrap(); + + let create = N5Filesystem::open_or_create(path_str) + .expect("Failed to create N5 filesystem"); + + assert!(create.get_path("/").is_err()); + assert!(create.get_path("..").is_err()); + assert!(create.get_path("foo/bar/baz/../../..").is_ok()); + assert!(create.get_path("foo/bar/baz/../../../..").is_err()); + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..e1f2955 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,457 @@ +extern crate byteorder; +#[cfg(feature = "bzip")] +extern crate bzip2; +#[cfg(feature = "gzip")] +extern crate flate2; +extern crate fs2; +#[macro_use] +extern crate lazy_static; +#[cfg(feature = "lz")] +extern crate lz4; +extern crate serde; +#[macro_use] +extern crate serde_json; +#[macro_use] +extern crate serde_derive; +#[cfg(test)] +extern crate tempdir; +extern crate regex; +#[cfg(feature = "xz")] +extern crate xz2; + + +use std::io::{ + Error, + ErrorKind, +}; + +use byteorder::{BigEndian, 
ReadBytesExt}; +use serde::Serialize; + + +pub mod compression; +pub mod filesystem; + + +lazy_static! { + static ref VERSION: Version = { + Version::new(1, 0, 0, "") + }; +} + +const VERSION_ATTRIBUTE_KEY: &str = "n5"; + + +pub trait N5Reader { + fn get_version(&self) -> Result; + + fn get_dataset_attributes(&self, path_name: &str) -> Result; + + /// Test whether a group or dataset exists. + fn exists(&self, path_name: &str) -> bool; + + /// Test whether a dataset exists. + fn dataset_exists(&self, path_name: &str) -> bool { + self.exists(path_name) && self.get_dataset_attributes(path_name).is_ok() + } + + fn read_block( + &self, + path_name: &str, + data_attrs: &DatasetAttributes, + grid_position: Vec, + ) -> Result>>>, Error> + where DataType: DataBlockCreator>; + + /// List all groups (including datasets) in a group. + fn list(&self, path_name: &str) -> Result, Error>; + + /// List all attributes of a group. + fn list_attributes(&self, path_name: &str) -> Result; +} + +pub trait N5Writer : N5Reader { + /// Set a single attribute. + fn set_attribute( + &self, // TODO: should this be mut for semantics? + path_name: &str, + key: String, + attribute: T, + ) -> Result<(), Error> { + self.set_attributes( + path_name, + vec![(key, serde_json::to_value(attribute)?)].into_iter().collect()) + } + + /// Set a map of attributes. + fn set_attributes( + &self, // TODO: should this be mut for semantics? + path_name: &str, + attributes: serde_json::Map, + ) -> Result<(), Error>; + + /// Set mandatory dataset attributes. + fn set_dataset_attributes( + &self, + path_name: &str, + data_attrs: &DatasetAttributes, + ) -> Result<(), Error> { + if let serde_json::Value::Object(map) = serde_json::to_value(data_attrs)? { + self.set_attributes(path_name, map) + } else { + panic!("Impossible: DatasetAttributes serializes to object") + } + } + + /// Create a group (directory). + fn create_group(&self, path_name: &str) -> Result<(), Error>; + + fn create_dataset( + &self, + path_name: &str, + data_attrs: &DatasetAttributes, + ) -> Result<(), Error> { + self.create_group(path_name)?; + self.set_dataset_attributes(path_name, data_attrs) + } +} + + +#[derive(Serialize, Deserialize, PartialEq, Debug)] +#[serde(rename_all = "lowercase")] +pub enum DataType { + UINT8, + UINT16, + UINT32, + UINT64, + INT8, + INT16, + INT32, + INT64, + FLOAT32, + FLOAT64, +} + +pub trait DataBlockCreator { + fn create_data_block( + &self, + block_size: Vec, + grid_position: Vec, + num_el: usize, + ) -> Option>>; +} + +macro_rules! 
data_type_block_creator { + ($d_name:ident, $d_type:ty) => { + impl DataBlockCreator> for DataType { + fn create_data_block( + &self, + block_size: Vec, + grid_position: Vec, + num_el: usize, + ) -> Option>>> { + match *self { + DataType::$d_name => Some(Box::new(VecDataBlock::<$d_type>::new( + block_size, + grid_position, + // Vec::<$d_type>::with_capacity(num_el), + vec![0 as $d_type; num_el], + ))), + _ => None, + } + } + } + } +} + +data_type_block_creator!(UINT8, u8); +data_type_block_creator!(UINT16, u16); +data_type_block_creator!(UINT32, u32); +data_type_block_creator!(UINT64, u64); +data_type_block_creator!(INT8, i8); +data_type_block_creator!(INT16, i16); +data_type_block_creator!(INT32, i32); +data_type_block_creator!(INT64, i64); +data_type_block_creator!(FLOAT32, f32); +data_type_block_creator!(FLOAT64, f64); + +// impl DataType { +// fn create_data_block( +// &self, +// block_size: Vec, +// grid_position: Vec, +// num_el: usize, +// ) -> Box> { +// match *self { +// DataType::UINT8 => +// } +// } +// } + +#[derive(Serialize, Deserialize, PartialEq, Debug)] +#[serde(rename_all = "camelCase")] +pub struct DatasetAttributes { + dimensions: Vec, + block_size: Vec, + data_type: DataType, + compression: compression::CompressionType, +} + +impl DatasetAttributes { + pub fn new( + dimensions: Vec, + block_size: Vec, + data_type: DataType, + compression: compression::CompressionType, + ) -> DatasetAttributes { + DatasetAttributes { + dimensions, + block_size, + data_type, + compression, + } + } +} + + +pub trait ReadableDataBlock { + /// Unlike Java N5, read the stream directly into the block data instead + /// of creating a copied byte buffer. + fn read_data(&mut self, source: &mut std::io::Read) -> std::io::Result<()>; +} + +pub trait DataBlock : ReadableDataBlock { + fn get_size(&self) -> &Vec; + + fn get_grid_position(&self) -> &Vec; + + fn get_data(&self) -> &T; + + fn get_num_elements(&self) -> usize; +} + +pub struct VecDataBlock { + size: Vec, + grid_position: Vec, + data: Vec, +} + +impl VecDataBlock { + pub fn new(size: Vec, grid_position: Vec, data: Vec) -> VecDataBlock { + VecDataBlock { + size, + grid_position, + data, + } + } +} + +macro_rules! 
vec_data_block_impl { + ($ty_name:ty, $bo_fn:ident) => { + impl ReadableDataBlock for VecDataBlock<$ty_name> { + fn read_data(&mut self, source: &mut std::io::Read) -> std::io::Result<()> { + source.$bo_fn::(&mut self.data) + } + } + } +} + +vec_data_block_impl!(u16, read_u16_into); +vec_data_block_impl!(u32, read_u32_into); +vec_data_block_impl!(u64, read_u64_into); +vec_data_block_impl!(i16, read_i16_into); +vec_data_block_impl!(i32, read_i32_into); +vec_data_block_impl!(i64, read_i64_into); +vec_data_block_impl!(f32, read_f32_into); +vec_data_block_impl!(f64, read_f64_into); + +impl ReadableDataBlock for VecDataBlock { + fn read_data(&mut self, source: &mut std::io::Read) -> std::io::Result<()> { + source.read_exact(&mut self.data) + } +} + +impl ReadableDataBlock for VecDataBlock { + fn read_data(&mut self, source: &mut std::io::Read) -> std::io::Result<()> { + for i in 0..self.data.len() { + self.data[i] = source.read_i8()?; + } + Ok(()) + } +} + +impl DataBlock> for VecDataBlock + where VecDataBlock: ReadableDataBlock { + fn get_size(&self) -> &Vec { + &self.size + } + + fn get_grid_position(&self) -> &Vec { + &self.grid_position + } + + fn get_data(&self) -> &Vec { + &self.data + } + + fn get_num_elements(&self) -> usize { + self.data.len() + } +} + +// pub trait BlockReader, R: std::io::Read> { +// fn read(&mut B, buffer: R) -> std::io::Result<()>; +// } + + +pub trait DefaultBlockReader //: + // BlockReader, VecDataBlock, R> + where DataType: DataBlockCreator> { + fn read_block( + mut buffer: R, + data_attrs: &DatasetAttributes, + grid_position: Vec, + ) -> std::io::Result>>> { + let mode = buffer.read_i16::()?; + let ndim = buffer.read_i16::()?; + let mut dims = vec![0; ndim as usize]; + buffer.read_i32_into::(&mut dims)?; + let num_el = match mode { + 0 => dims.iter().product(), + 1 => buffer.read_i32::()?, + _ => return Err(Error::new(ErrorKind::InvalidData, "Unsupported block mode")) + }; + + let mut block: Box>> = data_attrs.data_type.create_data_block( + dims, + grid_position, + num_el as usize).unwrap(); + let mut decompressed = data_attrs.compression.get_reader().decoder(buffer); + block.read_data(&mut decompressed)?; + + Ok(block) + } +} + +// TODO: needed because cannot invoke type parameterized static trait methods +// directly from trait name in Rust. Symptom of design problems with +// `DefaultBlockReader`, etc. +struct Foo; +impl DefaultBlockReader for Foo + where DataType: DataBlockCreator> {} + + +/// A semantic version. 
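+///
+/// Versions are parsed from strings of the form `major[.minor[.patch]][suffix]`. Only the
+/// major number is used for compatibility: a version is compatible with any other version
+/// whose major number does not exceed its own.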
+/// +/// # Examples +/// +/// ``` +/// # use n5::Version; +/// # use std::str::FromStr; +/// let v = Version::from_str("1.2.3-suffix").unwrap(); +/// +/// assert_eq!(v.get_major(), 1); +/// assert_eq!(v.get_minor(), 2); +/// assert_eq!(v.get_patch(), 3); +/// assert_eq!(v.get_suffix(), "-suffix"); +/// assert_eq!(v.to_string(), "1.2.3-suffix"); +/// +/// assert!(v.is_compatible(&Version::from_str("1.1").unwrap())); +/// assert!(!v.is_compatible(&Version::from_str("2.1").unwrap())); +/// ``` +#[derive(Debug, Eq, PartialEq)] +pub struct Version { + major: i32, + minor: i32, + patch: i32, + suffix: String, +} + +impl Version { + pub fn new(major: i32, minor: i32, patch: i32, suffix: &str) -> Version { + Version { + major, + minor, + patch, + suffix: suffix.to_owned(), + } + } + + pub fn get_major(&self) -> i32 { + self.major + } + + pub fn get_minor(&self) -> i32 { + self.minor + } + + pub fn get_patch(&self) -> i32 { + self.patch + } + + pub fn get_suffix(&self) -> &str { + &self.suffix + } + + pub fn is_compatible(&self, other: &Version) -> bool { + other.get_major() <= self.major + } +} + +impl std::fmt::Display for Version { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}.{}.{}{}", self.major, self.minor, self.patch, self.suffix) + } +} + +impl std::str::FromStr for Version { + type Err = (); + + fn from_str(s: &str) -> Result { + let re = regex::Regex::new(r"(\d+)(\.(\d+))?(\.(\d+))?(.*)").unwrap(); + Ok(match re.captures(s) { + Some(caps) => { + Version { + major: caps.get(1).and_then(|m| m.as_str().parse().ok()).unwrap_or(0), + minor: caps.get(3).and_then(|m| m.as_str().parse().ok()).unwrap_or(0), + patch: caps.get(5).and_then(|m| m.as_str().parse().ok()).unwrap_or(0), + suffix: caps.get(6).map_or("", |m| m.as_str()).to_owned(), + } + } + None => Version { + major: 0, + minor: 0, + patch: 0, + suffix: "".into(), + } + }) + } +} + +#[cfg(test)] +pub(crate) mod tests { + use super::*; + use std::io::Cursor; + + pub(crate) fn test_read_doc_spec_block( + block: &[u8], + compression: compression::CompressionType, + ) { + let buff = Cursor::new(block); + let data_attrs = DatasetAttributes { + dimensions: vec![5, 6, 7], + block_size: vec![1, 2, 3], + data_type: DataType::INT16, + compression: compression, + }; + + let block = >>::read_block( + buff, + &data_attrs, + vec![0, 0, 0]).expect("read_block failed"); + + assert_eq!(block.get_size(), &vec![1, 2, 3]); + assert_eq!(block.get_grid_position(), &vec![0, 0, 0]); + assert_eq!(block.get_data(), &vec![1, 2, 3, 4, 5, 6]); + } +}