From e5052bf81ef80b4ae9251f95fe567f8ef94d5fd0 Mon Sep 17 00:00:00 2001 From: simonsan <14062932+simonsan@users.noreply.github.com> Date: Tue, 5 Nov 2024 14:19:47 +0100 Subject: [PATCH 1/3] refactor(deps): remove cdc and switch to rustic_cdc Signed-off-by: simonsan <14062932+simonsan@users.noreply.github.com> --- Cargo.lock | 7 + crates/core/Cargo.toml | 1 + crates/core/src/archiver/file_archiver.rs | 7 +- crates/core/src/cdc.rs | 2 - crates/core/src/cdc/LICENSE.txt | 21 --- crates/core/src/cdc/README.md | 73 -------- crates/core/src/cdc/polynom.rs | 59 ------ crates/core/src/cdc/rolling_hash.rs | 219 ---------------------- crates/core/src/chunker.rs | 27 ++- crates/core/src/lib.rs | 1 - 10 files changed, 24 insertions(+), 393 deletions(-) delete mode 100644 crates/core/src/cdc.rs delete mode 100644 crates/core/src/cdc/LICENSE.txt delete mode 100644 crates/core/src/cdc/README.md delete mode 100644 crates/core/src/cdc/polynom.rs delete mode 100644 crates/core/src/cdc/rolling_hash.rs diff --git a/Cargo.lock b/Cargo.lock index eff9ffba..4b80142f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3300,6 +3300,12 @@ dependencies = [ "walkdir", ] +[[package]] +name = "rustic_cdc" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e0c1a26e7525064e1b95af1518b6bd4ffa73d084d9e98f9009d16d81c57693" + [[package]] name = "rustic_config" version = "0.2.2" @@ -3356,6 +3362,7 @@ dependencies = [ "runtime-format", "rustdoc-json", "rustic_backend", + "rustic_cdc", "rustic_testing", "rustup-toolchain", "scrypt", diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index e83c7330..b0da33ef 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -66,6 +66,7 @@ scrypt = { version = "0.11.0", default-features = false } binrw = "0.14.0" hex = { version = "0.4.3", features = ["serde"] } integer-sqrt = "0.1.5" +rustic_cdc = "0.3.0" serde = { version = "1.0.210" } serde-aux = "4.5.0" serde_derive = "1.0.210" diff --git a/crates/core/src/archiver/file_archiver.rs b/crates/core/src/archiver/file_archiver.rs index 74920dda..015bb619 100644 --- a/crates/core/src/archiver/file_archiver.rs +++ b/crates/core/src/archiver/file_archiver.rs @@ -1,5 +1,7 @@ use std::io::Read; +use rustic_cdc::Rabin64; + use crate::{ archiver::{ parent::{ItemWithParent, ParentResult}, @@ -15,7 +17,6 @@ use crate::{ packer::{Packer, PackerStats}, BlobId, BlobType, DataId, }, - cdc::rolling_hash::Rabin64, chunker::ChunkIter, crypto::hasher::hash, error::{ArchiverErrorKind, RusticResult}, @@ -75,7 +76,9 @@ impl<'a, BE: DecryptWriteBackend, I: ReadGlobalIndex> FileArchiver<'a, BE, I> { config, index.total_size(BlobType::Data), )?; - let rabin = Rabin64::new_with_polynom(6, poly); + + let rabin = Rabin64::new_with_polynom(6, &poly); + Ok(Self { index, data_packer, diff --git a/crates/core/src/cdc.rs b/crates/core/src/cdc.rs deleted file mode 100644 index 5cbbd0b0..00000000 --- a/crates/core/src/cdc.rs +++ /dev/null @@ -1,2 +0,0 @@ -pub(crate) mod polynom; -pub(crate) mod rolling_hash; diff --git a/crates/core/src/cdc/LICENSE.txt b/crates/core/src/cdc/LICENSE.txt deleted file mode 100644 index 25d14fae..00000000 --- a/crates/core/src/cdc/LICENSE.txt +++ /dev/null @@ -1,21 +0,0 @@ -The MIT License (MIT) - -Copyright (c) 2016 Vincent Cantin (https://github.com/green-coder) - -Permission is hereby granted, free of charge, to any person obtaining a copy of -this software and associated documentation files (the "Software"), to deal in -the Software without restriction, including without limitation the rights to -use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -of the Software, and to permit persons to whom the Software is furnished to do -so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/crates/core/src/cdc/README.md b/crates/core/src/cdc/README.md deleted file mode 100644 index 3a9a406a..00000000 --- a/crates/core/src/cdc/README.md +++ /dev/null @@ -1,73 +0,0 @@ -# cdc - -A library for performing *Content-Defined Chunking* (CDC) on data streams. -Implemented using generic iterators, very easy to use. - -- [API Documentation](https://docs.rs/cdc/) - -## Example - -```rust - let reader: BufReader = BufReader::new(file); - let byte_iter = reader.bytes().map(|b| b.unwrap()); - - // Finds and iterates on the separators. - for separator in SeparatorIter::new(byte_iter) { - println!("Index: {}, hash: {:016x}", separator.index, separator.hash); - } -``` - -Each module is documented via an example which you can find in the `config/` -folder. - -To run them, use a command like: - -`cargo run --example separator --release` - -**Note:** Some examples are looking for a file named `myLargeFile.bin` which I -didn't upload to Github. Please use your own files for testing. - -## What's in the crate - -From low level to high level: - -- A `RollingHash64` trait, for rolling hash with a 64 bits hash value. - -- `Rabin64`, an implementation of the Rabin Fingerprint rolling hash with a 64 - bits hash value. - -- `Separator`, a struct which describes a place in a data stream identified as a - separator. - -- `SeparatorIter`, an adaptor which takes an `Iterator` as input and - which enumerates all the separators found. - -- `Chunk`, a struct which describes a piece of the data stream (index and size). - -- `ChunkIter`, an adaptor which takes an `Iterator` as input and - which enumerates chunks. - -## Implementation details - -- The library is not cutting any files, it only provides information on how to - do it. - -- You can change the default window size used by `Rabin64`, and how the - `SeparatorIter` is choosing the separator. - -- The design of this crate may be subject to changes sometime in the future. I - am waiting for some features of `Rust` to mature up, specially the - [`impl Trait`](https://github.com/rust-lang/rust/issues/34511) feature. - -## Performance - -There is a **huge** difference between the debug build and the release build in -terms of performance. Remember that when you test the lib, use -`cargo run --release`. - -I may try to improve the performance of the lib at some point, but for now it is -good enough for most usages. - -## License - -Coded with ❤️ , licensed under the terms of the [MIT license](LICENSE.txt). diff --git a/crates/core/src/cdc/polynom.rs b/crates/core/src/cdc/polynom.rs deleted file mode 100644 index 61cbd557..00000000 --- a/crates/core/src/cdc/polynom.rs +++ /dev/null @@ -1,59 +0,0 @@ -/// The irreductible polynom to be used in the fingerprint function. -pub(crate) trait Polynom { - /// The degree of the polynom. - fn degree(&self) -> i32; - - /// Returns the modulo of the polynom. - fn modulo(self, m: Self) -> Self; -} - -/// A 64 bit polynom. -pub(crate) type Polynom64 = u64; - -impl Polynom for Polynom64 { - /// The degree of the polynom. - // `self` is u64, so `self.leading_zeroes() <= 64` which - // fits perfectly into a `i32`. (@aawsome) - #[allow(clippy::cast_possible_wrap)] - fn degree(&self) -> i32 { - 63 - self.leading_zeros() as i32 - } - - /// Returns the modulo of the polynom. - fn modulo(self, m: Self) -> Self { - let mut p = self; - while p.degree() >= m.degree() { - p ^= m << (p.degree() - m.degree()); - } - - p - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn polynom_degree() { - assert_eq!(0u64.degree(), -1); - assert_eq!(1u64.degree(), 0); - - assert_eq!(((1u64 << 7) - 1).degree(), 6); - assert_eq!((1u64 << 7).degree(), 7); - assert_eq!(((1u64 << 7) + 1).degree(), 7); - } - - #[test] - fn polynom_modulo() { - assert_eq!(7u64.modulo(3), 1); - assert_eq!(7u64.modulo(4), 3); - assert_eq!(7u64.modulo(2), 1); - - assert_eq!(16u64.modulo(8), 0); - assert_eq!(19u64.modulo(8), 3); - - assert_eq!(16u64.modulo(4), 0); - assert_eq!(19u64.modulo(4), 3); - } -} diff --git a/crates/core/src/cdc/rolling_hash.rs b/crates/core/src/cdc/rolling_hash.rs deleted file mode 100644 index 08ec97ee..00000000 --- a/crates/core/src/cdc/rolling_hash.rs +++ /dev/null @@ -1,219 +0,0 @@ -use crate::cdc::polynom::{Polynom, Polynom64}; - -/// A rolling hash implementation for 64 bit polynoms. -pub(crate) trait RollingHash64 { - /// Resets the rolling hash. - fn reset(&mut self); - - /// Attempt to prefill the window - /// - /// # Arguments - /// - /// * `iter` - The iterator to read from. - fn prefill_window(&mut self, iter: &mut I) -> usize - where - I: Iterator; - - /// Combines a reset with a prefill in an optimized way. - /// - /// # Arguments - /// - /// * `iter` - The iterator to read from. - fn reset_and_prefill_window(&mut self, iter: &mut I) -> usize - where - I: Iterator; - - /// Slides the window by byte. - /// - /// # Arguments - /// - /// * `byte` - The byte to slide in. - fn slide(&mut self, byte: u8); - - /// Returns the current hash as a `Polynom64`. - fn get_hash(&self) -> &Polynom64; -} - -/// A rolling hash implementation for 64 bit polynoms from Rabin. -#[derive(Clone)] -pub(crate) struct Rabin64 { - // Configuration - /// Window size. - pub(crate) window_size: usize, // The size of the data window used in the hash calculation. - /// Window size mask. - pub(crate) window_size_mask: usize, // = window_size - 1, supposing that it is an exponent of 2. - - // Precalculations - /// The number of bits to shift the polynom to the left. - pub(crate) polynom_shift: i32, - - /// Precalculated out table. - pub(crate) out_table: [Polynom64; 256], - /// Precalculated mod table. - pub(crate) mod_table: [Polynom64; 256], - - // Current state - /// The data window. - pub(crate) window_data: Vec, - /// The current window index. - pub(crate) window_index: usize, - /// The current hash. - pub(crate) hash: Polynom64, -} - -impl Rabin64 { - /// Calculates the out table - /// - /// # Arguments - /// - /// * `window_size` - The window size. - /// * `mod_polynom` - The modulo polynom. - /// - /// # Returns - /// - /// An array of 256 `Polynom64` values. - fn calculate_out_table(window_size: usize, mod_polynom: Polynom64) -> [Polynom64; 256] { - let mut out_table = [0; 256]; - for (b, elem) in out_table.iter_mut().enumerate() { - let mut hash = (b as Polynom64).modulo(mod_polynom); - for _ in 0..window_size - 1 { - hash <<= 8; - hash = hash.modulo(mod_polynom); - } - *elem = hash; - } - - out_table - } - - /// Calculates the mod table - /// - /// # Arguments - /// - /// * `mod_polynom` - The modulo polynom. - /// - /// # Returns - /// - /// An array of 256 `Polynom64` values. - fn calculate_mod_table(mod_polynom: Polynom64) -> [Polynom64; 256] { - let mut mod_table = [0; 256]; - let k = mod_polynom.degree(); - for (b, elem) in mod_table.iter_mut().enumerate() { - let p: Polynom64 = (b as Polynom64) << k; - *elem = p.modulo(mod_polynom) | p; - } - - mod_table - } - - /// Creates a new `Rabin64` with the given window size and modulo polynom. - /// - /// # Arguments - /// - /// * `window_size_nb_bits` - The number of bits of the window size. - /// * `mod_polynom` - The modulo polynom. - pub(crate) fn new_with_polynom(window_size_nb_bits: u32, mod_polynom: Polynom64) -> Self { - let window_size = 1 << window_size_nb_bits; - - let window_data = vec![0; window_size]; - - Self { - window_size, - window_size_mask: window_size - 1, - polynom_shift: mod_polynom.degree() - 8, - out_table: Self::calculate_out_table(window_size, mod_polynom), - mod_table: Self::calculate_mod_table(mod_polynom), - window_data, - window_index: 0, - hash: 0, - } - } -} - -impl RollingHash64 for Rabin64 { - fn reset(&mut self) { - self.window_data.clear(); - self.window_data.resize(self.window_size, 0); - self.window_index = 0; - self.hash = 0; - - // Not needed. - // self.slide(1); - } - - // Attempt to fills the window - 1 byte. - fn prefill_window(&mut self, iter: &mut I) -> usize - where - I: Iterator, - { - let mut nb_bytes_read = 0; - for _ in 0..self.window_size - 1 { - match iter.next() { - Some(b) => { - self.slide(b); - nb_bytes_read += 1; - } - None => break, - } - } - - nb_bytes_read - } - - // Combines a reset with a prefill in an optimized way. - fn reset_and_prefill_window(&mut self, iter: &mut I) -> usize - where - I: Iterator, - { - self.hash = 0; - let mut nb_bytes_read = 0; - for _ in 0..self.window_size - 1 { - match iter.next() { - Some(b) => { - // Take the old value out of the window and the hash. - // ... let's suppose that the buffer contains zeroes, do nothing. - - // Put the new value in the window and in the hash. - self.window_data[self.window_index] = b; - let mod_index = (self.hash >> self.polynom_shift) & 255; - self.hash <<= 8; - self.hash |= u64::from(b); - self.hash ^= self.mod_table[mod_index as usize]; - - // Move the windowIndex to the next position. - self.window_index = (self.window_index + 1) & self.window_size_mask; - - nb_bytes_read += 1; - } - None => break, - } - } - - // Because we didn't overwrite that element in the loop above. - self.window_data[self.window_index] = 0; - - nb_bytes_read - } - - #[inline] - fn slide(&mut self, byte: u8) { - // Take the old value out of the window and the hash. - let out_value = self.window_data[self.window_index]; - self.hash ^= self.out_table[out_value as usize]; - - // Put the new value in the window and in the hash. - self.window_data[self.window_index] = byte; - let mod_index = (self.hash >> self.polynom_shift) & 255; - self.hash <<= 8; - self.hash |= u64::from(byte); - self.hash ^= self.mod_table[mod_index as usize]; - - // Move the windowIndex to the next position. - self.window_index = (self.window_index + 1) & self.window_size_mask; - } - - #[inline] - fn get_hash(&self) -> &Polynom64 { - &self.hash - } -} diff --git a/crates/core/src/chunker.rs b/crates/core/src/chunker.rs index fb6a300b..d90f1c93 100644 --- a/crates/core/src/chunker.rs +++ b/crates/core/src/chunker.rs @@ -1,14 +1,9 @@ use std::io::{self, Read}; use rand::{thread_rng, Rng}; +use rustic_cdc::{Polynom, Polynom64, Rabin64, RollingHash64}; -use crate::{ - cdc::{ - polynom::{Polynom, Polynom64}, - rolling_hash::{Rabin64, RollingHash64}, - }, - error::{PolynomialErrorKind, RusticResult}, -}; +use crate::error::{PolynomialErrorKind, RusticResult}; pub(super) mod constants { /// The Splitmask is used to determine if a chunk is a chunk boundary. @@ -236,9 +231,9 @@ impl PolynomExtend for Polynom64 { } if self.degree() < other.degree() { - self.gcd(other.modulo(self)) + self.gcd(other.modulo(&self)) } else { - other.gcd(self.modulo(other)) + other.gcd(self.modulo(&other)) } } @@ -256,14 +251,14 @@ impl PolynomExtend for Polynom64 { let mut b = other; if b & 1 > 0 { - res = res.add(a).modulo(modulo); + res = res.add(a).modulo(&modulo); } while b != 0 { - a = (a << 1).modulo(modulo); + a = (a << 1).modulo(&modulo); b >>= 1; if b & 1 > 0 { - res = res.add(a).modulo(modulo); + res = res.add(a).modulo(&modulo); } } @@ -283,7 +278,7 @@ fn qp(p: i32, g: Polynom64) -> Polynom64 { } // add x - res.add(2).modulo(g) + res.add(2).modulo(&g) } #[cfg(test)] @@ -297,7 +292,7 @@ mod tests { let mut reader = Cursor::new(empty); let poly = random_poly().unwrap(); - let rabin = Rabin64::new_with_polynom(6, poly); + let rabin = Rabin64::new_with_polynom(6, &poly); let chunker = ChunkIter::new(&mut reader, 0, rabin); assert_eq!(0, chunker.into_iter().count()); @@ -309,7 +304,7 @@ mod tests { let mut reader = Cursor::new(empty); let poly = random_poly().unwrap(); - let rabin = Rabin64::new_with_polynom(6, poly); + let rabin = Rabin64::new_with_polynom(6, &poly); let chunker = ChunkIter::new(&mut reader, 100, rabin); assert_eq!(0, chunker.into_iter().count()); @@ -320,7 +315,7 @@ mod tests { let mut reader = repeat(0u8); let poly = random_poly().unwrap(); - let rabin = Rabin64::new_with_polynom(6, poly); + let rabin = Rabin64::new_with_polynom(6, &poly); let mut chunker = ChunkIter::new(&mut reader, usize::MAX, rabin); let chunk = chunker.next().unwrap().unwrap(); diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index ea763bdb..3b34b1cf 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -105,7 +105,6 @@ This crate exposes a few features for controlling dependency usage. pub(crate) mod archiver; pub(crate) mod backend; pub(crate) mod blob; -pub(crate) mod cdc; pub(crate) mod chunker; pub(crate) mod commands; pub(crate) mod crypto; From 0fbaa99d100b16faf0851b1ff40c334bb7c6806f Mon Sep 17 00:00:00 2001 From: simonsan <14062932+simonsan@users.noreply.github.com> Date: Tue, 5 Nov 2024 14:24:59 +0100 Subject: [PATCH 2/3] update cdc Signed-off-by: simonsan <14062932+simonsan@users.noreply.github.com> --- crates/core/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index b0da33ef..64c7ed9f 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -66,7 +66,7 @@ scrypt = { version = "0.11.0", default-features = false } binrw = "0.14.0" hex = { version = "0.4.3", features = ["serde"] } integer-sqrt = "0.1.5" -rustic_cdc = "0.3.0" +rustic_cdc = "0.3.1" serde = { version = "1.0.210" } serde-aux = "4.5.0" serde_derive = "1.0.210" From 4c459fc3b5fc244a405589c785d33861660a627c Mon Sep 17 00:00:00 2001 From: simonsan <14062932+simonsan@users.noreply.github.com> Date: Tue, 5 Nov 2024 14:26:41 +0100 Subject: [PATCH 3/3] update cdc Signed-off-by: simonsan <14062932+simonsan@users.noreply.github.com> --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4b80142f..9244977a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3302,9 +3302,9 @@ dependencies = [ [[package]] name = "rustic_cdc" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32e0c1a26e7525064e1b95af1518b6bd4ffa73d084d9e98f9009d16d81c57693" +checksum = "fbcebf2228827bc4b61cb54dfd84cf43aacf06ca2dfe4c014b136a0e32b876e2" [[package]] name = "rustic_config"