From bd9585d354fbd267bbfd9bd627ed51468b0669ab Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Mon, 25 Nov 2024 09:59:57 -0800 Subject: [PATCH 01/56] scan file implementation for cdf --- kernel/src/scan/state.rs | 4 +- kernel/src/table_changes/log_replay.rs | 1 + kernel/src/table_changes/mod.rs | 1 + kernel/src/table_changes/scan_file.rs | 403 +++++++++++++++++++++++++ 4 files changed, 407 insertions(+), 2 deletions(-) create mode 100644 kernel/src/table_changes/scan_file.rs diff --git a/kernel/src/scan/state.rs b/kernel/src/scan/state.rs index 19534d0cd..cc55103b8 100644 --- a/kernel/src/scan/state.rs +++ b/kernel/src/scan/state.rs @@ -112,11 +112,11 @@ pub type ScanCallback = fn( /// ## Example /// ```ignore /// let mut context = [my context]; -/// for res in scan_data { // scan data from scan.get_scan_data() +/// for res in scan_data { // scan data from scan.scan_data() /// let (data, vector) = res?; /// context = delta_kernel::scan::state::visit_scan_files( /// data.as_ref(), -/// vector, +/// selection_vector, /// context, /// my_callback, /// )?; diff --git a/kernel/src/table_changes/log_replay.rs b/kernel/src/table_changes/log_replay.rs index 993a89912..76d324088 100644 --- a/kernel/src/table_changes/log_replay.rs +++ b/kernel/src/table_changes/log_replay.rs @@ -21,6 +21,7 @@ use crate::table_changes::{check_cdf_table_properties, ensure_cdf_read_supported use crate::table_properties::TableProperties; use crate::utils::require; use crate::{DeltaResult, Engine, EngineData, Error, ExpressionRef, RowVisitor}; + use itertools::Itertools; #[cfg(test)] diff --git a/kernel/src/table_changes/mod.rs b/kernel/src/table_changes/mod.rs index e0ad823f1..766866d25 100644 --- a/kernel/src/table_changes/mod.rs +++ b/kernel/src/table_changes/mod.rs @@ -17,6 +17,7 @@ use crate::{DeltaResult, Engine, Error, Version}; mod log_replay; pub mod scan; +mod scan_file; static CDF_FIELDS: LazyLock<[StructField; 3]> = LazyLock::new(|| { [ diff --git 
a/kernel/src/table_changes/scan_file.rs b/kernel/src/table_changes/scan_file.rs new file mode 100644 index 000000000..18fad9368 --- /dev/null +++ b/kernel/src/table_changes/scan_file.rs @@ -0,0 +1,403 @@ +//! This module handles [`CDFScanFile`]s for [`TableChangeScan`]. A [`CDFScanFile`] consists of all the +//! metadata required to generate a change data feed. [`CDFScanFile`] can be constructed using +//! [`CDFScanFileVisitor`]. The visitor reads from engine data with the schema [`cdf_scan_row_schema`]. +//! You can convert engine data to this schema using the [`get_cdf_scan_row_expression`]. +use itertools::Itertools; +use std::collections::HashMap; +use std::sync::{Arc, LazyLock}; + +use super::log_replay::TableChangesScanData; +use crate::actions::visitors::visit_deletion_vector_at; +use crate::engine_data::{GetData, TypedGetData}; +use crate::expressions::{column_expr, Expression}; +use crate::scan::state::DvInfo; +use crate::schema::{ + ColumnName, ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType, +}; +use crate::utils::require; +use crate::{DeltaResult, EngineData, Error, RowVisitor}; + +#[allow(unused)] +pub(crate) struct UnresolvedCDFScanFile { + pub scan_file: CDFScanFile, + pub remove_dvs: Arc>, +} +#[allow(unused)] +#[derive(Debug)] +pub(crate) struct ResolvedCDFScanFile { + pub scan_file: CDFScanFile, + pub selection_vector: Option>, +} + +// The type of action associated with a [`CDFScanFile`]. +#[allow(unused)] +#[derive(Debug, Clone, PartialEq)] +pub(crate) enum CDFScanFileType { + Add, + Remove, + Cdc, +} + +/// Represents all the metadata needed to read a Change Data Feed. +#[allow(unused)] +#[derive(Debug, PartialEq, Clone)] +pub(crate) struct CDFScanFile { + /// The type of action this file belongs to. This may be one of add, remove, or cdc. 
+ pub scan_type: CDFScanFileType, + /// a `&str` which is the path to the file + pub path: String, + /// a [`DvInfo`] struct, which allows getting the selection vector for this file + pub dv_info: DvInfo, + /// a `HashMap` which are partition values + pub partition_values: HashMap, + /// the commit version that this action was performed in + pub commit_version: i64, + /// the timestamp of the commit that this action was performed in + pub commit_timestamp: i64, +} + +pub(crate) type CDFScanCallback = fn(context: &mut T, scan_file: CDFScanFile); + +/// Transforms an iterator of TableChangesScanData into an iterator of +/// `UnresolvedCDFScanFile` by visiting the engine data. +#[allow(unused)] +pub(crate) fn scan_data_to_scan_file( + scan_data: impl Iterator>, +) -> impl Iterator> { + scan_data + .map(|scan_data| -> DeltaResult<_> { + let scan_data = scan_data?; + let callback: CDFScanCallback> = + |context, scan_file| context.push(scan_file); + let result = visit_cdf_scan_files( + scan_data.scan_data.as_ref(), + &scan_data.selection_vector, + vec![], + callback, + )? + .into_iter() + .map(move |scan_file| UnresolvedCDFScanFile { + scan_file, + remove_dvs: scan_data.remove_dvs.clone(), + }); + Ok(result) + }) // Iterator-Result-Iterator + .flatten_ok() // Iterator-Result +} + +/// Request that the kernel call a callback on each valid file that needs to be read for the +/// scan. +/// +/// The arguments to the callback are: +/// * `context`: an `&mut context` argument. this can be anything that engine needs to pass through to each call +/// * `CDFScanFile`: a [`CDFScanFile`] struct that holds all the metadata required to perform Change Data +/// Feed +/// +/// ## Context +/// A note on the `context`. This can be any value the engine wants. This function takes ownership +/// of the passed arg, but then returns it, so the engine can repeatedly call `visit_cdf_scan_files` +/// with the same context. 
+/// +/// ## Example +/// ```ignore +/// let mut context = [my context]; +/// for res in scan_data { // scan data table_changes_scan.scan_data() +/// let (data, vector, remove_dv) = res?; +/// context = delta_kernel::table_changes::scan_file::visit_cdf_scan_files( +/// data.as_ref(), +/// selection_vector, +/// context, +/// my_callback, +/// )?; +/// } +/// ``` +#[allow(unused)] +pub(crate) fn visit_cdf_scan_files( + data: &dyn EngineData, + selection_vector: &[bool], + context: T, + callback: CDFScanCallback, +) -> DeltaResult { + let mut visitor = CDFScanFileVisitor { + callback, + selection_vector, + context, + }; + + visitor.visit_rows_of(data)?; + Ok(visitor.context) +} + +// add some visitor magic for engines +#[allow(unused)] +struct CDFScanFileVisitor<'a, T> { + callback: CDFScanCallback, + selection_vector: &'a [bool], + context: T, +} + +impl RowVisitor for CDFScanFileVisitor<'_, T> { + fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { + require!( + getters.len() == 18, + Error::InternalError(format!( + "Wrong number of CDFScanFileVisitor getters: {}", + getters.len() + )) + ); + for row_index in 0..row_count { + if !self.selection_vector[row_index] { + continue; + } + + let (scan_type, path, deletion_vector, partition_values) = + if let Some(path) = getters[0].get_opt(row_index, "scanFile.add.path")? { + let scan_type = CDFScanFileType::Add; + let deletion_vector = visit_deletion_vector_at(row_index, &getters[1..=5])?; + let partition_values = getters[6] + .get(row_index, "scanFile.add.fileConstantValues.partitionValues")?; + (scan_type, path, deletion_vector, partition_values) + } else if let Some(path) = getters[7].get_opt(row_index, "scanFile.remove.path")? 
{ + let scan_type = CDFScanFileType::Remove; + let deletion_vector = visit_deletion_vector_at(row_index, &getters[8..=12])?; + let partition_values = getters[13].get( + row_index, + "scanFile.remove.fileConstantValues.partitionValues", + )?; + (scan_type, path, deletion_vector, partition_values) + } else if let Some(path) = getters[14].get_opt(row_index, "scanFile.cdc.path")? { + let scan_type = CDFScanFileType::Cdc; + let partition_values = getters[15] + .get(row_index, "scanFile.cdc.fileConstantValues.partitionValues")?; + (scan_type, path, None, partition_values) + } else { + continue; + }; + let dv_info = DvInfo { deletion_vector }; + let scan_file = CDFScanFile { + scan_type, + path, + dv_info, + partition_values, + commit_timestamp: getters[16].get(row_index, "scanFile.timestamp")?, + commit_version: getters[17].get(row_index, "scanFile.commit_version")?, + }; + (self.callback)(&mut self.context, scan_file) + } + Ok(()) + } + + fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { + static NAMES_AND_TYPES: LazyLock = + LazyLock::new(|| cdf_scan_row_schema().leaves(None)); + NAMES_AND_TYPES.as_ref() + } +} + +/// Get the schema that scan rows (from [`TableChanges::scan_data`]) will be returned with. 
+pub(crate) fn cdf_scan_row_schema() -> SchemaRef { + static CDF_SCAN_ROW_SCHEMA: LazyLock> = LazyLock::new(|| { + let deletion_vector = StructType::new([ + StructField::new("storageType", DataType::STRING, true), + StructField::new("pathOrInlineDv", DataType::STRING, true), + StructField::new("offset", DataType::INTEGER, true), + StructField::new("sizeInBytes", DataType::INTEGER, true), + StructField::new("cardinality", DataType::LONG, true), + ]); + let partition_values = MapType::new(DataType::STRING, DataType::STRING, true); + let file_constant_values = + StructType::new([StructField::new("partitionValues", partition_values, true)]); + + let add = StructType::new([ + StructField::new("path", DataType::STRING, true), + StructField::new("deletionVector", deletion_vector.clone(), true), + StructField::new("fileConstantValues", file_constant_values.clone(), true), + ]); + let remove = StructType::new([ + StructField::new("path", DataType::STRING, true), + StructField::new("deletionVector", deletion_vector, true), + StructField::new("fileConstantValues", file_constant_values.clone(), true), + ]); + let cdc = StructType::new([ + StructField::new("path", DataType::STRING, true), + StructField::new("fileConstantValues", file_constant_values, true), + ]); + + Arc::new(StructType::new([ + StructField::new("add", add, true), + StructField::new("remove", remove, true), + StructField::new("cdc", cdc, true), + StructField::new("timestamp", DataType::LONG, true), + StructField::new("commit_version", DataType::LONG, true), + ])) + }); + CDF_SCAN_ROW_SCHEMA.clone() +} + +/// Expression to convert an action with `log_schema` into one with +/// `TABLE_CHANGES_cdf_scan_row_schema`. This is the expression used to create `TableChangesScanData`. 
+#[allow(unused)] +pub(crate) fn get_cdf_scan_row_expression(commit_timestamp: i64, commit_number: i64) -> Expression { + Expression::struct_from([ + Expression::struct_from([ + column_expr!("add.path"), + column_expr!("add.deletionVector"), + Expression::struct_from([column_expr!("add.partitionValues")]), + ]), + Expression::struct_from([ + column_expr!("remove.path"), + column_expr!("remove.deletionVector"), + Expression::struct_from([column_expr!("remove.partitionValues")]), + ]), + Expression::struct_from([ + column_expr!("cdc.path"), + Expression::struct_from([column_expr!("cdc.partitionValues")]), + ]), + commit_timestamp.into(), + commit_number.into(), + ]) +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use itertools::Itertools; + + use super::CDFScanFileType; + use super::{ + cdf_scan_row_schema, get_cdf_scan_row_expression, visit_cdf_scan_files, CDFScanCallback, + CDFScanFile, + }; + use crate::actions::deletion_vector::DeletionVectorDescriptor; + use crate::actions::{get_log_schema, Add, Cdc, Remove}; + use crate::engine::sync::SyncEngine; + use crate::log_segment::LogSegment; + use crate::scan::state::DvInfo; + use crate::utils::test_utils::{Action, LocalMockTable}; + use crate::{DeltaResult, Engine}; + + #[tokio::test] + async fn schema_transform_correct() { + let engine = SyncEngine::new(); + let mut mock_table = LocalMockTable::new(); + + let add_dv = DeletionVectorDescriptor { + storage_type: "u".to_string(), + path_or_inline_dv: "vBn[lx{q8@P<9BNH/isA".to_string(), + offset: Some(1), + size_in_bytes: 36, + cardinality: 2, + }; + let add_partition_values = HashMap::from([("a".to_string(), "b".to_string())]); + let add = Add { + path: "fake_path_1".into(), + deletion_vector: Some(add_dv.clone()), + partition_values: add_partition_values, + ..Default::default() + }; + + let rm_dv = DeletionVectorDescriptor { + storage_type: "u".to_string(), + path_or_inline_dv: "U5OWRz5k%CFT.Td}yCPW".to_string(), + offset: Some(1), + size_in_bytes: 
38, + cardinality: 3, + }; + let rm_partition_values = Some(HashMap::from([("c".to_string(), "d".to_string())])); + let remove = Remove { + path: "fake_path_2".into(), + deletion_vector: Some(rm_dv), + partition_values: rm_partition_values, + ..Default::default() + }; + + let cdc_partition_values = HashMap::from([("x".to_string(), "y".to_string())]); + let cdc = Cdc { + path: "fake_path_3".into(), + partition_values: cdc_partition_values, + ..Default::default() + }; + + mock_table + .commit([ + Action::Add(add.clone()), + Action::Remove(remove.clone()), + Action::Cdc(cdc.clone()), + ]) + .await; + + let table_root = url::Url::from_directory_path(mock_table.table_root()).unwrap(); + let log_root = table_root.join("_delta_log/").unwrap(); + let log_segment = + LogSegment::for_table_changes(engine.get_file_system_client().as_ref(), log_root, 0, 0) + .unwrap(); + let commit = log_segment.ascending_commit_files[0].clone(); + + let actions = engine + .get_json_handler() + .read_json_files(&[commit.location.clone()], get_log_schema().clone(), None) + .unwrap(); + + // Transform the engine data into the [`cdf_scan_row_schema`] and insert + // the following timestamp and commit version. 
+ let commit_timestamp = 1234_i64; + let commit_version = 42_i64; + let scan_files: Vec<_> = actions + .map_ok(|actions| { + engine + .get_expression_handler() + .get_evaluator( + get_log_schema().clone(), + get_cdf_scan_row_expression(commit_timestamp, commit_version), + cdf_scan_row_schema().into(), + ) + .evaluate(actions.as_ref()) + .unwrap() + }) + .map(|data| -> DeltaResult<_> { + let data = data?; + let selection_vector = vec![true; data.len()]; + let callback: CDFScanCallback> = + |context, scan_file| context.push(scan_file); + visit_cdf_scan_files(data.as_ref(), &selection_vector, vec![], callback) + }) + .flatten_ok() + .try_collect() + .unwrap(); + + let expected_scan_files = vec![ + CDFScanFile { + scan_type: CDFScanFileType::Add, + path: add.path, + dv_info: DvInfo { + deletion_vector: add.deletion_vector, + }, + partition_values: add.partition_values, + commit_version, + commit_timestamp, + }, + CDFScanFile { + scan_type: CDFScanFileType::Remove, + path: remove.path, + dv_info: DvInfo { + deletion_vector: remove.deletion_vector, + }, + partition_values: remove.partition_values.unwrap(), + commit_version, + commit_timestamp, + }, + CDFScanFile { + scan_type: CDFScanFileType::Cdc, + path: cdc.path, + dv_info: DvInfo { + deletion_vector: None, + }, + partition_values: cdc.partition_values, + commit_version, + commit_timestamp, + }, + ]; + assert_eq!(expected_scan_files, scan_files); + } +} From e2c2d9052fbdf424d86cd473972a1b43812ffde0 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Fri, 6 Dec 2024 14:44:11 -0800 Subject: [PATCH 02/56] change naming and update documentation for cdf scan files --- kernel/src/table_changes/scan_file.rs | 93 ++++++++++++++++----------- 1 file changed, 54 insertions(+), 39 deletions(-) diff --git a/kernel/src/table_changes/scan_file.rs b/kernel/src/table_changes/scan_file.rs index 18fad9368..876a9c2f4 100644 --- a/kernel/src/table_changes/scan_file.rs +++ b/kernel/src/table_changes/scan_file.rs @@ -1,6 +1,6 @@ -//! 
This module handles [`CDFScanFile`]s for [`TableChangeScan`]. A [`CDFScanFile`] consists of all the -//! metadata required to generate a change data feed. [`CDFScanFile`] can be constructed using -//! [`CDFScanFileVisitor`]. The visitor reads from engine data with the schema [`cdf_scan_row_schema`]. +//! This module handles [`CdfScanFile`]s for [`TableChangeScan`]. A [`CdfScanFile`] consists of all the +//! metadata required to generate a change data feed. [`CdfScanFile`] can be constructed using +//! [`CdfScanFileVisitor`]. The visitor reads from engine data with the schema [`cdf_scan_row_schema`]. //! You can convert engine data to this schema using the [`get_cdf_scan_row_expression`]. use itertools::Itertools; use std::collections::HashMap; @@ -17,22 +17,37 @@ use crate::schema::{ use crate::utils::require; use crate::{DeltaResult, EngineData, Error, RowVisitor}; +/// A struct holding a [`CdfScanFile`] and map holding remove action paths to their deletion +/// vectors. The [`CdfScanFile`] has a type [`CdfScanFileType`] that represents the action it was +/// read from. Normally, a `scan_file` with type [`CdfScanFileType::Add`] produces row insertions. +/// However the type may not be accurate due to deletion vector resolution. Add/Remove pairs are +/// represented by a single `scan_file` with type add, and a corresponding deletion vector from the +/// map. After resolving deletion vectors, the `scan_file` might only result in removed rows, only +/// added rows, or both added and removed rows. +/// +/// An [`UnresolvedCdfScanFile`] can be converted into [`ResolvedCdfScanFile`] by reading the +/// deletion vectors, generating the correct selection vectors, and patching the [`CdfScanFileType`]. +/// This is done in `resolve_scan_file_dvs`. 
#[allow(unused)] -pub(crate) struct UnresolvedCDFScanFile { - pub scan_file: CDFScanFile, +pub(crate) struct UnresolvedCdfScanFile { + pub scan_file: CdfScanFile, pub remove_dvs: Arc>, } + +/// A struct holding a [`CdfScanFile`] and its selection vector. The [`CdfScanFile`] has a type +/// [`CdfScanFileType`] that represents the `_change_type` that its rows will have in the change +/// data feed. See [`UnresolvedCdfScanFile`] for more details. #[allow(unused)] #[derive(Debug)] -pub(crate) struct ResolvedCDFScanFile { - pub scan_file: CDFScanFile, +pub(crate) struct ResolvedCdfScanFile { + pub scan_file: CdfScanFile, pub selection_vector: Option>, } -// The type of action associated with a [`CDFScanFile`]. +// The type of action associated with a [`CdfScanFile`]. #[allow(unused)] #[derive(Debug, Clone, PartialEq)] -pub(crate) enum CDFScanFileType { +pub(crate) enum CdfScanFileType { Add, Remove, Cdc, @@ -41,9 +56,9 @@ pub(crate) enum CDFScanFileType { /// Represents all the metadata needed to read a Change Data Feed. #[allow(unused)] #[derive(Debug, PartialEq, Clone)] -pub(crate) struct CDFScanFile { +pub(crate) struct CdfScanFile { /// The type of action this file belongs to. This may be one of add, remove, or cdc. - pub scan_type: CDFScanFileType, + pub scan_type: CdfScanFileType, /// a `&str` which is the path to the file pub path: String, /// a [`DvInfo`] struct, which allows getting the selection vector for this file @@ -56,18 +71,18 @@ pub(crate) struct CDFScanFile { pub commit_timestamp: i64, } -pub(crate) type CDFScanCallback = fn(context: &mut T, scan_file: CDFScanFile); +pub(crate) type CdfScanCallback = fn(context: &mut T, scan_file: CdfScanFile); /// Transforms an iterator of TableChangesScanData into an iterator of -/// `UnresolvedCDFScanFile` by visiting the engine data. +/// `UnresolvedCdfScanFile` by visiting the engine data. 
#[allow(unused)] pub(crate) fn scan_data_to_scan_file( scan_data: impl Iterator>, -) -> impl Iterator> { +) -> impl Iterator> { scan_data .map(|scan_data| -> DeltaResult<_> { let scan_data = scan_data?; - let callback: CDFScanCallback> = + let callback: CdfScanCallback> = |context, scan_file| context.push(scan_file); let result = visit_cdf_scan_files( scan_data.scan_data.as_ref(), @@ -76,7 +91,7 @@ pub(crate) fn scan_data_to_scan_file( callback, )? .into_iter() - .map(move |scan_file| UnresolvedCDFScanFile { + .map(move |scan_file| UnresolvedCdfScanFile { scan_file, remove_dvs: scan_data.remove_dvs.clone(), }); @@ -90,7 +105,7 @@ pub(crate) fn scan_data_to_scan_file( /// /// The arguments to the callback are: /// * `context`: an `&mut context` argument. this can be anything that engine needs to pass through to each call -/// * `CDFScanFile`: a [`CDFScanFile`] struct that holds all the metadata required to perform Change Data +/// * `CdfScanFile`: a [`CdfScanFile`] struct that holds all the metadata required to perform Change Data /// Feed /// /// ## Context @@ -116,9 +131,9 @@ pub(crate) fn visit_cdf_scan_files( data: &dyn EngineData, selection_vector: &[bool], context: T, - callback: CDFScanCallback, + callback: CdfScanCallback, ) -> DeltaResult { - let mut visitor = CDFScanFileVisitor { + let mut visitor = CdfScanFileVisitor { callback, selection_vector, context, @@ -130,18 +145,18 @@ pub(crate) fn visit_cdf_scan_files( // add some visitor magic for engines #[allow(unused)] -struct CDFScanFileVisitor<'a, T> { - callback: CDFScanCallback, +struct CdfScanFileVisitor<'a, T> { + callback: CdfScanCallback, selection_vector: &'a [bool], context: T, } -impl RowVisitor for CDFScanFileVisitor<'_, T> { +impl RowVisitor for CdfScanFileVisitor<'_, T> { fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { require!( getters.len() == 18, Error::InternalError(format!( - "Wrong number of CDFScanFileVisitor getters: {}", + "Wrong number 
of CdfScanFileVisitor getters: {}", getters.len() )) ); @@ -152,13 +167,13 @@ impl RowVisitor for CDFScanFileVisitor<'_, T> { let (scan_type, path, deletion_vector, partition_values) = if let Some(path) = getters[0].get_opt(row_index, "scanFile.add.path")? { - let scan_type = CDFScanFileType::Add; + let scan_type = CdfScanFileType::Add; let deletion_vector = visit_deletion_vector_at(row_index, &getters[1..=5])?; let partition_values = getters[6] .get(row_index, "scanFile.add.fileConstantValues.partitionValues")?; (scan_type, path, deletion_vector, partition_values) } else if let Some(path) = getters[7].get_opt(row_index, "scanFile.remove.path")? { - let scan_type = CDFScanFileType::Remove; + let scan_type = CdfScanFileType::Remove; let deletion_vector = visit_deletion_vector_at(row_index, &getters[8..=12])?; let partition_values = getters[13].get( row_index, @@ -166,7 +181,7 @@ impl RowVisitor for CDFScanFileVisitor<'_, T> { )?; (scan_type, path, deletion_vector, partition_values) } else if let Some(path) = getters[14].get_opt(row_index, "scanFile.cdc.path")? { - let scan_type = CDFScanFileType::Cdc; + let scan_type = CdfScanFileType::Cdc; let partition_values = getters[15] .get(row_index, "scanFile.cdc.fileConstantValues.partitionValues")?; (scan_type, path, None, partition_values) @@ -174,7 +189,7 @@ impl RowVisitor for CDFScanFileVisitor<'_, T> { continue; }; let dv_info = DvInfo { deletion_vector }; - let scan_file = CDFScanFile { + let scan_file = CdfScanFile { scan_type, path, dv_info, @@ -196,7 +211,7 @@ impl RowVisitor for CDFScanFileVisitor<'_, T> { /// Get the schema that scan rows (from [`TableChanges::scan_data`]) will be returned with. 
pub(crate) fn cdf_scan_row_schema() -> SchemaRef { - static CDF_SCAN_ROW_SCHEMA: LazyLock> = LazyLock::new(|| { + static Cdf_SCAN_ROW_SCHEMA: LazyLock> = LazyLock::new(|| { let deletion_vector = StructType::new([ StructField::new("storageType", DataType::STRING, true), StructField::new("pathOrInlineDv", DataType::STRING, true), @@ -231,7 +246,7 @@ pub(crate) fn cdf_scan_row_schema() -> SchemaRef { StructField::new("commit_version", DataType::LONG, true), ])) }); - CDF_SCAN_ROW_SCHEMA.clone() + Cdf_SCAN_ROW_SCHEMA.clone() } /// Expression to convert an action with `log_schema` into one with @@ -264,10 +279,10 @@ mod tests { use itertools::Itertools; - use super::CDFScanFileType; + use super::CdfScanFileType; use super::{ - cdf_scan_row_schema, get_cdf_scan_row_expression, visit_cdf_scan_files, CDFScanCallback, - CDFScanFile, + cdf_scan_row_schema, get_cdf_scan_row_expression, visit_cdf_scan_files, CdfScanCallback, + CdfScanFile, }; use crate::actions::deletion_vector::DeletionVectorDescriptor; use crate::actions::{get_log_schema, Add, Cdc, Remove}; @@ -358,7 +373,7 @@ mod tests { .map(|data| -> DeltaResult<_> { let data = data?; let selection_vector = vec![true; data.len()]; - let callback: CDFScanCallback> = + let callback: CdfScanCallback> = |context, scan_file| context.push(scan_file); visit_cdf_scan_files(data.as_ref(), &selection_vector, vec![], callback) }) @@ -367,8 +382,8 @@ mod tests { .unwrap(); let expected_scan_files = vec![ - CDFScanFile { - scan_type: CDFScanFileType::Add, + CdfScanFile { + scan_type: CdfScanFileType::Add, path: add.path, dv_info: DvInfo { deletion_vector: add.deletion_vector, @@ -377,8 +392,8 @@ mod tests { commit_version, commit_timestamp, }, - CDFScanFile { - scan_type: CDFScanFileType::Remove, + CdfScanFile { + scan_type: CdfScanFileType::Remove, path: remove.path, dv_info: DvInfo { deletion_vector: remove.deletion_vector, @@ -387,8 +402,8 @@ mod tests { commit_version, commit_timestamp, }, - CDFScanFile { - scan_type: 
CDFScanFileType::Cdc, + CdfScanFile { + scan_type: CdfScanFileType::Cdc, path: cdc.path, dv_info: DvInfo { deletion_vector: None, From 3db127f4208bd28bd87b3fe5b0cc87683589dadf Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Fri, 6 Dec 2024 14:52:44 -0800 Subject: [PATCH 03/56] fixup some naming --- kernel/src/table_changes/scan_file.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/src/table_changes/scan_file.rs b/kernel/src/table_changes/scan_file.rs index 876a9c2f4..0e069b7f6 100644 --- a/kernel/src/table_changes/scan_file.rs +++ b/kernel/src/table_changes/scan_file.rs @@ -211,7 +211,7 @@ impl RowVisitor for CdfScanFileVisitor<'_, T> { /// Get the schema that scan rows (from [`TableChanges::scan_data`]) will be returned with. pub(crate) fn cdf_scan_row_schema() -> SchemaRef { - static Cdf_SCAN_ROW_SCHEMA: LazyLock> = LazyLock::new(|| { + static CDF_SCAN_ROW_SCHEMA: LazyLock> = LazyLock::new(|| { let deletion_vector = StructType::new([ StructField::new("storageType", DataType::STRING, true), StructField::new("pathOrInlineDv", DataType::STRING, true), @@ -246,7 +246,7 @@ pub(crate) fn cdf_scan_row_schema() -> SchemaRef { StructField::new("commit_version", DataType::LONG, true), ])) }); - Cdf_SCAN_ROW_SCHEMA.clone() + CDF_SCAN_ROW_SCHEMA.clone() } /// Expression to convert an action with `log_schema` into one with From ec6e62919565c7582a100145b508e2af92417b15 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Fri, 6 Dec 2024 16:05:30 -0800 Subject: [PATCH 04/56] inline dvinfo creation --- kernel/src/table_changes/scan_file.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/src/table_changes/scan_file.rs b/kernel/src/table_changes/scan_file.rs index 0e069b7f6..8b5750cc1 100644 --- a/kernel/src/table_changes/scan_file.rs +++ b/kernel/src/table_changes/scan_file.rs @@ -188,11 +188,10 @@ impl RowVisitor for CdfScanFileVisitor<'_, T> { } else { continue; }; - let dv_info = DvInfo { deletion_vector }; let 
scan_file = CdfScanFile { scan_type, path, - dv_info, + dv_info: DvInfo { deletion_vector }, partition_values, commit_timestamp: getters[16].get(row_index, "scanFile.timestamp")?, commit_version: getters[17].get(row_index, "scanFile.commit_version")?, From 74eed1016f5babd88f45835faa8a13cf20ea2490 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Fri, 6 Dec 2024 16:13:16 -0800 Subject: [PATCH 05/56] Change name of expression transform --- kernel/src/table_changes/scan_file.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/src/table_changes/scan_file.rs b/kernel/src/table_changes/scan_file.rs index 8b5750cc1..dfb608fdf 100644 --- a/kernel/src/table_changes/scan_file.rs +++ b/kernel/src/table_changes/scan_file.rs @@ -1,7 +1,7 @@ //! This module handles [`CdfScanFile`]s for [`TableChangeScan`]. A [`CdfScanFile`] consists of all the //! metadata required to generate a change data feed. [`CdfScanFile`] can be constructed using //! [`CdfScanFileVisitor`]. The visitor reads from engine data with the schema [`cdf_scan_row_schema`]. -//! You can convert engine data to this schema using the [`get_cdf_scan_row_expression`]. +//! You can convert engine data to this schema using the [`cdf_scan_row_expression`]. use itertools::Itertools; use std::collections::HashMap; use std::sync::{Arc, LazyLock}; @@ -251,7 +251,7 @@ pub(crate) fn cdf_scan_row_schema() -> SchemaRef { /// Expression to convert an action with `log_schema` into one with /// `TABLE_CHANGES_cdf_scan_row_schema`. This is the expression used to create `TableChangesScanData`. 
#[allow(unused)] -pub(crate) fn get_cdf_scan_row_expression(commit_timestamp: i64, commit_number: i64) -> Expression { +pub(crate) fn cdf_scan_row_expression(commit_timestamp: i64, commit_number: i64) -> Expression { Expression::struct_from([ Expression::struct_from([ column_expr!("add.path"), @@ -280,7 +280,7 @@ mod tests { use super::CdfScanFileType; use super::{ - cdf_scan_row_schema, get_cdf_scan_row_expression, visit_cdf_scan_files, CdfScanCallback, + cdf_scan_row_expression, cdf_scan_row_schema, visit_cdf_scan_files, CdfScanCallback, CdfScanFile, }; use crate::actions::deletion_vector::DeletionVectorDescriptor; @@ -363,7 +363,7 @@ mod tests { .get_expression_handler() .get_evaluator( get_log_schema().clone(), - get_cdf_scan_row_expression(commit_timestamp, commit_version), + cdf_scan_row_expression(commit_timestamp, commit_version), cdf_scan_row_schema().into(), ) .evaluate(actions.as_ref()) From 9fe16dbfd4765809410041fad5d1bb93befb28d1 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sat, 7 Dec 2024 15:16:06 -0800 Subject: [PATCH 06/56] Add test for null parttion columns --- kernel/src/table_changes/scan_file.rs | 29 ++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/kernel/src/table_changes/scan_file.rs b/kernel/src/table_changes/scan_file.rs index dfb608fdf..1a06ec169 100644 --- a/kernel/src/table_changes/scan_file.rs +++ b/kernel/src/table_changes/scan_file.rs @@ -170,12 +170,12 @@ impl RowVisitor for CdfScanFileVisitor<'_, T> { let scan_type = CdfScanFileType::Add; let deletion_vector = visit_deletion_vector_at(row_index, &getters[1..=5])?; let partition_values = getters[6] - .get(row_index, "scanFile.add.fileConstantValues.partitionValues")?; + .get_opt(row_index, "scanFile.add.fileConstantValues.partitionValues")?; (scan_type, path, deletion_vector, partition_values) } else if let Some(path) = getters[7].get_opt(row_index, "scanFile.remove.path")? 
{ let scan_type = CdfScanFileType::Remove; let deletion_vector = visit_deletion_vector_at(row_index, &getters[8..=12])?; - let partition_values = getters[13].get( + let partition_values = getters[13].get_opt( row_index, "scanFile.remove.fileConstantValues.partitionValues", )?; @@ -183,11 +183,12 @@ impl RowVisitor for CdfScanFileVisitor<'_, T> { } else if let Some(path) = getters[14].get_opt(row_index, "scanFile.cdc.path")? { let scan_type = CdfScanFileType::Cdc; let partition_values = getters[15] - .get(row_index, "scanFile.cdc.fileConstantValues.partitionValues")?; + .get_opt(row_index, "scanFile.cdc.fileConstantValues.partitionValues")?; (scan_type, path, None, partition_values) } else { continue; }; + let partition_values = partition_values.unwrap_or_else(Default::default); let scan_file = CdfScanFile { scan_type, path, @@ -241,8 +242,8 @@ pub(crate) fn cdf_scan_row_schema() -> SchemaRef { StructField::new("add", add, true), StructField::new("remove", remove, true), StructField::new("cdc", cdc, true), - StructField::new("timestamp", DataType::LONG, true), - StructField::new("commit_version", DataType::LONG, true), + StructField::new("timestamp", DataType::LONG, false), + StructField::new("commit_version", DataType::LONG, false), ])) }); CDF_SCAN_ROW_SCHEMA.clone() @@ -333,11 +334,19 @@ mod tests { ..Default::default() }; + let remove_no_partition = Remove { + path: "fake_path_2".into(), + deletion_vector: None, + partition_values: None, + ..Default::default() + }; + mock_table .commit([ Action::Add(add.clone()), Action::Remove(remove.clone()), Action::Cdc(cdc.clone()), + Action::Remove(remove_no_partition.clone()), ]) .await; @@ -411,6 +420,16 @@ mod tests { commit_version, commit_timestamp, }, + CdfScanFile { + scan_type: CdfScanFileType::Remove, + path: remove_no_partition.path, + dv_info: DvInfo { + deletion_vector: None, + }, + partition_values: HashMap::new(), + commit_version, + commit_timestamp, + }, ]; assert_eq!(expected_scan_files, scan_files); } From 
8635e380a0761c6b611b94cf6b9eaa418b8539c7 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sat, 7 Dec 2024 16:06:45 -0800 Subject: [PATCH 07/56] Improve testing for scan_file --- kernel/src/table_changes/log_replay.rs | 29 ++----- kernel/src/table_changes/scan_file.rs | 114 ++++++++++++------------- 2 files changed, 65 insertions(+), 78 deletions(-) diff --git a/kernel/src/table_changes/log_replay.rs b/kernel/src/table_changes/log_replay.rs index 76d324088..99121a545 100644 --- a/kernel/src/table_changes/log_replay.rs +++ b/kernel/src/table_changes/log_replay.rs @@ -11,12 +11,12 @@ use crate::actions::{ PROTOCOL_NAME, REMOVE_NAME, }; use crate::engine_data::{GetData, TypedGetData}; -use crate::expressions::{column_expr, column_name, ColumnName, Expression}; +use crate::expressions::{column_name, ColumnName}; use crate::path::ParsedLogPath; use crate::scan::data_skipping::DataSkippingFilter; -use crate::scan::scan_row_schema; use crate::scan::state::DvInfo; use crate::schema::{ArrayType, ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructType}; +use crate::table_changes::scan_file::{cdf_scan_row_expression, cdf_scan_row_schema}; use crate::table_changes::{check_cdf_table_properties, ensure_cdf_read_supported}; use crate::table_properties::TableProperties; use crate::utils::require; @@ -66,21 +66,6 @@ pub(crate) fn table_changes_action_iter( Ok(result) } -// Gets the expression for generating the engine data in [`TableChangesScanData`]. -// -// TODO: This expression is temporary. In the future it will also select `cdc` and `remove` actions -// fields. -fn add_transform_expr() -> Expression { - Expression::Struct(vec![ - column_expr!("add.path"), - column_expr!("add.size"), - column_expr!("add.modificationTime"), - column_expr!("add.stats"), - column_expr!("add.deletionVector"), - Expression::Struct(vec![column_expr!("add.partitionValues")]), - ]) -} - /// Processes a single commit file from the log to generate an iterator of [`TableChangesScanData`]. 
/// The scanner operates in two phases that _must_ be performed in the following order: /// 1. Prepare phase [`LogReplayScanner::try_new`]: This iterates over every action in the commit. @@ -238,7 +223,7 @@ impl LogReplayScanner { remove_dvs, commit_file, // TODO: Add the timestamp as a column with an expression - timestamp: _, + timestamp, } = self; let remove_dvs = Arc::new(remove_dvs); @@ -248,10 +233,14 @@ impl LogReplayScanner { schema, None, )?; + let commit_version = commit_file + .version + .try_into() + .map_err(|_| Error::generic("Failed to convert commit version to i64"))?; let evaluator = engine.get_expression_handler().get_evaluator( get_log_add_schema().clone(), - add_transform_expr(), - scan_row_schema().into(), + cdf_scan_row_expression(timestamp, commit_version), + cdf_scan_row_schema().into(), ); let result = action_iter.map(move |actions| -> DeltaResult<_> { diff --git a/kernel/src/table_changes/scan_file.rs b/kernel/src/table_changes/scan_file.rs index 1a06ec169..bdc6b1b92 100644 --- a/kernel/src/table_changes/scan_file.rs +++ b/kernel/src/table_changes/scan_file.rs @@ -29,6 +29,7 @@ use crate::{DeltaResult, EngineData, Error, RowVisitor}; /// deletion vectors, generating the correct selection vectors, and patching the [`CdfScanFileType`]. /// This is done in `resolve_scan_file_dvs`. 
#[allow(unused)] +#[derive(Debug)] pub(crate) struct UnresolvedCdfScanFile { pub scan_file: CdfScanFile, pub remove_dvs: Arc>, @@ -276,21 +277,20 @@ pub(crate) fn cdf_scan_row_expression(commit_timestamp: i64, commit_number: i64) #[cfg(test)] mod tests { use std::collections::HashMap; + use std::sync::Arc; use itertools::Itertools; - use super::CdfScanFileType; - use super::{ - cdf_scan_row_expression, cdf_scan_row_schema, visit_cdf_scan_files, CdfScanCallback, - CdfScanFile, - }; + use super::{scan_data_to_scan_file, CdfScanFile, CdfScanFileType}; use crate::actions::deletion_vector::DeletionVectorDescriptor; - use crate::actions::{get_log_schema, Add, Cdc, Remove}; + use crate::actions::{Add, Cdc, Remove}; use crate::engine::sync::SyncEngine; use crate::log_segment::LogSegment; use crate::scan::state::DvInfo; + use crate::schema::{DataType, StructField, StructType}; + use crate::table_changes::log_replay::table_changes_action_iter; use crate::utils::test_utils::{Action, LocalMockTable}; - use crate::{DeltaResult, Engine}; + use crate::Engine; #[tokio::test] async fn schema_transform_correct() { @@ -309,6 +309,7 @@ mod tests { path: "fake_path_1".into(), deletion_vector: Some(add_dv.clone()), partition_values: add_partition_values, + data_change: true, ..Default::default() }; @@ -324,6 +325,7 @@ mod tests { path: "fake_path_2".into(), deletion_vector: Some(rm_dv), partition_values: rm_partition_values, + data_change: true, ..Default::default() }; @@ -338,57 +340,47 @@ mod tests { path: "fake_path_2".into(), deletion_vector: None, partition_values: None, + data_change: true, ..Default::default() }; mock_table - .commit([ - Action::Add(add.clone()), - Action::Remove(remove.clone()), - Action::Cdc(cdc.clone()), - Action::Remove(remove_no_partition.clone()), - ]) + .commit([Action::Add(add.clone()), Action::Remove(remove.clone())]) + .await; + mock_table.commit([Action::Cdc(cdc.clone())]).await; + mock_table + .commit([Action::Remove(remove_no_partition.clone())]) 
.await; + // Read the table and generate [`TableChangesScanData`] let table_root = url::Url::from_directory_path(mock_table.table_root()).unwrap(); let log_root = table_root.join("_delta_log/").unwrap(); - let log_segment = - LogSegment::for_table_changes(engine.get_file_system_client().as_ref(), log_root, 0, 0) - .unwrap(); - let commit = log_segment.ascending_commit_files[0].clone(); - - let actions = engine - .get_json_handler() - .read_json_files(&[commit.location.clone()], get_log_schema().clone(), None) - .unwrap(); - - // Transform the engine data into the [`cdf_scan_row_schema`] and insert - // the following timestamp and commit version. - let commit_timestamp = 1234_i64; - let commit_version = 42_i64; - let scan_files: Vec<_> = actions - .map_ok(|actions| { - engine - .get_expression_handler() - .get_evaluator( - get_log_schema().clone(), - cdf_scan_row_expression(commit_timestamp, commit_version), - cdf_scan_row_schema().into(), - ) - .evaluate(actions.as_ref()) - .unwrap() - }) - .map(|data| -> DeltaResult<_> { - let data = data?; - let selection_vector = vec![true; data.len()]; - let callback: CdfScanCallback> = - |context, scan_file| context.push(scan_file); - visit_cdf_scan_files(data.as_ref(), &selection_vector, vec![], callback) - }) - .flatten_ok() - .try_collect() - .unwrap(); + let log_segment = LogSegment::for_table_changes( + engine.get_file_system_client().as_ref(), + log_root, + 0, + None, + ) + .unwrap(); + let table_schema = StructType::new([ + StructField::new("id", DataType::INTEGER, true), + StructField::new("value", DataType::STRING, true), + ]); + let scan_data = table_changes_action_iter( + Arc::new(engine), + log_segment.ascending_commit_files.clone(), + table_schema.into(), + None, + ) + .unwrap(); + let scan_files = scan_data_to_scan_file(scan_data); + // Generate the expected [`CdfScanFile`] + let timestamps = log_segment + .ascending_commit_files + .iter() + .map(|commit| commit.location.last_modified) + .collect_vec(); let 
expected_scan_files = vec![ CdfScanFile { scan_type: CdfScanFileType::Add, @@ -397,8 +389,8 @@ mod tests { deletion_vector: add.deletion_vector, }, partition_values: add.partition_values, - commit_version, - commit_timestamp, + commit_version: 0, + commit_timestamp: timestamps[0], }, CdfScanFile { scan_type: CdfScanFileType::Remove, @@ -407,8 +399,8 @@ mod tests { deletion_vector: remove.deletion_vector, }, partition_values: remove.partition_values.unwrap(), - commit_version, - commit_timestamp, + commit_version: 0, + commit_timestamp: timestamps[0], }, CdfScanFile { scan_type: CdfScanFileType::Cdc, @@ -417,8 +409,8 @@ mod tests { deletion_vector: None, }, partition_values: cdc.partition_values, - commit_version, - commit_timestamp, + commit_version: 1, + commit_timestamp: timestamps[1], }, CdfScanFile { scan_type: CdfScanFileType::Remove, @@ -427,10 +419,16 @@ mod tests { deletion_vector: None, }, partition_values: HashMap::new(), - commit_version, - commit_timestamp, + commit_version: 2, + commit_timestamp: timestamps[2], }, ]; - assert_eq!(expected_scan_files, scan_files); + + // Check the generated [`UnresolvedCdfScanFile`] + for (unresolved_scan_file, scan_file) in scan_files.zip(expected_scan_files.into_iter()) { + let unresolved_scan_file = unresolved_scan_file.unwrap(); + assert_eq!(unresolved_scan_file.scan_file, scan_file); + assert_eq!(unresolved_scan_file.remove_dvs, HashMap::new().into()); + } } } From a17f4b09cbf1b7be74e2deaa477088236af7535b Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sat, 7 Dec 2024 18:34:18 -0800 Subject: [PATCH 08/56] Change visitor, remove (un)resolved cdf scan file --- kernel/src/table_changes/log_replay.rs | 2 +- kernel/src/table_changes/scan_file.rs | 81 ++++++++------------------ 2 files changed, 24 insertions(+), 59 deletions(-) diff --git a/kernel/src/table_changes/log_replay.rs b/kernel/src/table_changes/log_replay.rs index 99121a545..9c6cfe872 100644 --- a/kernel/src/table_changes/log_replay.rs +++ 
b/kernel/src/table_changes/log_replay.rs @@ -37,7 +37,7 @@ pub(crate) struct TableChangesScanData { pub(crate) scan_data: Box, /// The selection vector used to filter the `scan_data`. pub(crate) selection_vector: Vec, - /// An map from a remove action's path to its deletion vector + /// A map from a remove action's path to its deletion vector pub(crate) remove_dvs: Arc>, } diff --git a/kernel/src/table_changes/scan_file.rs b/kernel/src/table_changes/scan_file.rs index bdc6b1b92..9a8a3a829 100644 --- a/kernel/src/table_changes/scan_file.rs +++ b/kernel/src/table_changes/scan_file.rs @@ -15,35 +15,7 @@ use crate::schema::{ ColumnName, ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType, }; use crate::utils::require; -use crate::{DeltaResult, EngineData, Error, RowVisitor}; - -/// A struct holding a [`CdfScanFile`] and map holding remove action paths to their deletion -/// vectors. The [`CdfScanFile`] has a type [`CdfScanFileType`] that represents the action it was -/// read from. Normally, a `scan_file` with type [`CdfScanFileType::Add`] produces row insertions. -/// However the type may not be accurate due to deletion vector resolution. Add/Remove pairs are -/// represented by a single `scan_file` with type add, and a corresponding deletion vector from the -/// map. After resolving deletion vectors, the `scan_file` might only result in removed rows, only -/// added rows, or both added and removed rows. -/// -/// An [`UnresolvedCdfScanFile`] can be converted into [`ResolvedCdfScanFile`] by reading the -/// deletion vectors, generating the correct selection vectors, and patching the [`CdfScanFileType`]. -/// This is done in `resolve_scan_file_dvs`. -#[allow(unused)] -#[derive(Debug)] -pub(crate) struct UnresolvedCdfScanFile { - pub scan_file: CdfScanFile, - pub remove_dvs: Arc>, -} - -/// A struct holding a [`CdfScanFile`] and its selection vector. 
The [`CdfScanFile`] has a type -/// [`CdfScanFileType`] that represents the `_change_type` that its rows will have in the change -/// data feed. See [`UnresolvedCdfScanFile`] for more details. -#[allow(unused)] -#[derive(Debug)] -pub(crate) struct ResolvedCdfScanFile { - pub scan_file: CdfScanFile, - pub selection_vector: Option>, -} +use crate::{DeltaResult, Error, RowVisitor}; // The type of action associated with a [`CdfScanFile`]. #[allow(unused)] @@ -60,16 +32,18 @@ pub(crate) enum CdfScanFileType { pub(crate) struct CdfScanFile { /// The type of action this file belongs to. This may be one of add, remove, or cdc. pub scan_type: CdfScanFileType, - /// a `&str` which is the path to the file + /// A `&str` which is the path to the file pub path: String, - /// a [`DvInfo`] struct, which allows getting the selection vector for this file + /// A [`DvInfo`] struct, which allows getting the selection vector for this file pub dv_info: DvInfo, - /// a `HashMap` which are partition values + /// A `HashMap` which are partition values pub partition_values: HashMap, - /// the commit version that this action was performed in + /// The commit version that this action was performed in pub commit_version: i64, - /// the timestamp of the commit that this action was performed in + /// The timestamp of the commit that this action was performed in pub commit_timestamp: i64, + /// A map from a remove action's path to its deletion vector + pub remove_dvs: Arc>, } pub(crate) type CdfScanCallback = fn(context: &mut T, scan_file: CdfScanFile); @@ -79,24 +53,13 @@ pub(crate) type CdfScanCallback = fn(context: &mut T, scan_file: CdfScanFile) #[allow(unused)] pub(crate) fn scan_data_to_scan_file( scan_data: impl Iterator>, -) -> impl Iterator> { +) -> impl Iterator> { scan_data .map(|scan_data| -> DeltaResult<_> { let scan_data = scan_data?; let callback: CdfScanCallback> = |context, scan_file| context.push(scan_file); - let result = visit_cdf_scan_files( - scan_data.scan_data.as_ref(), - 
&scan_data.selection_vector, - vec![], - callback, - )? - .into_iter() - .map(move |scan_file| UnresolvedCdfScanFile { - scan_file, - remove_dvs: scan_data.remove_dvs.clone(), - }); - Ok(result) + Ok(visit_cdf_scan_files(&scan_data, vec![], callback)?.into_iter()) }) // Iterator-Result-Iterator .flatten_ok() // Iterator-Result } @@ -129,18 +92,18 @@ pub(crate) fn scan_data_to_scan_file( /// ``` #[allow(unused)] pub(crate) fn visit_cdf_scan_files( - data: &dyn EngineData, - selection_vector: &[bool], + scan_data: &TableChangesScanData, context: T, callback: CdfScanCallback, ) -> DeltaResult { let mut visitor = CdfScanFileVisitor { callback, - selection_vector, context, + selection_vector: &scan_data.selection_vector, + remove_dvs: &scan_data.remove_dvs, }; - visitor.visit_rows_of(data)?; + visitor.visit_rows_of(scan_data.scan_data.as_ref())?; Ok(visitor.context) } @@ -149,6 +112,7 @@ pub(crate) fn visit_cdf_scan_files( struct CdfScanFileVisitor<'a, T> { callback: CdfScanCallback, selection_vector: &'a [bool], + remove_dvs: &'a Arc>, context: T, } @@ -197,6 +161,7 @@ impl RowVisitor for CdfScanFileVisitor<'_, T> { partition_values, commit_timestamp: getters[16].get(row_index, "scanFile.timestamp")?, commit_version: getters[17].get(row_index, "scanFile.commit_version")?, + remove_dvs: self.remove_dvs.clone(), }; (self.callback)(&mut self.context, scan_file) } @@ -373,7 +338,7 @@ mod tests { None, ) .unwrap(); - let scan_files = scan_data_to_scan_file(scan_data); + let scan_files: Vec<_> = scan_data_to_scan_file(scan_data).try_collect().unwrap(); // Generate the expected [`CdfScanFile`] let timestamps = log_segment @@ -381,6 +346,7 @@ mod tests { .iter() .map(|commit| commit.location.last_modified) .collect_vec(); + let remove_dvs = Arc::new(HashMap::new()); let expected_scan_files = vec![ CdfScanFile { scan_type: CdfScanFileType::Add, @@ -391,6 +357,7 @@ mod tests { partition_values: add.partition_values, commit_version: 0, commit_timestamp: timestamps[0], + 
remove_dvs: remove_dvs.clone(), }, CdfScanFile { scan_type: CdfScanFileType::Remove, @@ -401,6 +368,7 @@ mod tests { partition_values: remove.partition_values.unwrap(), commit_version: 0, commit_timestamp: timestamps[0], + remove_dvs: remove_dvs.clone(), }, CdfScanFile { scan_type: CdfScanFileType::Cdc, @@ -411,6 +379,7 @@ mod tests { partition_values: cdc.partition_values, commit_version: 1, commit_timestamp: timestamps[1], + remove_dvs: remove_dvs.clone(), }, CdfScanFile { scan_type: CdfScanFileType::Remove, @@ -421,14 +390,10 @@ mod tests { partition_values: HashMap::new(), commit_version: 2, commit_timestamp: timestamps[2], + remove_dvs, }, ]; - // Check the generated [`UnresolvedCdfScanFile`] - for (unresolved_scan_file, scan_file) in scan_files.zip(expected_scan_files.into_iter()) { - let unresolved_scan_file = unresolved_scan_file.unwrap(); - assert_eq!(unresolved_scan_file.scan_file, scan_file); - assert_eq!(unresolved_scan_file.remove_dvs, HashMap::new().into()); - } + assert_eq!(scan_files, expected_scan_files); } } From 5fab00c67775b6b5e52c23ccd11173975e886b48 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sat, 7 Dec 2024 20:08:13 -0800 Subject: [PATCH 09/56] Only keep track of remove dv instead of hashmap --- kernel/src/table_changes/scan_file.rs | 37 +++++++++++++++------------ 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/kernel/src/table_changes/scan_file.rs b/kernel/src/table_changes/scan_file.rs index 9a8a3a829..101882f57 100644 --- a/kernel/src/table_changes/scan_file.rs +++ b/kernel/src/table_changes/scan_file.rs @@ -30,20 +30,20 @@ pub(crate) enum CdfScanFileType { #[allow(unused)] #[derive(Debug, PartialEq, Clone)] pub(crate) struct CdfScanFile { - /// The type of action this file belongs to. This may be one of add, remove, or cdc. + /// The type of action this file belongs to. 
This may be one of add, remove, or cdc pub scan_type: CdfScanFileType, /// A `&str` which is the path to the file pub path: String, - /// A [`DvInfo`] struct, which allows getting the selection vector for this file - pub dv_info: DvInfo, + /// A [`DvInfo`] struct with the path to the action's deletion vector + pub add_dv: DvInfo, + /// A [`DvInfo`] struct with the path to the paired remove action's deletion vector + pub remove_dv: DvInfo, /// A `HashMap` which are partition values pub partition_values: HashMap, /// The commit version that this action was performed in pub commit_version: i64, /// The timestamp of the commit that this action was performed in pub commit_timestamp: i64, - /// A map from a remove action's path to its deletion vector - pub remove_dvs: Arc>, } pub(crate) type CdfScanCallback = fn(context: &mut T, scan_file: CdfScanFile); @@ -154,14 +154,17 @@ impl RowVisitor for CdfScanFileVisitor<'_, T> { continue; }; let partition_values = partition_values.unwrap_or_else(Default::default); + let remove_dv = self.remove_dvs.get(&path).cloned().unwrap_or(DvInfo { + deletion_vector: None, + }); let scan_file = CdfScanFile { scan_type, path, - dv_info: DvInfo { deletion_vector }, + add_dv: DvInfo { deletion_vector }, partition_values, commit_timestamp: getters[16].get(row_index, "scanFile.timestamp")?, commit_version: getters[17].get(row_index, "scanFile.commit_version")?, - remove_dvs: self.remove_dvs.clone(), + remove_dv, }; (self.callback)(&mut self.context, scan_file) } @@ -346,51 +349,53 @@ mod tests { .iter() .map(|commit| commit.location.last_modified) .collect_vec(); - let remove_dvs = Arc::new(HashMap::new()); + let expected_remove_dv = DvInfo { + deletion_vector: None, + }; let expected_scan_files = vec![ CdfScanFile { scan_type: CdfScanFileType::Add, path: add.path, - dv_info: DvInfo { + add_dv: DvInfo { deletion_vector: add.deletion_vector, }, partition_values: add.partition_values, commit_version: 0, commit_timestamp: timestamps[0], - 
remove_dvs: remove_dvs.clone(), + remove_dv: expected_remove_dv.clone(), }, CdfScanFile { scan_type: CdfScanFileType::Remove, path: remove.path, - dv_info: DvInfo { + add_dv: DvInfo { deletion_vector: remove.deletion_vector, }, partition_values: remove.partition_values.unwrap(), commit_version: 0, commit_timestamp: timestamps[0], - remove_dvs: remove_dvs.clone(), + remove_dv: expected_remove_dv.clone(), }, CdfScanFile { scan_type: CdfScanFileType::Cdc, path: cdc.path, - dv_info: DvInfo { + add_dv: DvInfo { deletion_vector: None, }, partition_values: cdc.partition_values, commit_version: 1, commit_timestamp: timestamps[1], - remove_dvs: remove_dvs.clone(), + remove_dv: expected_remove_dv.clone(), }, CdfScanFile { scan_type: CdfScanFileType::Remove, path: remove_no_partition.path, - dv_info: DvInfo { + add_dv: DvInfo { deletion_vector: None, }, partition_values: HashMap::new(), commit_version: 2, commit_timestamp: timestamps[2], - remove_dvs, + remove_dv: expected_remove_dv, }, ]; From 21553a42c23d586a695158b16f1cfc120f5ea9c2 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sat, 7 Dec 2024 21:37:56 -0800 Subject: [PATCH 10/56] Fix remove dv --- kernel/src/table_changes/scan_file.rs | 41 ++++++++++++++++----------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/kernel/src/table_changes/scan_file.rs b/kernel/src/table_changes/scan_file.rs index 101882f57..6df59947e 100644 --- a/kernel/src/table_changes/scan_file.rs +++ b/kernel/src/table_changes/scan_file.rs @@ -36,8 +36,9 @@ pub(crate) struct CdfScanFile { pub path: String, /// A [`DvInfo`] struct with the path to the action's deletion vector pub add_dv: DvInfo, - /// A [`DvInfo`] struct with the path to the paired remove action's deletion vector - pub remove_dv: DvInfo, + /// An optional [`DvInfo`] struct. 
If present, this is deletion vector of a remove action with + /// the same path as this [`CdfScanFile`] + pub remove_dv: Option, /// A `HashMap` which are partition values pub partition_values: HashMap, /// The commit version that this action was performed in @@ -154,17 +155,14 @@ impl RowVisitor for CdfScanFileVisitor<'_, T> { continue; }; let partition_values = partition_values.unwrap_or_else(Default::default); - let remove_dv = self.remove_dvs.get(&path).cloned().unwrap_or(DvInfo { - deletion_vector: None, - }); let scan_file = CdfScanFile { + remove_dv: self.remove_dvs.get(&path).cloned(), scan_type, path, add_dv: DvInfo { deletion_vector }, partition_values, commit_timestamp: getters[16].get(row_index, "scanFile.timestamp")?, commit_version: getters[17].get(row_index, "scanFile.commit_version")?, - remove_dv, }; (self.callback)(&mut self.context, scan_file) } @@ -261,7 +259,7 @@ mod tests { use crate::Engine; #[tokio::test] - async fn schema_transform_correct() { + async fn test_scan_file_visiting() { let engine = SyncEngine::new(); let mut mock_table = LocalMockTable::new(); @@ -273,13 +271,20 @@ mod tests { cardinality: 2, }; let add_partition_values = HashMap::from([("a".to_string(), "b".to_string())]); - let add = Add { + let add_paired = Add { path: "fake_path_1".into(), deletion_vector: Some(add_dv.clone()), partition_values: add_partition_values, data_change: true, ..Default::default() }; + let remove_paired = Remove { + path: "fake_path_1".into(), + deletion_vector: None, + partition_values: None, + data_change: true, + ..Default::default() + }; let rm_dv = DeletionVectorDescriptor { storage_type: "u".to_string(), @@ -313,7 +318,11 @@ mod tests { }; mock_table - .commit([Action::Add(add.clone()), Action::Remove(remove.clone())]) + .commit([ + Action::Remove(remove_paired.clone()), + Action::Add(add_paired.clone()), + Action::Remove(remove.clone()), + ]) .await; mock_table.commit([Action::Cdc(cdc.clone())]).await; mock_table @@ -355,14 +364,14 @@ mod 
tests { let expected_scan_files = vec![ CdfScanFile { scan_type: CdfScanFileType::Add, - path: add.path, + path: add_paired.path, add_dv: DvInfo { - deletion_vector: add.deletion_vector, + deletion_vector: add_paired.deletion_vector, }, - partition_values: add.partition_values, + partition_values: add_paired.partition_values, commit_version: 0, commit_timestamp: timestamps[0], - remove_dv: expected_remove_dv.clone(), + remove_dv: Some(expected_remove_dv), }, CdfScanFile { scan_type: CdfScanFileType::Remove, @@ -373,7 +382,7 @@ mod tests { partition_values: remove.partition_values.unwrap(), commit_version: 0, commit_timestamp: timestamps[0], - remove_dv: expected_remove_dv.clone(), + remove_dv: None, }, CdfScanFile { scan_type: CdfScanFileType::Cdc, @@ -384,7 +393,7 @@ mod tests { partition_values: cdc.partition_values, commit_version: 1, commit_timestamp: timestamps[1], - remove_dv: expected_remove_dv.clone(), + remove_dv: None, }, CdfScanFile { scan_type: CdfScanFileType::Remove, @@ -395,7 +404,7 @@ mod tests { partition_values: HashMap::new(), commit_version: 2, commit_timestamp: timestamps[2], - remove_dv: expected_remove_dv, + remove_dv: None, }, ]; From 08867f8036bafdca1e8f27de0ad78d43bef23e2c Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sat, 7 Dec 2024 23:43:41 -0800 Subject: [PATCH 11/56] patch rm_dv --- kernel/src/table_changes/scan_file.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/src/table_changes/scan_file.rs b/kernel/src/table_changes/scan_file.rs index 6df59947e..687e0749d 100644 --- a/kernel/src/table_changes/scan_file.rs +++ b/kernel/src/table_changes/scan_file.rs @@ -35,7 +35,7 @@ pub(crate) struct CdfScanFile { /// A `&str` which is the path to the file pub path: String, /// A [`DvInfo`] struct with the path to the action's deletion vector - pub add_dv: DvInfo, + pub dv_info: DvInfo, /// An optional [`DvInfo`] struct. 
If present, this is deletion vector of a remove action with /// the same path as this [`CdfScanFile`] pub remove_dv: Option, @@ -159,7 +159,7 @@ impl RowVisitor for CdfScanFileVisitor<'_, T> { remove_dv: self.remove_dvs.get(&path).cloned(), scan_type, path, - add_dv: DvInfo { deletion_vector }, + dv_info: DvInfo { deletion_vector }, partition_values, commit_timestamp: getters[16].get(row_index, "scanFile.timestamp")?, commit_version: getters[17].get(row_index, "scanFile.commit_version")?, @@ -263,7 +263,7 @@ mod tests { let engine = SyncEngine::new(); let mut mock_table = LocalMockTable::new(); - let add_dv = DeletionVectorDescriptor { + let dv_info = DeletionVectorDescriptor { storage_type: "u".to_string(), path_or_inline_dv: "vBn[lx{q8@P<9BNH/isA".to_string(), offset: Some(1), @@ -273,7 +273,7 @@ mod tests { let add_partition_values = HashMap::from([("a".to_string(), "b".to_string())]); let add_paired = Add { path: "fake_path_1".into(), - deletion_vector: Some(add_dv.clone()), + deletion_vector: Some(dv_info.clone()), partition_values: add_partition_values, data_change: true, ..Default::default() @@ -365,7 +365,7 @@ mod tests { CdfScanFile { scan_type: CdfScanFileType::Add, path: add_paired.path, - add_dv: DvInfo { + dv_info: DvInfo { deletion_vector: add_paired.deletion_vector, }, partition_values: add_paired.partition_values, @@ -376,7 +376,7 @@ mod tests { CdfScanFile { scan_type: CdfScanFileType::Remove, path: remove.path, - add_dv: DvInfo { + dv_info: DvInfo { deletion_vector: remove.deletion_vector, }, partition_values: remove.partition_values.unwrap(), @@ -387,7 +387,7 @@ mod tests { CdfScanFile { scan_type: CdfScanFileType::Cdc, path: cdc.path, - add_dv: DvInfo { + dv_info: DvInfo { deletion_vector: None, }, partition_values: cdc.partition_values, @@ -398,7 +398,7 @@ mod tests { CdfScanFile { scan_type: CdfScanFileType::Remove, path: remove_no_partition.path, - add_dv: DvInfo { + dv_info: DvInfo { deletion_vector: None, }, partition_values: 
HashMap::new(), From 122810770fe534a7a699b0388d1bf172357d18ef Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 11:30:42 -0800 Subject: [PATCH 12/56] Update comment for CdfScanFileVisitor --- kernel/src/table_changes/scan_file.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/src/table_changes/scan_file.rs b/kernel/src/table_changes/scan_file.rs index 687e0749d..36e367e3a 100644 --- a/kernel/src/table_changes/scan_file.rs +++ b/kernel/src/table_changes/scan_file.rs @@ -108,7 +108,8 @@ pub(crate) fn visit_cdf_scan_files( Ok(visitor.context) } -// add some visitor magic for engines +/// A visitor that extracts [`CdfScanFile`]s from engine data. Expects data to have the schema +/// [`cdf_scan_row_schema`]. #[allow(unused)] struct CdfScanFileVisitor<'a, T> { callback: CdfScanCallback, From 97a5790de65162be7bef581add1104e39c1f9ae7 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Thu, 5 Dec 2024 23:00:07 -0800 Subject: [PATCH 13/56] Initial cdf read phase with deletion vector resolution --- kernel/src/scan/state.rs | 18 +- kernel/src/table_changes/data_read.rs | 231 ++++++++++++++++++++++++++ kernel/src/table_changes/mod.rs | 1 + 3 files changed, 245 insertions(+), 5 deletions(-) create mode 100644 kernel/src/table_changes/data_read.rs diff --git a/kernel/src/scan/state.rs b/kernel/src/scan/state.rs index cc55103b8..f2fe69c73 100644 --- a/kernel/src/scan/state.rs +++ b/kernel/src/scan/state.rs @@ -14,6 +14,7 @@ use crate::{ table_features::ColumnMappingMode, DeltaResult, Engine, EngineData, Error, }; +use roaring::RoaringTreemap; use serde::{Deserialize, Serialize}; use tracing::warn; @@ -53,19 +54,26 @@ impl DvInfo { self.deletion_vector.is_some() } - pub fn get_selection_vector( + pub(crate) fn get_treemap( &self, engine: &dyn Engine, table_root: &url::Url, - ) -> DeltaResult>> { - let dv_treemap = self - .deletion_vector + ) -> DeltaResult> { + self.deletion_vector .as_ref() .map(|dv_descriptor| { let fs_client = 
engine.get_file_system_client(); dv_descriptor.read(fs_client, table_root) }) - .transpose()?; + .transpose() + } + + pub fn get_selection_vector( + &self, + engine: &dyn Engine, + table_root: &url::Url, + ) -> DeltaResult>> { + let dv_treemap = self.get_treemap(engine, table_root)?; Ok(dv_treemap.map(treemap_to_bools)) } diff --git a/kernel/src/table_changes/data_read.rs b/kernel/src/table_changes/data_read.rs new file mode 100644 index 000000000..9e58858d1 --- /dev/null +++ b/kernel/src/table_changes/data_read.rs @@ -0,0 +1,231 @@ +use std::{iter, ops::Not}; + +use itertools::Either; +use roaring::RoaringTreemap; +use url::Url; + +use super::scan_file::{CDFScanFileType, ResolvedCDFScanFile, UnresolvedCDFScanFile}; +use crate::{ + actions::deletion_vector::treemap_to_bools, table_changes::scan_file::CDFScanFile, DeltaResult, + Engine, Error, +}; + +/// Resolves the deletion vectors for an [`UnresolvedCDFScanFile`]. This function handles two +/// types of `CDFScanFile`s: +/// 1. The first case is a [`CDFScanFile`] paired with a remove deletion vector. The `scan_type` +/// must be [`CDFScanFileType::Add`]. In this case, both the add and remove deletion vectors are +/// read if they exist. Then, we find the set of rows in the scan file that have been added, and +/// the set of rows that have been removed. The set of removed rows (if any) will be represented +/// by a [`ResolvedCDFScanFile`] with `scan_type` = [`CDFScanFileType::Remove`]. The set of +/// added rows (if any) will be represented by a [`ResolvedCDFScanFile`] with `scan_type` = +/// [`CDFScanFileType::Add`]. +/// +/// Note: We allow the possibility for there to be both added rows, and removed rows for a +/// single add/remove pair. +/// 2. The second case handles all other add, remove, and cdc [`CDFScanFile`]s. These will simply +/// have the deletion vector read (if present), and each is converted into a [`ResolvedCDFScanFile`]. +/// No changes are made to the `scan_type`. 
+#[allow(unused)] +pub(crate) fn resolve_scan_file_dv( + engine: &dyn Engine, + table_root: &Url, + scan_file: UnresolvedCDFScanFile, +) -> DeltaResult> { + let UnresolvedCDFScanFile { + scan_file, + remove_dvs, + } = scan_file; + let paired_rm_dv = remove_dvs.get(&scan_file.path); + match (&scan_file.scan_type, paired_rm_dv) { + (CDFScanFileType::Add, Some(rm_dv)) => { + // Helper function to convert a treemap to a [`ResolvedCDFScanFile`]. The `scan_type` + // of the [`ResolvedCDFScanFile`] is set to `out_type` This returns an empty iterator + // if nothing is selected. + fn treemap_to_iter( + selection_treemap: RoaringTreemap, + mut scan_file: CDFScanFile, + out_type: CDFScanFileType, + ) -> impl Iterator { + if selection_treemap.is_empty() { + // Nothing has been selected, we do not read this data file + Either::Left(iter::empty()) + } else { + let added_dv = treemap_to_bools(selection_treemap) + .into_iter() + .map(Not::not) + .collect(); + println!("got dv: {:?} for type {:?}", added_dv, out_type); + scan_file.scan_type = out_type; + + Either::Right(iter::once(ResolvedCDFScanFile { + scan_file, + selection_vector: Some(added_dv), + })) + } + } + + // Retrieve the deletion vector from the add action and remove action + let add_dv = scan_file + .dv_info + .get_treemap(engine, table_root)? + .unwrap_or(Default::default()); + let rm_dv = rm_dv + .get_treemap(engine, table_root)? + .unwrap_or(Default::default()); + + // We calculate the deletion vectors as follows. Note that logically the `rm_dv` is the + // beginning state of the commit, and `add_dv` is the final state of the commit. In + // other words the dv went from being `rm_dv` to become `add_dv`. + // + // 1. First, find the `rm_dv XOR add_dv`. This sets the bits for all rows that have + // been changed in this commit. + // 2. Mask the set of changed bits by either the remove or add deletion vector. 
The + // cases are as follows: + // - If we mask by `rm_dv`, then the row went from 1 (deleted) to 0 (restored). + // Hence this row has been added. + // - If we mask by `add_dv`, then the row went fro 0 (present) to 1 (deleted). Hence + // this row has been deleted. + + let xor_dv = &rm_dv ^ &add_dv; + println!( + "Remove dv: {:?}, add_dv: {:?}, xor_dv: {:?}", + rm_dv, add_dv, xor_dv + ); + let added_selection_treemap = &xor_dv & rm_dv; + let removed_selection_treemap = xor_dv & add_dv; + + // Generate the [`ResolvedCDFScanFile`] for remove if there are rows selected + let removed = treemap_to_iter( + removed_selection_treemap, + scan_file.clone(), + CDFScanFileType::Remove, + ); + + // Generate the [`ResolvedCDFScanFile`] for add if there are rows selected + let added = treemap_to_iter(added_selection_treemap, scan_file, CDFScanFileType::Add); + + Ok(Either::Right(added.chain(removed))) + } + (_, Some(_)) => Err(Error::generic( + "Remove DV should only match to an add action!", + )), + (_, None) => { + let selection_vector = scan_file.dv_info.get_selection_vector(engine, table_root)?; + Ok(Either::Left(iter::once(ResolvedCDFScanFile { + scan_file, + selection_vector, + }))) + } + } +} + +#[cfg(test)] +mod tests { + use std::{collections::HashMap, path::PathBuf, sync::Arc}; + + use itertools::Itertools; + + use crate::{ + actions::deletion_vector::DeletionVectorDescriptor, + engine::sync::SyncEngine, + scan::state::DvInfo, + table_changes::scan_file::{CDFScanFile, CDFScanFileType, UnresolvedCDFScanFile}, + }; + + use super::resolve_scan_file_dv; + + #[test] + fn add_with_dv() { + let engine = SyncEngine::new(); + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + let table_root = url::Url::from_directory_path(path).unwrap(); + + let commit_version = 42_i64; + let commit_timestamp = 1234_i64; + let deletion_vector = Some(DeletionVectorDescriptor { + storage_type: "u".to_string(), + path_or_inline_dv: 
"vBn[lx{q8@P<9BNH/isA".to_string(), + offset: Some(1), + size_in_bytes: 36, + cardinality: 2, + }); + let path = "fake_path".to_string(); + let dv_info = DvInfo { deletion_vector }; + let scan_file = CDFScanFile { + scan_type: CDFScanFileType::Add, + path: path.clone(), + dv_info, + partition_values: HashMap::new(), + commit_version, + commit_timestamp, + }; + + let rm_dv = DvInfo { + deletion_vector: None, + }; + let input = UnresolvedCDFScanFile { + scan_file, + remove_dvs: Arc::new(HashMap::from([(path.clone(), rm_dv)])), + }; + + // Remove: None deleted + // Add: DV with 0th and 9th bit set (ie deleted) + let mut expected_sv = vec![false; 10]; + expected_sv[0] = true; + expected_sv[9] = true; + let resolved = resolve_scan_file_dv(&engine, &table_root, input) + .unwrap() + .map(|file| (file.scan_file.scan_type, file.selection_vector)) + .collect_vec(); + assert_eq!(resolved, vec![(CDFScanFileType::Remove, Some(expected_sv))]); + } + + #[test] + fn rm_with_dv() { + let engine = SyncEngine::new(); + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + let table_root = url::Url::from_directory_path(path).unwrap(); + + let commit_version = 42_i64; + let commit_timestamp = 1234_i64; + + let path = "fake_path".to_string(); + let dv_info = DvInfo { + deletion_vector: None, + }; + let scan_file = CDFScanFile { + scan_type: CDFScanFileType::Add, + path: path.clone(), + dv_info, + partition_values: HashMap::new(), + commit_version, + commit_timestamp, + }; + + let deletion_vector = Some(DeletionVectorDescriptor { + storage_type: "u".to_string(), + path_or_inline_dv: "vBn[lx{q8@P<9BNH/isA".to_string(), + offset: Some(1), + size_in_bytes: 36, + cardinality: 2, + }); + let rm_dv = DvInfo { deletion_vector }; + let input = UnresolvedCDFScanFile { + scan_file, + remove_dvs: Arc::new(HashMap::from([(path.clone(), rm_dv)])), + }; + + // Remove: DV with 0th and 9th bit set (ie deleted) + // Add: No rows deleted + let mut expected_sv = 
vec![false; 10]; + expected_sv[0] = true; + expected_sv[9] = true; + let resolved = resolve_scan_file_dv(&engine, &table_root, input) + .unwrap() + .map(|file| (file.scan_file.scan_type, file.selection_vector)) + .collect_vec(); + assert_eq!(resolved, vec![(CDFScanFileType::Add, Some(expected_sv))]); + } +} diff --git a/kernel/src/table_changes/mod.rs b/kernel/src/table_changes/mod.rs index 766866d25..d27454444 100644 --- a/kernel/src/table_changes/mod.rs +++ b/kernel/src/table_changes/mod.rs @@ -15,6 +15,7 @@ use crate::table_properties::TableProperties; use crate::utils::require; use crate::{DeltaResult, Engine, Error, Version}; +mod data_read; mod log_replay; pub mod scan; mod scan_file; From 4969e4451feb3030cac12af7e3471687103fd970 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Fri, 6 Dec 2024 09:26:43 -0800 Subject: [PATCH 14/56] lazily construct empty treemaps --- kernel/src/table_changes/data_read.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/src/table_changes/data_read.rs b/kernel/src/table_changes/data_read.rs index 9e58858d1..43fd8e2ec 100644 --- a/kernel/src/table_changes/data_read.rs +++ b/kernel/src/table_changes/data_read.rs @@ -68,10 +68,10 @@ pub(crate) fn resolve_scan_file_dv( let add_dv = scan_file .dv_info .get_treemap(engine, table_root)? - .unwrap_or(Default::default()); + .unwrap_or_else(Default::default); let rm_dv = rm_dv .get_treemap(engine, table_root)? - .unwrap_or(Default::default()); + .unwrap_or_else(Default::default); // We calculate the deletion vectors as follows. Note that logically the `rm_dv` is the // beginning state of the commit, and `add_dv` is the final state of the commit. 
In From dcb17fa3990d68ef944103c2193541b019350d07 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Fri, 6 Dec 2024 12:53:02 -0800 Subject: [PATCH 15/56] Change treemap handling in kernel, use selection vector, and simplify dv resolution --- kernel/src/actions/deletion_vector.rs | 40 +++++++++++++++++++++++---- kernel/src/scan/mod.rs | 6 ++-- kernel/src/scan/state.rs | 8 ++---- kernel/src/table_changes/data_read.rs | 38 ++++++++----------------- 4 files changed, 54 insertions(+), 38 deletions(-) diff --git a/kernel/src/actions/deletion_vector.rs b/kernel/src/actions/deletion_vector.rs index 59e4b48fa..a87d7c014 100644 --- a/kernel/src/actions/deletion_vector.rs +++ b/kernel/src/actions/deletion_vector.rs @@ -215,7 +215,17 @@ fn slice_to_u32(buf: &[u8], endian: Endian) -> DeltaResult { /// helper function to convert a treemap into a boolean vector where, for index i, if the bit is /// set, the vector will be false, and otherwise at index i the vector will be true -pub(crate) fn treemap_to_bools(treemap: RoaringTreemap) -> Vec { +pub(crate) fn deletion_treemap_to_bools(treemap: RoaringTreemap) -> Vec { + treemap_to_bools_with(treemap, false) +} + +/// helper function to convert a treemap into a boolean vector where, for index i, if the bit is +/// set, the vector will be true, and otherwise at index i the vector will be false +pub(crate) fn selection_treemap_to_bools(treemap: RoaringTreemap) -> Vec { + treemap_to_bools_with(treemap, true) +} + +fn treemap_to_bools_with(treemap: RoaringTreemap, set_bit: bool) -> Vec { fn combine(high_bits: u32, low_bits: u32) -> usize { ((u64::from(high_bits) << 32) | u64::from(low_bits)) as usize } @@ -224,12 +234,12 @@ pub(crate) fn treemap_to_bools(treemap: RoaringTreemap) -> Vec { Some(max) => { // there are values in the map //TODO(nick) panic if max is > MAX_USIZE - let mut result = vec![true; max as usize + 1]; + let mut result = vec![!set_bit; max as usize + 1]; let bitmaps = treemap.bitmaps(); for (index, bitmap) in bitmaps 
{ for bit in bitmap.iter() { let vec_index = combine(index, bit); - result[vec_index] = false; + result[vec_index] = set_bit; } } result @@ -380,7 +390,7 @@ mod tests { } // this test is ignored by default as it's expensive to allocate such big vecs full of `true`. you can run it via: - // cargo test actions::action_definitions::tests::test_dv_to_bools + // cargo test actions::deletion_vector::tests::test_dv_to_bools -- --ignored #[test] #[test] #[ignore] fn test_dv_to_bools() { @@ -391,7 +401,7 @@ mod tests { rb.insert(30854); rb.insert(4294967297); rb.insert(4294967300); - let bools = super::treemap_to_bools(rb); + let bools = super::deletion_treemap_to_bools(rb); let mut expected = vec![true; 4294967301]; expected[0] = false; expected[2] = false; @@ -402,6 +412,26 @@ mod tests { assert_eq!(bools, expected); } + #[test] + fn test_sv_to_bools() { + let mut rb = RoaringTreemap::new(); + rb.insert(0); + rb.insert(2); + rb.insert(7); + rb.insert(30854); + rb.insert(4294967297); + rb.insert(4294967300); + let bools = super::selection_treemap_to_bools(rb); + let mut expected = vec![false; 4294967301]; + expected[0] = true; + expected[2] = true; + expected[7] = true; + expected[30854] = true; + expected[4294967297] = true; + expected[4294967300] = true; + assert_eq!(bools, expected); + } + #[test] fn test_dv_row_indexes() { let example = dv_inline(); diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index a60361179..4785173f8 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -7,7 +7,9 @@ use itertools::Itertools; use tracing::debug; use url::Url; -use crate::actions::deletion_vector::{split_vector, treemap_to_bools, DeletionVectorDescriptor}; +use crate::actions::deletion_vector::{ + deletion_treemap_to_bools, split_vector, DeletionVectorDescriptor, +}; use crate::actions::{get_log_add_schema, get_log_schema, ADD_NAME, REMOVE_NAME}; use crate::expressions::{ColumnName, Expression, ExpressionRef, Scalar}; use crate::scan::state::{DvInfo, 
Stats}; @@ -438,7 +440,7 @@ pub fn selection_vector( ) -> DeltaResult> { let fs_client = engine.get_file_system_client(); let dv_treemap = descriptor.read(fs_client, table_root)?; - Ok(treemap_to_bools(dv_treemap)) + Ok(deletion_treemap_to_bools(dv_treemap)) } /// Transform the raw data read from parquet into the correct logical form, based on the provided diff --git a/kernel/src/scan/state.rs b/kernel/src/scan/state.rs index f2fe69c73..5420feda0 100644 --- a/kernel/src/scan/state.rs +++ b/kernel/src/scan/state.rs @@ -3,12 +3,10 @@ use std::collections::HashMap; use std::sync::LazyLock; +use crate::actions::deletion_vector::deletion_treemap_to_bools; use crate::utils::require; use crate::{ - actions::{ - deletion_vector::{treemap_to_bools, DeletionVectorDescriptor}, - visitors::visit_deletion_vector_at, - }, + actions::{deletion_vector::DeletionVectorDescriptor, visitors::visit_deletion_vector_at}, engine_data::{GetData, RowVisitor, TypedGetData as _}, schema::{ColumnName, ColumnNamesAndTypes, DataType, SchemaRef}, table_features::ColumnMappingMode, @@ -74,7 +72,7 @@ impl DvInfo { table_root: &url::Url, ) -> DeltaResult>> { let dv_treemap = self.get_treemap(engine, table_root)?; - Ok(dv_treemap.map(treemap_to_bools)) + Ok(dv_treemap.map(deletion_treemap_to_bools)) } /// Returns a vector of row indexes that should be *removed* from the result set diff --git a/kernel/src/table_changes/data_read.rs b/kernel/src/table_changes/data_read.rs index 43fd8e2ec..9cc28ddf3 100644 --- a/kernel/src/table_changes/data_read.rs +++ b/kernel/src/table_changes/data_read.rs @@ -1,14 +1,13 @@ -use std::{iter, ops::Not}; +use std::iter; use itertools::Either; use roaring::RoaringTreemap; use url::Url; use super::scan_file::{CDFScanFileType, ResolvedCDFScanFile, UnresolvedCDFScanFile}; -use crate::{ - actions::deletion_vector::treemap_to_bools, table_changes::scan_file::CDFScanFile, DeltaResult, - Engine, Error, -}; +use crate::actions::deletion_vector::selection_treemap_to_bools; +use 
crate::table_changes::scan_file::CDFScanFile; +use crate::{DeltaResult, Engine, Error}; /// Resolves the deletion vectors for an [`UnresolvedCDFScanFile`]. This function handles two /// types of `CDFScanFile`s: @@ -50,11 +49,7 @@ pub(crate) fn resolve_scan_file_dv( // Nothing has been selected, we do not read this data file Either::Left(iter::empty()) } else { - let added_dv = treemap_to_bools(selection_treemap) - .into_iter() - .map(Not::not) - .collect(); - println!("got dv: {:?} for type {:?}", added_dv, out_type); + let added_dv = selection_treemap_to_bools(selection_treemap); scan_file.scan_type = out_type; Either::Right(iter::once(ResolvedCDFScanFile { @@ -77,22 +72,13 @@ pub(crate) fn resolve_scan_file_dv( // beginning state of the commit, and `add_dv` is the final state of the commit. In // other words the dv went from being `rm_dv` to become `add_dv`. // - // 1. First, find the `rm_dv XOR add_dv`. This sets the bits for all rows that have - // been changed in this commit. - // 2. Mask the set of changed bits by either the remove or add deletion vector. The - // cases are as follows: - // - If we mask by `rm_dv`, then the row went from 1 (deleted) to 0 (restored). - // Hence this row has been added. - // - If we mask by `add_dv`, then the row went fro 0 (present) to 1 (deleted). Hence - // this row has been deleted. - - let xor_dv = &rm_dv ^ &add_dv; - println!( - "Remove dv: {:?}, add_dv: {:?}, xor_dv: {:?}", - rm_dv, add_dv, xor_dv - ); - let added_selection_treemap = &xor_dv & rm_dv; - let removed_selection_treemap = xor_dv & add_dv; + // The selection vector of add rows is calculated using `rm_dv - add_dv`. These rows went + // from 1 (deleted) in `rm_dv` to 0 (restored) in the `add_dv`. All unchanged rows will remain 0. + // + // The selection vector of deleted rows is calculated using `add_dv - rm_dv`. These rows went + // from 0 (present) in `rm_dv` to 1 (deleted) in the `add_dv`. All unchanged rows will remain 0. 
+ let added_selection_treemap = &rm_dv - &add_dv; + let removed_selection_treemap = add_dv - rm_dv; // Generate the [`ResolvedCDFScanFile`] for remove if there are rows selected let removed = treemap_to_iter( From ebaf22523ab159ce0cc7733923088d5cbb37167c Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Fri, 6 Dec 2024 14:51:42 -0800 Subject: [PATCH 16/56] Rebase onto scan file --- kernel/src/table_changes/data_read.rs | 70 +++++++++++++-------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/kernel/src/table_changes/data_read.rs b/kernel/src/table_changes/data_read.rs index 9cc28ddf3..d4677a917 100644 --- a/kernel/src/table_changes/data_read.rs +++ b/kernel/src/table_changes/data_read.rs @@ -4,47 +4,47 @@ use itertools::Either; use roaring::RoaringTreemap; use url::Url; -use super::scan_file::{CDFScanFileType, ResolvedCDFScanFile, UnresolvedCDFScanFile}; +use super::scan_file::{CdfScanFileType, ResolvedCdfScanFile, UnresolvedCdfScanFile}; use crate::actions::deletion_vector::selection_treemap_to_bools; -use crate::table_changes::scan_file::CDFScanFile; +use crate::table_changes::scan_file::CdfScanFile; use crate::{DeltaResult, Engine, Error}; -/// Resolves the deletion vectors for an [`UnresolvedCDFScanFile`]. This function handles two -/// types of `CDFScanFile`s: -/// 1. The first case is a [`CDFScanFile`] paired with a remove deletion vector. The `scan_type` -/// must be [`CDFScanFileType::Add`]. In this case, both the add and remove deletion vectors are +/// Resolves the deletion vectors for an [`UnresolvedCdfScanFile`]. This function handles two +/// types of `CdfScanFile`s: +/// 1. The first case is a [`CdfScanFile`] paired with a remove deletion vector. The `scan_type` +/// must be [`CdfScanFileType::Add`]. In this case, both the add and remove deletion vectors are /// read if they exist. Then, we find the set of rows in the scan file that have been added, and /// the set of rows that have been removed. 
The set of removed rows (if any) will be represented -/// by a [`ResolvedCDFScanFile`] with `scan_type` = [`CDFScanFileType::Remove`]. The set of -/// added rows (if any) will be represented by a [`ResolvedCDFScanFile`] with `scan_type` = -/// [`CDFScanFileType::Add`]. +/// by a [`ResolvedCdfScanFile`] with `scan_type` = [`CdfScanFileType::Remove`]. The set of +/// added rows (if any) will be represented by a [`ResolvedCdfScanFile`] with `scan_type` = +/// [`CdfScanFileType::Add`]. /// /// Note: We allow the possibility for there to be both added rows, and removed rows for a /// single add/remove pair. -/// 2. The second case handles all other add, remove, and cdc [`CDFScanFile`]s. These will simply -/// have the deletion vector read (if present), and each is converted into a [`ResolvedCDFScanFile`]. +/// 2. The second case handles all other add, remove, and cdc [`CdfScanFile`]s. These will simply +/// have the deletion vector read (if present), and each is converted into a [`ResolvedCdfScanFile`]. /// No changes are made to the `scan_type`. #[allow(unused)] pub(crate) fn resolve_scan_file_dv( engine: &dyn Engine, table_root: &Url, - scan_file: UnresolvedCDFScanFile, -) -> DeltaResult> { - let UnresolvedCDFScanFile { + scan_file: UnresolvedCdfScanFile, +) -> DeltaResult> { + let UnresolvedCdfScanFile { scan_file, remove_dvs, } = scan_file; let paired_rm_dv = remove_dvs.get(&scan_file.path); match (&scan_file.scan_type, paired_rm_dv) { - (CDFScanFileType::Add, Some(rm_dv)) => { - // Helper function to convert a treemap to a [`ResolvedCDFScanFile`]. The `scan_type` - // of the [`ResolvedCDFScanFile`] is set to `out_type` This returns an empty iterator + (CdfScanFileType::Add, Some(rm_dv)) => { + // Helper function to convert a treemap to a [`ResolvedCdfScanFile`]. The `scan_type` + // of the [`ResolvedCdfScanFile`] is set to `out_type` This returns an empty iterator // if nothing is selected. 
fn treemap_to_iter( selection_treemap: RoaringTreemap, - mut scan_file: CDFScanFile, - out_type: CDFScanFileType, - ) -> impl Iterator { + mut scan_file: CdfScanFile, + out_type: CdfScanFileType, + ) -> impl Iterator { if selection_treemap.is_empty() { // Nothing has been selected, we do not read this data file Either::Left(iter::empty()) @@ -52,7 +52,7 @@ pub(crate) fn resolve_scan_file_dv( let added_dv = selection_treemap_to_bools(selection_treemap); scan_file.scan_type = out_type; - Either::Right(iter::once(ResolvedCDFScanFile { + Either::Right(iter::once(ResolvedCdfScanFile { scan_file, selection_vector: Some(added_dv), })) @@ -80,15 +80,15 @@ pub(crate) fn resolve_scan_file_dv( let added_selection_treemap = &rm_dv - &add_dv; let removed_selection_treemap = add_dv - rm_dv; - // Generate the [`ResolvedCDFScanFile`] for remove if there are rows selected + // Generate the [`ResolvedCdfScanFile`] for remove if there are rows selected let removed = treemap_to_iter( removed_selection_treemap, scan_file.clone(), - CDFScanFileType::Remove, + CdfScanFileType::Remove, ); - // Generate the [`ResolvedCDFScanFile`] for add if there are rows selected - let added = treemap_to_iter(added_selection_treemap, scan_file, CDFScanFileType::Add); + // Generate the [`ResolvedCdfScanFile`] for add if there are rows selected + let added = treemap_to_iter(added_selection_treemap, scan_file, CdfScanFileType::Add); Ok(Either::Right(added.chain(removed))) } @@ -97,7 +97,7 @@ pub(crate) fn resolve_scan_file_dv( )), (_, None) => { let selection_vector = scan_file.dv_info.get_selection_vector(engine, table_root)?; - Ok(Either::Left(iter::once(ResolvedCDFScanFile { + Ok(Either::Left(iter::once(ResolvedCdfScanFile { scan_file, selection_vector, }))) @@ -115,7 +115,7 @@ mod tests { actions::deletion_vector::DeletionVectorDescriptor, engine::sync::SyncEngine, scan::state::DvInfo, - table_changes::scan_file::{CDFScanFile, CDFScanFileType, UnresolvedCDFScanFile}, + 
table_changes::scan_file::{CdfScanFile, CdfScanFileType, UnresolvedCdfScanFile}, }; use super::resolve_scan_file_dv; @@ -138,8 +138,8 @@ mod tests { }); let path = "fake_path".to_string(); let dv_info = DvInfo { deletion_vector }; - let scan_file = CDFScanFile { - scan_type: CDFScanFileType::Add, + let scan_file = CdfScanFile { + scan_type: CdfScanFileType::Add, path: path.clone(), dv_info, partition_values: HashMap::new(), @@ -150,7 +150,7 @@ mod tests { let rm_dv = DvInfo { deletion_vector: None, }; - let input = UnresolvedCDFScanFile { + let input = UnresolvedCdfScanFile { scan_file, remove_dvs: Arc::new(HashMap::from([(path.clone(), rm_dv)])), }; @@ -164,7 +164,7 @@ mod tests { .unwrap() .map(|file| (file.scan_file.scan_type, file.selection_vector)) .collect_vec(); - assert_eq!(resolved, vec![(CDFScanFileType::Remove, Some(expected_sv))]); + assert_eq!(resolved, vec![(CdfScanFileType::Remove, Some(expected_sv))]); } #[test] @@ -181,8 +181,8 @@ mod tests { let dv_info = DvInfo { deletion_vector: None, }; - let scan_file = CDFScanFile { - scan_type: CDFScanFileType::Add, + let scan_file = CdfScanFile { + scan_type: CdfScanFileType::Add, path: path.clone(), dv_info, partition_values: HashMap::new(), @@ -198,7 +198,7 @@ mod tests { cardinality: 2, }); let rm_dv = DvInfo { deletion_vector }; - let input = UnresolvedCDFScanFile { + let input = UnresolvedCdfScanFile { scan_file, remove_dvs: Arc::new(HashMap::from([(path.clone(), rm_dv)])), }; @@ -212,6 +212,6 @@ mod tests { .unwrap() .map(|file| (file.scan_file.scan_type, file.selection_vector)) .collect_vec(); - assert_eq!(resolved, vec![(CDFScanFileType::Add, Some(expected_sv))]); + assert_eq!(resolved, vec![(CdfScanFileType::Add, Some(expected_sv))]); } } From bd4914238669fbabe5b65b02bc82f9f43abce4c6 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Fri, 6 Dec 2024 15:57:03 -0800 Subject: [PATCH 17/56] update doc comment for dvs --- kernel/src/table_changes/data_read.rs | 26 ++++++++++++++++++++++++-- 1 file 
changed, 24 insertions(+), 2 deletions(-) diff --git a/kernel/src/table_changes/data_read.rs b/kernel/src/table_changes/data_read.rs index d4677a917..f3bc6c83d 100644 --- a/kernel/src/table_changes/data_read.rs +++ b/kernel/src/table_changes/data_read.rs @@ -68,15 +68,37 @@ pub(crate) fn resolve_scan_file_dv( .get_treemap(engine, table_root)? .unwrap_or_else(Default::default); - // We calculate the deletion vectors as follows. Note that logically the `rm_dv` is the + // Here we show how deletion vectors are resolved. Note that logically the `rm_dv` is the // beginning state of the commit, and `add_dv` is the final state of the commit. In - // other words the dv went from being `rm_dv` to become `add_dv`. + // other words the dv went from being `rm_dv` to become `add_dv`. We use a motivating + // example to explain the cases: + // rm_dv = [1, 1, 0] + // - add_dv = [0, 1, 1] // + // The result of this commit is: + // - row 0 is restored + // - row 1 is unchanged + // - row 2 is deleted + // + // # Insertion Selection Vector // The selection vector of add rows is calculated using `rm_dv - add_dv`. These rows went // from 1 (deleted) in `rm_dv` to 0 (restored) in the `add_dv`. All unchanged rows will remain 0. + // Applying this to our deletion vectors: + // rm_dv - add_dv = + // [1, 1, 0] + // - [0, 1, 1] + // = [1, 0, 0] + // The selection vector shows that row 0 was inserted // + // # Deletion Selection Vector // The selection vector of deleted rows is calculated using `add_dv - rm_dv`. These rows went // from 0 (present) in `rm_dv` to 1 (deleted) in the `add_dv`. All unchanged rows will remain 0. 
+ // Applying this to our deletion vectors: + // add_dv - rm_dv = + // [0, 1, 1] + // - [1, 1, 0] + // = [0, 0, 1] + // The selection vector shows that row 2 was deleted let added_selection_treemap = &rm_dv - &add_dv; let removed_selection_treemap = add_dv - rm_dv; From 6287e6e68162eccfb69f1451532529a7f4881296 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Fri, 6 Dec 2024 16:04:51 -0800 Subject: [PATCH 18/56] fix comment --- kernel/src/table_changes/data_read.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/src/table_changes/data_read.rs b/kernel/src/table_changes/data_read.rs index f3bc6c83d..3969fda5e 100644 --- a/kernel/src/table_changes/data_read.rs +++ b/kernel/src/table_changes/data_read.rs @@ -84,7 +84,7 @@ pub(crate) fn resolve_scan_file_dv( // The selection vector of add rows is calculated using `rm_dv - add_dv`. These rows went // from 1 (deleted) in `rm_dv` to 0 (restored) in the `add_dv`. All unchanged rows will remain 0. // Applying this to our deletion vectors: - // rm_dv - add_dv = + // `rm_dv - add_dv` = // [1, 1, 0] // - [0, 1, 1] // = [1, 0, 0] @@ -94,7 +94,7 @@ pub(crate) fn resolve_scan_file_dv( // The selection vector of deleted rows is calculated using `add_dv - rm_dv`. These rows went // from 0 (present) in `rm_dv` to 1 (deleted) in the `add_dv`. All unchanged rows will remain 0. 
// Applying this to our deletion vectors: - // add_dv - rm_dv = + // `add_dv - rm_dv` = // [0, 1, 1] // - [1, 1, 0] // = [0, 0, 1] From 7fe4f4dc08647c340773db245398fdf83817f3a3 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sat, 7 Dec 2024 22:15:42 -0800 Subject: [PATCH 19/56] prototype of add/rm dv in cdfscanfile --- kernel/src/table_changes/data_read.rs | 222 +++++++++++--------------- 1 file changed, 92 insertions(+), 130 deletions(-) diff --git a/kernel/src/table_changes/data_read.rs b/kernel/src/table_changes/data_read.rs index 3969fda5e..ee77dfecd 100644 --- a/kernel/src/table_changes/data_read.rs +++ b/kernel/src/table_changes/data_read.rs @@ -1,14 +1,17 @@ -use std::iter; +use std::ops::Not; -use itertools::Either; -use roaring::RoaringTreemap; use url::Url; -use super::scan_file::{CdfScanFileType, ResolvedCdfScanFile, UnresolvedCdfScanFile}; -use crate::actions::deletion_vector::selection_treemap_to_bools; +use super::scan_file::CdfScanFileType; +use crate::actions::deletion_vector::{deletion_treemap_to_bools, selection_treemap_to_bools}; use crate::table_changes::scan_file::CdfScanFile; use crate::{DeltaResult, Engine, Error}; +struct ResolvedCdfScanFile { + scan_file: CdfScanFile, + selection_vector: Vec, +} + /// Resolves the deletion vectors for an [`UnresolvedCdfScanFile`]. This function handles two /// types of `CdfScanFile`s: /// 1. The first case is a [`CdfScanFile`] paired with a remove deletion vector. The `scan_type` @@ -28,108 +31,75 @@ use crate::{DeltaResult, Engine, Error}; pub(crate) fn resolve_scan_file_dv( engine: &dyn Engine, table_root: &Url, - scan_file: UnresolvedCdfScanFile, + scan_file: CdfScanFile, ) -> DeltaResult> { - let UnresolvedCdfScanFile { - scan_file, - remove_dvs, - } = scan_file; - let paired_rm_dv = remove_dvs.get(&scan_file.path); - match (&scan_file.scan_type, paired_rm_dv) { - (CdfScanFileType::Add, Some(rm_dv)) => { - // Helper function to convert a treemap to a [`ResolvedCdfScanFile`]. 
The `scan_type` - // of the [`ResolvedCdfScanFile`] is set to `out_type` This returns an empty iterator - // if nothing is selected. - fn treemap_to_iter( - selection_treemap: RoaringTreemap, - mut scan_file: CdfScanFile, - out_type: CdfScanFileType, - ) -> impl Iterator { - if selection_treemap.is_empty() { - // Nothing has been selected, we do not read this data file - Either::Left(iter::empty()) - } else { - let added_dv = selection_treemap_to_bools(selection_treemap); - scan_file.scan_type = out_type; - - Either::Right(iter::once(ResolvedCdfScanFile { - scan_file, - selection_vector: Some(added_dv), - })) - } - } - - // Retrieve the deletion vector from the add action and remove action - let add_dv = scan_file - .dv_info - .get_treemap(engine, table_root)? - .unwrap_or_else(Default::default); - let rm_dv = rm_dv - .get_treemap(engine, table_root)? - .unwrap_or_else(Default::default); - - // Here we show how deletion vectors are resolved. Note that logically the `rm_dv` is the - // beginning state of the commit, and `add_dv` is the final state of the commit. In - // other words the dv went from being `rm_dv` to become `add_dv`. We use a motivating - // example to explain the cases: - // rm_dv = [1, 1, 0] - // - add_dv = [0, 1, 1] - // - // The result of this commit is: - // - row 0 is restored - // - row 1 is unchanged - // - row 2 is deleted - // - // # Insertion Selection Vector - // The selection vector of add rows is calculated using `rm_dv - add_dv`. These rows went - // from 1 (deleted) in `rm_dv` to 0 (restored) in the `add_dv`. All unchanged rows will remain 0. - // Applying this to our deletion vectors: - // `rm_dv - add_dv` = - // [1, 1, 0] - // - [0, 1, 1] - // = [1, 0, 0] - // The selection vector shows that row 0 was inserted - // - // # Deletion Selection Vector - // The selection vector of deleted rows is calculated using `add_dv - rm_dv`. These rows went - // from 0 (present) in `rm_dv` to 1 (deleted) in the `add_dv`. 
All unchanged rows will remain 0. - // Applying this to our deletion vectors: - // `add_dv - rm_dv` = - // [0, 1, 1] - // - [1, 1, 0] - // = [0, 0, 1] - // The selection vector shows that row 2 was deleted - let added_selection_treemap = &rm_dv - &add_dv; - let removed_selection_treemap = add_dv - rm_dv; - - // Generate the [`ResolvedCdfScanFile`] for remove if there are rows selected - let removed = treemap_to_iter( - removed_selection_treemap, - scan_file.clone(), - CdfScanFileType::Remove, - ); - - // Generate the [`ResolvedCdfScanFile`] for add if there are rows selected - let added = treemap_to_iter(added_selection_treemap, scan_file, CdfScanFileType::Add); - - Ok(Either::Right(added.chain(removed))) + let add_dv = scan_file.add_dv.get_treemap(engine, table_root)?; + let rm_dv = scan_file + .remove_dv + .as_ref() + .map(|rm_dv| rm_dv.get_treemap(engine, table_root)) + .transpose()?; + println!( + "initial {add_dv:?}, {rm_dv:?}, scan type: {:?}", + scan_file.scan_type + ); + let (add_dv, rm_dv) = match (add_dv, rm_dv, &scan_file.scan_type) { + (_, Some(_), CdfScanFileType::Remove) => { + return Err(Error::generic( + "CdfScanFile with type remove cannot have a remove deletion vector", + )); } - (_, Some(_)) => Err(Error::generic( - "Remove DV should only match to an add action!", - )), - (_, None) => { - let selection_vector = scan_file.dv_info.get_selection_vector(engine, table_root)?; - Ok(Either::Left(iter::once(ResolvedCdfScanFile { - scan_file, - selection_vector, - }))) + (_, Some(_), CdfScanFileType::Cdc) => { + return Err(Error::generic( + "CdfScanFile with type cdccannot have a remove deletion vector", + )); } - } + (add_dv, Some(rm_dv), CdfScanFileType::Add) => { + let add_dv = add_dv.unwrap_or_else(Default::default); + let rm_dv = rm_dv.unwrap_or_else(Default::default); + // Take the symmetric difference so we don't double count rows + let adds = &rm_dv - &add_dv; + let removes = add_dv - rm_dv; + ( + adds.is_empty().not().then_some(adds), + 
removes.is_empty().not().then_some(removes), + ) + } + (add_dv, None, CdfScanFileType::Add | CdfScanFileType::Cdc) => { + (Some(add_dv.unwrap_or_else(Default::default)), None) + } + (rm_dv, None, CdfScanFileType::Remove) => { + (None, Some(rm_dv.unwrap_or_else(Default::default))) + } + }; + let treemap_to_bools = if scan_file.remove_dv.is_some() { + selection_treemap_to_bools + } else { + deletion_treemap_to_bools + }; + + let rm_scan_file = CdfScanFile { + scan_type: CdfScanFileType::Remove, + ..scan_file.clone() + }; + let adds = add_dv + .map(treemap_to_bools) + .map(|selection_vector| ResolvedCdfScanFile { + scan_file, + selection_vector, + }); + let removes = rm_dv + .map(treemap_to_bools) + .map(|selection_vector| ResolvedCdfScanFile { + scan_file: rm_scan_file, + selection_vector, + }); + Ok([adds, removes].into_iter().flatten()) } #[cfg(test)] mod tests { - use std::{collections::HashMap, path::PathBuf, sync::Arc}; + use std::{collections::HashMap, path::PathBuf}; use itertools::Itertools; @@ -137,7 +107,7 @@ mod tests { actions::deletion_vector::DeletionVectorDescriptor, engine::sync::SyncEngine, scan::state::DvInfo, - table_changes::scan_file::{CdfScanFile, CdfScanFileType, UnresolvedCdfScanFile}, + table_changes::scan_file::{CdfScanFile, CdfScanFileType}, }; use super::resolve_scan_file_dv; @@ -159,34 +129,30 @@ mod tests { cardinality: 2, }); let path = "fake_path".to_string(); - let dv_info = DvInfo { deletion_vector }; + let add_dv = DvInfo { deletion_vector }; + let remove_dv = Some(DvInfo { + deletion_vector: None, + }); let scan_file = CdfScanFile { scan_type: CdfScanFileType::Add, path: path.clone(), - dv_info, + add_dv, + remove_dv, partition_values: HashMap::new(), commit_version, commit_timestamp, }; - let rm_dv = DvInfo { - deletion_vector: None, - }; - let input = UnresolvedCdfScanFile { - scan_file, - remove_dvs: Arc::new(HashMap::from([(path.clone(), rm_dv)])), - }; - // Remove: None deleted // Add: DV with 0th and 9th bit set (ie deleted) 
let mut expected_sv = vec![false; 10]; expected_sv[0] = true; expected_sv[9] = true; - let resolved = resolve_scan_file_dv(&engine, &table_root, input) + let resolved = resolve_scan_file_dv(&engine, &table_root, scan_file) .unwrap() .map(|file| (file.scan_file.scan_type, file.selection_vector)) .collect_vec(); - assert_eq!(resolved, vec![(CdfScanFileType::Remove, Some(expected_sv))]); + assert_eq!(resolved, vec![(CdfScanFileType::Remove, expected_sv)]); } #[test] @@ -196,44 +162,40 @@ mod tests { std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); let table_root = url::Url::from_directory_path(path).unwrap(); + let deletion_vector = Some(DeletionVectorDescriptor { + storage_type: "u".to_string(), + path_or_inline_dv: "vBn[lx{q8@P<9BNH/isA".to_string(), + offset: Some(1), + size_in_bytes: 36, + cardinality: 2, + }); let commit_version = 42_i64; let commit_timestamp = 1234_i64; let path = "fake_path".to_string(); - let dv_info = DvInfo { + let add_dv = DvInfo { deletion_vector: None, }; + let remove_dv = Some(DvInfo { deletion_vector }); let scan_file = CdfScanFile { scan_type: CdfScanFileType::Add, path: path.clone(), - dv_info, + add_dv, + remove_dv, partition_values: HashMap::new(), commit_version, commit_timestamp, }; - let deletion_vector = Some(DeletionVectorDescriptor { - storage_type: "u".to_string(), - path_or_inline_dv: "vBn[lx{q8@P<9BNH/isA".to_string(), - offset: Some(1), - size_in_bytes: 36, - cardinality: 2, - }); - let rm_dv = DvInfo { deletion_vector }; - let input = UnresolvedCdfScanFile { - scan_file, - remove_dvs: Arc::new(HashMap::from([(path.clone(), rm_dv)])), - }; - // Remove: DV with 0th and 9th bit set (ie deleted) // Add: No rows deleted let mut expected_sv = vec![false; 10]; expected_sv[0] = true; expected_sv[9] = true; - let resolved = resolve_scan_file_dv(&engine, &table_root, input) + let resolved = resolve_scan_file_dv(&engine, &table_root, scan_file) .unwrap() .map(|file| (file.scan_file.scan_type, 
file.selection_vector)) .collect_vec(); - assert_eq!(resolved, vec![(CdfScanFileType::Add, Some(expected_sv))]); + assert_eq!(resolved, vec![(CdfScanFileType::Add, expected_sv)]); } } From d4f95d5cbfb785f13bc8cf7e09a308f76fbb69fc Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 10:22:42 -0800 Subject: [PATCH 20/56] Use default for dv info --- kernel/src/scan/state.rs | 2 +- kernel/src/table_changes/data_read.rs | 20 ++++++++------------ 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/kernel/src/scan/state.rs b/kernel/src/scan/state.rs index 5420feda0..9bd7ec156 100644 --- a/kernel/src/scan/state.rs +++ b/kernel/src/scan/state.rs @@ -29,7 +29,7 @@ pub struct GlobalScanState { } /// this struct can be used by an engine to materialize a selection vector -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Default, Debug, Clone, PartialEq, Eq)] pub struct DvInfo { pub(crate) deletion_vector: Option, } diff --git a/kernel/src/table_changes/data_read.rs b/kernel/src/table_changes/data_read.rs index ee77dfecd..ad1aba35d 100644 --- a/kernel/src/table_changes/data_read.rs +++ b/kernel/src/table_changes/data_read.rs @@ -33,7 +33,7 @@ pub(crate) fn resolve_scan_file_dv( table_root: &Url, scan_file: CdfScanFile, ) -> DeltaResult> { - let add_dv = scan_file.add_dv.get_treemap(engine, table_root)?; + let add_dv = scan_file.dv_info.get_treemap(engine, table_root)?; let rm_dv = scan_file .remove_dv .as_ref() @@ -61,8 +61,8 @@ pub(crate) fn resolve_scan_file_dv( let adds = &rm_dv - &add_dv; let removes = add_dv - rm_dv; ( - adds.is_empty().not().then_some(adds), - removes.is_empty().not().then_some(removes), + (!adds.is_empty()).then_some(adds), + (!removes.is_empty()).then_some(removes), ) } (add_dv, None, CdfScanFileType::Add | CdfScanFileType::Cdc) => { @@ -129,14 +129,12 @@ mod tests { cardinality: 2, }); let path = "fake_path".to_string(); - let add_dv = DvInfo { deletion_vector }; - let remove_dv = Some(DvInfo { - deletion_vector: None, - }); + 
let dv_info = DvInfo { deletion_vector }; + let remove_dv = Some(Default::default()); let scan_file = CdfScanFile { scan_type: CdfScanFileType::Add, path: path.clone(), - add_dv, + dv_info, remove_dv, partition_values: HashMap::new(), commit_version, @@ -173,14 +171,12 @@ mod tests { let commit_timestamp = 1234_i64; let path = "fake_path".to_string(); - let add_dv = DvInfo { - deletion_vector: None, - }; + let dv_info = Default::default(); let remove_dv = Some(DvInfo { deletion_vector }); let scan_file = CdfScanFile { scan_type: CdfScanFileType::Add, path: path.clone(), - add_dv, + dv_info, remove_dv, partition_values: HashMap::new(), commit_version, From 8a8f6bf247759373e05b091f29f503696f4d2d82 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 11:27:19 -0800 Subject: [PATCH 21/56] shorten tests, check error cases --- kernel/src/scan/state.rs | 8 ++ kernel/src/table_changes/data_read.rs | 185 +++++++++++++++++++++----- 2 files changed, 163 insertions(+), 30 deletions(-) diff --git a/kernel/src/scan/state.rs b/kernel/src/scan/state.rs index 9bd7ec156..aa5fd5e9c 100644 --- a/kernel/src/scan/state.rs +++ b/kernel/src/scan/state.rs @@ -34,6 +34,14 @@ pub struct DvInfo { pub(crate) deletion_vector: Option, } +impl From for DvInfo { + fn from(deletion_vector: DeletionVectorDescriptor) -> Self { + DvInfo { + deletion_vector: Some(deletion_vector), + } + } +} + /// Give engines an easy way to consume stats #[derive(Debug, Clone, PartialEq, Eq, Deserialize)] #[serde(rename_all = "camelCase")] diff --git a/kernel/src/table_changes/data_read.rs b/kernel/src/table_changes/data_read.rs index ad1aba35d..bd59806f3 100644 --- a/kernel/src/table_changes/data_read.rs +++ b/kernel/src/table_changes/data_read.rs @@ -1,5 +1,3 @@ -use std::ops::Not; - use url::Url; use super::scan_file::CdfScanFileType; @@ -7,6 +5,7 @@ use crate::actions::deletion_vector::{deletion_treemap_to_bools, selection_treem use crate::table_changes::scan_file::CdfScanFile; use 
crate::{DeltaResult, Engine, Error}; +#[allow(unused)] struct ResolvedCdfScanFile { scan_file: CdfScanFile, selection_vector: Vec, @@ -28,7 +27,7 @@ struct ResolvedCdfScanFile { /// have the deletion vector read (if present), and each is converted into a [`ResolvedCdfScanFile`]. /// No changes are made to the `scan_type`. #[allow(unused)] -pub(crate) fn resolve_scan_file_dv( +fn resolve_scan_file_dv( engine: &dyn Engine, table_root: &Url, scan_file: CdfScanFile, @@ -51,7 +50,7 @@ pub(crate) fn resolve_scan_file_dv( } (_, Some(_), CdfScanFileType::Cdc) => { return Err(Error::generic( - "CdfScanFile with type cdccannot have a remove deletion vector", + "CdfScanFile with type cdc cannot have a remove deletion vector", )); } (add_dv, Some(rm_dv), CdfScanFileType::Add) => { @@ -94,24 +93,56 @@ pub(crate) fn resolve_scan_file_dv( scan_file: rm_scan_file, selection_vector, }); - Ok([adds, removes].into_iter().flatten()) + Ok([removes, adds].into_iter().flatten()) } #[cfg(test)] mod tests { - use std::{collections::HashMap, path::PathBuf}; + use std::{collections::HashMap, io::Write, path::PathBuf}; + use bytes::BufMut; use itertools::Itertools; + use roaring::RoaringTreemap; use crate::{ actions::deletion_vector::DeletionVectorDescriptor, engine::sync::SyncEngine, scan::state::DvInfo, table_changes::scan_file::{CdfScanFile, CdfScanFileType}, + Error, }; use super::resolve_scan_file_dv; + fn generate_dv(map: RoaringTreemap) -> DeletionVectorDescriptor { + let buf = Vec::new(); + let mut writer = buf.writer(); + let magic: u32 = 1681511377; + writer.write_all(&magic.to_le_bytes()).unwrap(); + map.serialize_into(&mut writer).unwrap(); + let buf = writer.into_inner(); + let inline_dv = z85::encode(&buf); + DeletionVectorDescriptor { + storage_type: "i".into(), + path_or_inline_dv: inline_dv, + offset: None, + size_in_bytes: buf.len().try_into().unwrap(), + cardinality: map.len().try_into().unwrap(), + } + } + + fn get_add_scan_file(dv_info: DvInfo, remove_dv: Option) -> 
CdfScanFile { + CdfScanFile { + scan_type: CdfScanFileType::Add, + path: "fake_path".to_string(), + dv_info, + remove_dv, + partition_values: HashMap::new(), + commit_version: 42, + commit_timestamp: 1234, + } + } + #[test] fn add_with_dv() { let engine = SyncEngine::new(); @@ -119,8 +150,6 @@ mod tests { std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); let table_root = url::Url::from_directory_path(path).unwrap(); - let commit_version = 42_i64; - let commit_timestamp = 1234_i64; let deletion_vector = Some(DeletionVectorDescriptor { storage_type: "u".to_string(), path_or_inline_dv: "vBn[lx{q8@P<9BNH/isA".to_string(), @@ -128,18 +157,9 @@ mod tests { size_in_bytes: 36, cardinality: 2, }); - let path = "fake_path".to_string(); let dv_info = DvInfo { deletion_vector }; let remove_dv = Some(Default::default()); - let scan_file = CdfScanFile { - scan_type: CdfScanFileType::Add, - path: path.clone(), - dv_info, - remove_dv, - partition_values: HashMap::new(), - commit_version, - commit_timestamp, - }; + let scan_file = get_add_scan_file(dv_info, remove_dv); // Remove: None deleted // Add: DV with 0th and 9th bit set (ie deleted) @@ -167,21 +187,10 @@ mod tests { size_in_bytes: 36, cardinality: 2, }); - let commit_version = 42_i64; - let commit_timestamp = 1234_i64; - let path = "fake_path".to_string(); let dv_info = Default::default(); let remove_dv = Some(DvInfo { deletion_vector }); - let scan_file = CdfScanFile { - scan_type: CdfScanFileType::Add, - path: path.clone(), - dv_info, - remove_dv, - partition_values: HashMap::new(), - commit_version, - commit_timestamp, - }; + let scan_file = get_add_scan_file(dv_info, remove_dv); // Remove: DV with 0th and 9th bit set (ie deleted) // Add: No rows deleted @@ -194,4 +203,120 @@ mod tests { .collect_vec(); assert_eq!(resolved, vec![(CdfScanFileType::Add, expected_sv)]); } + + #[test] + fn restore_subset() { + let engine = SyncEngine::new(); + let path = + 
std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + let table_root = url::Url::from_directory_path(path).unwrap(); + + let rm_dv = generate_dv(RoaringTreemap::from([0, 1, 4, 5])); + let add_dv = generate_dv(RoaringTreemap::from([0, 5])); + + let dv_info = DvInfo::from(add_dv); + let remove_dv = Some(DvInfo::from(rm_dv)); + let scan_file = get_add_scan_file(dv_info, remove_dv); + + let mut expected_sv = vec![false; 5]; + expected_sv[1] = true; + expected_sv[4] = true; + let resolved = resolve_scan_file_dv(&engine, &table_root, scan_file) + .unwrap() + .map(|file| (file.scan_file.scan_type, file.selection_vector)) + .collect_vec(); + assert_eq!(resolved, vec![(CdfScanFileType::Add, expected_sv)]); + } + #[test] + fn delete_subset() { + let engine = SyncEngine::new(); + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + let table_root = url::Url::from_directory_path(path).unwrap(); + + let rm_dv = generate_dv(RoaringTreemap::from([0, 5])); + let add_dv = generate_dv(RoaringTreemap::from([0, 1, 4, 5])); + + let dv_info = DvInfo::from(add_dv); + let remove_dv = Some(DvInfo::from(rm_dv)); + let scan_file = get_add_scan_file(dv_info, remove_dv); + + let mut expected_sv = vec![false; 5]; + expected_sv[1] = true; + expected_sv[4] = true; + let resolved = resolve_scan_file_dv(&engine, &table_root, scan_file) + .unwrap() + .map(|file| (file.scan_file.scan_type, file.selection_vector)) + .collect_vec(); + assert_eq!(resolved, vec![(CdfScanFileType::Remove, expected_sv)]); + } + + #[test] + fn adds_and_removes() { + let engine = SyncEngine::new(); + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + let table_root = url::Url::from_directory_path(path).unwrap(); + + let rm_dv = generate_dv(RoaringTreemap::from([0, 2])); + let add_dv = generate_dv(RoaringTreemap::from([0, 1])); + + let dv_info = DvInfo::from(add_dv); + let remove_dv = 
Some(DvInfo::from(rm_dv)); + let scan_file = get_add_scan_file(dv_info, remove_dv); + + let mut rm_sv = vec![false; 2]; + rm_sv[1] = true; + let mut add_sv = vec![false; 3]; + add_sv[2] = true; + + let resolved = resolve_scan_file_dv(&engine, &table_root, scan_file) + .unwrap() + .map(|file| (file.scan_file.scan_type, file.selection_vector)) + .collect_vec(); + assert_eq!( + resolved, + vec![ + (CdfScanFileType::Remove, rm_sv), + (CdfScanFileType::Add, add_sv) + ] + ); + } + + #[test] + fn cdc_with_remove_dv_fails() { + let engine = SyncEngine::new(); + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + let table_root = url::Url::from_directory_path(path).unwrap(); + + let rm_dv = generate_dv(RoaringTreemap::from([0, 2])); + + let remove_dv = Some(DvInfo::from(rm_dv)); + let mut scan_file = CdfScanFile { + scan_type: CdfScanFileType::Cdc, + path: "fake_path".to_string(), + dv_info: Default::default(), + remove_dv, + partition_values: HashMap::new(), + commit_version: 42, + commit_timestamp: 1234, + }; + + let expected_err = + Error::generic("CdfScanFile with type cdc cannot have a remove deletion vector"); + + let res = resolve_scan_file_dv(&engine, &table_root, scan_file.clone()) + .err() + .unwrap(); + assert_eq!(res.to_string(), expected_err.to_string()); + + scan_file.scan_type = CdfScanFileType::Remove; + let expected_err = + Error::generic("CdfScanFile with type remove cannot have a remove deletion vector"); + let res = resolve_scan_file_dv(&engine, &table_root, scan_file) + .err() + .unwrap(); + assert_eq!(res.to_string(), expected_err.to_string()); + } } From eeaabb09327d5f844c3df57e0ff0e313463d2430 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 12:16:50 -0800 Subject: [PATCH 22/56] Rename to physical_to_logical, add comment explaining the deletion/selection treemap --- kernel/src/table_changes/mod.rs | 2 +- .../{data_read.rs => physical_to_logical.rs} | 68 ++++++++++++++++--- 2 files 
changed, 60 insertions(+), 10 deletions(-) rename kernel/src/table_changes/{data_read.rs => physical_to_logical.rs} (76%) diff --git a/kernel/src/table_changes/mod.rs b/kernel/src/table_changes/mod.rs index d27454444..f045da90f 100644 --- a/kernel/src/table_changes/mod.rs +++ b/kernel/src/table_changes/mod.rs @@ -15,8 +15,8 @@ use crate::table_properties::TableProperties; use crate::utils::require; use crate::{DeltaResult, Engine, Error, Version}; -mod data_read; mod log_replay; +mod physical_to_logical; pub mod scan; mod scan_file; diff --git a/kernel/src/table_changes/data_read.rs b/kernel/src/table_changes/physical_to_logical.rs similarity index 76% rename from kernel/src/table_changes/data_read.rs rename to kernel/src/table_changes/physical_to_logical.rs index bd59806f3..59b3bf2f0 100644 --- a/kernel/src/table_changes/data_read.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -38,10 +38,6 @@ fn resolve_scan_file_dv( .as_ref() .map(|rm_dv| rm_dv.get_treemap(engine, table_root)) .transpose()?; - println!( - "initial {add_dv:?}, {rm_dv:?}, scan type: {:?}", - scan_file.scan_type - ); let (add_dv, rm_dv) = match (add_dv, rm_dv, &scan_file.scan_type) { (_, Some(_), CdfScanFileType::Remove) => { return Err(Error::generic( @@ -56,13 +52,67 @@ fn resolve_scan_file_dv( (add_dv, Some(rm_dv), CdfScanFileType::Add) => { let add_dv = add_dv.unwrap_or_else(Default::default); let rm_dv = rm_dv.unwrap_or_else(Default::default); - // Take the symmetric difference so we don't double count rows + // Here we show how deletion vectors are resolved. Note that logically the `rm_dv` is the + // beginning state of the commit, and `add_dv` is the final state of the commit. In + // other words the dv went from being `rm_dv` to `add_dv`. + // + // ===== IMPORTANT ===== + // It is important to note that `rm_dv` and `add_dv` are deletion treemaps. We specify + // two type of treemaps: + // - _Deletion_ treemaps (denoted `Treemap_d`) store the indices of deleted rows. 
+ // For instance, `Treemap_d(0, 2)` means that rows 0 and 2 are _deleted_. When + // converted to a vector of bools, it is equivalent to a deletion vector [1, 0, 1]. + // - _Selection_ treemaps (denoted `Treemap_s`) store the indices of selected rows. + // `Treemap_s(0, 1)` means that rows 0 and 1 are _selected_. This is equivalent + // to the selection vector [1, 1, 0] when converted into a boolean vector. + // + // We use a motivating example to explain the deletion vector resolution. We read + // `rm_dv` and `add_dv` from the [`CdfScanFile`]. They are initialized to the empty map + // if no deletion vector is given. + // rm_dv = Treemap_d(0, 1) + // add_dv = Treemap_d(1, 2) + // + // The result of this commit is: + // - row 0 is restored + // - row 1 is unchanged + // - row 2 is deleted + // Thus for this commit we must generate `Treemap_s(0)` for the added rows, and + // `Treemap_s(2)` for deleted rows. + // + // # Insertion Selection Treemap + // The selection vector of added rows is calculated using set subtraction over deletion + // treemaps `rm_dv - add_dv`. These rows went from set (deleted) in `rm_dv` to unset + // (restored) in the `add_dv`. All other rows are either cancelled in set subtraction, + // or were not selected in either treemap. Hence, they are not selected to + // be in the set of changed rows. Applying this to our deletion treemaps: + // rm_dv - add_dv = + // Treemap_d(0, 1) + // - Treemap_d(1, 2) + // = Treemap_s(0) + // The selection treemap shows that row 0 was inserted + // + // # Deletion Selection Treemap + // The selection vector of deleted rows is calculated using `add_dv - rm_dv`. These rows went + // from unset (present) in `rm_dv` to set (deleted) in the `add_dv`. Once again, all + // other rows are either cancelled in set subtraction, or were not set in either treemap. 
+ // Applying this to our deletion vectors: + // add_dv - rm_dv = + // Treemap_d(1, 2) + // - Treemap_d(0, 1) + // = Treemap_s(2) + // The selection treemap shows that row 2 was deleted + // + // # Conversion to Selection Vector + // The selection treemap is converted to a selection vector by setting the bits: + // Treemap_s(0) => [true] + // Treemap_s(2) => [false, false, true] + // All other rows are unselected (false). let adds = &rm_dv - &add_dv; let removes = add_dv - rm_dv; - ( - (!adds.is_empty()).then_some(adds), - (!removes.is_empty()).then_some(removes), - ) + + let adds = (!adds.is_empty()).then_some(adds); + let removes = (!removes.is_empty()).then_some(removes); + (adds, removes) } (add_dv, None, CdfScanFileType::Add | CdfScanFileType::Cdc) => { (Some(add_dv.unwrap_or_else(Default::default)), None) From fa13ade013ee1853ebfc763c383c09a1a1a316b4 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 12:19:52 -0800 Subject: [PATCH 23/56] Add note about ordinary scans for dv resolution --- kernel/src/table_changes/physical_to_logical.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index 59b3bf2f0..d2fd08fe1 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -65,6 +65,8 @@ fn resolve_scan_file_dv( // - _Selection_ treemaps (denoted `Treemap_s`) store the indices of selected rows. // `Treemap_s(0, 1)` means that rows 0 and 1 are _selected_. This is equivalent // to the selection vector [1, 1, 0] when converted into a boolean vector. + // In ordinary scans, only deletion treemaps are used. However in the case of deletion + // vector pairs, we generate selection treemaps. // // We use a motivating example to explain the deletion vector resolution. We read // `rm_dv` and `add_dv` from the [`CdfScanFile`]. 
They are initialized to the empty map From 4dabdaf65b975ddbf65120dffe678661a0cd900e Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 12:27:06 -0800 Subject: [PATCH 24/56] Add doc comment to treemap_to_bools_with --- kernel/src/actions/deletion_vector.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/src/actions/deletion_vector.rs b/kernel/src/actions/deletion_vector.rs index a87d7c014..e418a9c02 100644 --- a/kernel/src/actions/deletion_vector.rs +++ b/kernel/src/actions/deletion_vector.rs @@ -225,6 +225,8 @@ pub(crate) fn selection_treemap_to_bools(treemap: RoaringTreemap) -> Vec { treemap_to_bools_with(treemap, true) } +/// helper function to generate vectors of bools from treemap. If `set_bit` is `true`, this is +/// [`selection_treemap_to_bools`]. If `set_bit` is false, this is [`deletion_treemap_to_bools`] fn treemap_to_bools_with(treemap: RoaringTreemap, set_bit: bool) -> Vec { fn combine(high_bits: u32, low_bits: u32) -> usize { ((u64::from(high_bits) << 32) | u64::from(low_bits)) as usize From 7fcb5310e425b1736a9aeec1a5e1ebb90f12b812 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 13:29:51 -0800 Subject: [PATCH 25/56] Update resolve_scan_file_dv docs --- kernel/src/table_changes/physical_to_logical.rs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index d2fd08fe1..958417b29 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -11,20 +11,19 @@ struct ResolvedCdfScanFile { selection_vector: Vec, } -/// Resolves the deletion vectors for an [`UnresolvedCdfScanFile`]. This function handles two +/// Resolves the deletion vectors for a [`CdfScanFile`]. This function handles two /// types of `CdfScanFile`s: /// 1. The first case is a [`CdfScanFile`] paired with a remove deletion vector. 
The `scan_type` /// must be [`CdfScanFileType::Add`]. In this case, both the add and remove deletion vectors are -/// read if they exist. Then, we find the set of rows in the scan file that have been added, and -/// the set of rows that have been removed. The set of removed rows (if any) will be represented -/// by a [`ResolvedCdfScanFile`] with `scan_type` = [`CdfScanFileType::Remove`]. The set of -/// added rows (if any) will be represented by a [`ResolvedCdfScanFile`] with `scan_type` = -/// [`CdfScanFileType::Add`]. +/// read if they exist. Then, we find the set of rows in the scan file that have been added, and rows that +/// have been removed. The set of removed rows (if any) will be represented by a +/// [`ResolvedCdfScanFile`] with `scan_type` = [`CdfScanFileType::Remove`]. The set of added rows +/// (if any) will be represented by a [`ResolvedCdfScanFile`] with `scan_type` = [`CdfScanFileType::Add`]. /// -/// Note: We allow the possibility for there to be both added rows, and removed rows for a +/// Note: We allow the possibility for there to be both added rows and removed rows for a /// single add/remove pair. /// 2. The second case handles all other add, remove, and cdc [`CdfScanFile`]s. These will simply -/// have the deletion vector read (if present), and each is converted into a [`ResolvedCdfScanFile`]. +/// read the deletion vector (if present), and each is converted into a [`ResolvedCdfScanFile`]. /// No changes are made to the `scan_type`. 
#[allow(unused)] fn resolve_scan_file_dv( From 577b424331592bb598891b618c683f23932912cd Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 13:40:46 -0800 Subject: [PATCH 26/56] Add test and docs --- .../src/table_changes/physical_to_logical.rs | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index 958417b29..99915e191 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -5,6 +5,8 @@ use crate::actions::deletion_vector::{deletion_treemap_to_bools, selection_treem use crate::table_changes::scan_file::CdfScanFile; use crate::{DeltaResult, Engine, Error}; +/// A [`CdfScanFile`] with its associated `selection_vector`. The `scan_type` is resolved to +/// match the `_change_type` that its rows will have in the change data feed. #[allow(unused)] struct ResolvedCdfScanFile { scan_file: CdfScanFile, @@ -370,4 +372,28 @@ mod tests { .unwrap(); assert_eq!(res.to_string(), expected_err.to_string()); } + + #[test] + fn cdc_file_resolution() { + let engine = SyncEngine::new(); + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + let table_root = url::Url::from_directory_path(path).unwrap(); + + let scan_file = CdfScanFile { + scan_type: CdfScanFileType::Cdc, + path: "fake_path".to_string(), + dv_info: Default::default(), + remove_dv: None, + partition_values: HashMap::new(), + commit_version: 42, + commit_timestamp: 1234, + }; + + let res = resolve_scan_file_dv(&engine, &table_root, scan_file.clone()) + .unwrap() + .map(|file| (file.scan_file.scan_type, file.selection_vector)) + .collect_vec(); + // TODO + } } From a970df137e4da368c386b18a292edc0fd6d9697f Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 14:07:49 -0800 Subject: [PATCH 27/56] Add documentation --- .../src/table_changes/physical_to_logical.rs | 115 
+++++++++++------- 1 file changed, 68 insertions(+), 47 deletions(-) diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index 99915e191..a2311efa0 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -9,8 +9,14 @@ use crate::{DeltaResult, Engine, Error}; /// match the `_change_type` that its rows will have in the change data feed. #[allow(unused)] struct ResolvedCdfScanFile { + /// The scan file that holds the path to the data file to be read. The `scan_type` field is + /// resolved to the `_change_type` of the rows for this data file. scan_file: CdfScanFile, - selection_vector: Vec, + /// Optional vector of bools. If `selection_vector[i] = true`, then that row must be included + /// in the CDF output. Otherwise the row must be filtered out. The vector may be shorter than + /// the data file. In this case, all the remaining rows are *not* selected. If `selection_vector` + /// is `None`, then all rows are selected. + selection_vector: Option>, } /// Resolves the deletion vectors for a [`CdfScanFile`]. 
This function handles two @@ -134,18 +140,14 @@ fn resolve_scan_file_dv( scan_type: CdfScanFileType::Remove, ..scan_file.clone() }; - let adds = add_dv - .map(treemap_to_bools) - .map(|selection_vector| ResolvedCdfScanFile { - scan_file, - selection_vector, - }); - let removes = rm_dv - .map(treemap_to_bools) - .map(|selection_vector| ResolvedCdfScanFile { - scan_file: rm_scan_file, - selection_vector, - }); + let adds = add_dv.map(treemap_to_bools).map(|sv| ResolvedCdfScanFile { + scan_file, + selection_vector: (!sv.is_empty()).then_some(sv), + }); + let removes = rm_dv.map(treemap_to_bools).map(|sv| ResolvedCdfScanFile { + scan_file: rm_scan_file, + selection_vector: (!sv.is_empty()).then_some(sv), + }); Ok([removes, adds].into_iter().flatten()) } @@ -184,9 +186,13 @@ mod tests { } } - fn get_add_scan_file(dv_info: DvInfo, remove_dv: Option) -> CdfScanFile { + fn get_scan_file( + scan_type: CdfScanFileType, + dv_info: DvInfo, + remove_dv: Option, + ) -> CdfScanFile { CdfScanFile { - scan_type: CdfScanFileType::Add, + scan_type, path: "fake_path".to_string(), dv_info, remove_dv, @@ -212,7 +218,7 @@ mod tests { }); let dv_info = DvInfo { deletion_vector }; let remove_dv = Some(Default::default()); - let scan_file = get_add_scan_file(dv_info, remove_dv); + let scan_file = get_scan_file(CdfScanFileType::Add, dv_info, remove_dv); // Remove: None deleted // Add: DV with 0th and 9th bit set (ie deleted) @@ -223,7 +229,7 @@ mod tests { .unwrap() .map(|file| (file.scan_file.scan_type, file.selection_vector)) .collect_vec(); - assert_eq!(resolved, vec![(CdfScanFileType::Remove, expected_sv)]); + assert_eq!(resolved, vec![(CdfScanFileType::Remove, Some(expected_sv))]); } #[test] @@ -243,7 +249,7 @@ mod tests { let dv_info = Default::default(); let remove_dv = Some(DvInfo { deletion_vector }); - let scan_file = get_add_scan_file(dv_info, remove_dv); + let scan_file = get_scan_file(CdfScanFileType::Add, dv_info, remove_dv); // Remove: DV with 0th and 9th bit set (ie deleted) 
// Add: No rows deleted @@ -254,7 +260,7 @@ mod tests { .unwrap() .map(|file| (file.scan_file.scan_type, file.selection_vector)) .collect_vec(); - assert_eq!(resolved, vec![(CdfScanFileType::Add, expected_sv)]); + assert_eq!(resolved, vec![(CdfScanFileType::Add, Some(expected_sv))]); } #[test] @@ -269,7 +275,7 @@ mod tests { let dv_info = DvInfo::from(add_dv); let remove_dv = Some(DvInfo::from(rm_dv)); - let scan_file = get_add_scan_file(dv_info, remove_dv); + let scan_file = get_scan_file(CdfScanFileType::Add, dv_info, remove_dv); let mut expected_sv = vec![false; 5]; expected_sv[1] = true; @@ -278,7 +284,7 @@ mod tests { .unwrap() .map(|file| (file.scan_file.scan_type, file.selection_vector)) .collect_vec(); - assert_eq!(resolved, vec![(CdfScanFileType::Add, expected_sv)]); + assert_eq!(resolved, vec![(CdfScanFileType::Add, Some(expected_sv))]); } #[test] fn delete_subset() { @@ -292,7 +298,7 @@ mod tests { let dv_info = DvInfo::from(add_dv); let remove_dv = Some(DvInfo::from(rm_dv)); - let scan_file = get_add_scan_file(dv_info, remove_dv); + let scan_file = get_scan_file(CdfScanFileType::Add, dv_info, remove_dv); let mut expected_sv = vec![false; 5]; expected_sv[1] = true; @@ -301,7 +307,7 @@ mod tests { .unwrap() .map(|file| (file.scan_file.scan_type, file.selection_vector)) .collect_vec(); - assert_eq!(resolved, vec![(CdfScanFileType::Remove, expected_sv)]); + assert_eq!(resolved, vec![(CdfScanFileType::Remove, Some(expected_sv))]); } #[test] @@ -316,7 +322,7 @@ mod tests { let dv_info = DvInfo::from(add_dv); let remove_dv = Some(DvInfo::from(rm_dv)); - let scan_file = get_add_scan_file(dv_info, remove_dv); + let scan_file = get_scan_file(CdfScanFileType::Add, dv_info, remove_dv); let mut rm_sv = vec![false; 2]; rm_sv[1] = true; @@ -330,14 +336,14 @@ mod tests { assert_eq!( resolved, vec![ - (CdfScanFileType::Remove, rm_sv), - (CdfScanFileType::Add, add_sv) + (CdfScanFileType::Remove, Some(rm_sv)), + (CdfScanFileType::Add, Some(add_sv)) ] ); } #[test] - fn 
cdc_with_remove_dv_fails() { + fn cdc_and_remove_with_remove_dv_fails() { let engine = SyncEngine::new(); let path = std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); @@ -346,15 +352,7 @@ mod tests { let rm_dv = generate_dv(RoaringTreemap::from([0, 2])); let remove_dv = Some(DvInfo::from(rm_dv)); - let mut scan_file = CdfScanFile { - scan_type: CdfScanFileType::Cdc, - path: "fake_path".to_string(), - dv_info: Default::default(), - remove_dv, - partition_values: HashMap::new(), - commit_version: 42, - commit_timestamp: 1234, - }; + let mut scan_file = get_scan_file(CdfScanFileType::Cdc, Default::default(), remove_dv); let expected_err = Error::generic("CdfScanFile with type cdc cannot have a remove deletion vector"); @@ -380,20 +378,43 @@ mod tests { std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); let table_root = url::Url::from_directory_path(path).unwrap(); - let scan_file = CdfScanFile { - scan_type: CdfScanFileType::Cdc, - path: "fake_path".to_string(), - dv_info: Default::default(), - remove_dv: None, - partition_values: HashMap::new(), - commit_version: 42, - commit_timestamp: 1234, - }; + let scan_file = get_scan_file(CdfScanFileType::Cdc, Default::default(), None); - let res = resolve_scan_file_dv(&engine, &table_root, scan_file.clone()) + let resolved = resolve_scan_file_dv(&engine, &table_root, scan_file.clone()) + .unwrap() + .map(|file| (file.scan_file.scan_type, file.selection_vector)) + .collect_vec(); + assert_eq!(resolved, vec![(CdfScanFileType::Cdc, None),]); + } + + #[test] + fn remove_file_resolution() { + let engine = SyncEngine::new(); + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + let table_root = url::Url::from_directory_path(path).unwrap(); + + let scan_file = get_scan_file(CdfScanFileType::Remove, Default::default(), None); + + let resolved = resolve_scan_file_dv(&engine, &table_root, scan_file.clone()) + .unwrap() + 
.map(|file| (file.scan_file.scan_type, file.selection_vector)) + .collect_vec(); + assert_eq!(resolved, vec![(CdfScanFileType::Remove, None)]); + } + #[test] + fn add_file_no_dv_resolution() { + let engine = SyncEngine::new(); + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + let table_root = url::Url::from_directory_path(path).unwrap(); + + let scan_file = get_scan_file(CdfScanFileType::Add, Default::default(), None); + + let resolved = resolve_scan_file_dv(&engine, &table_root, scan_file.clone()) .unwrap() .map(|file| (file.scan_file.scan_type, file.selection_vector)) .collect_vec(); - // TODO + assert_eq!(resolved, vec![(CdfScanFileType::Add, None)]); } } From 5b70aaba706fb333ee061d183ee97ef5977af6e9 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 14:14:27 -0800 Subject: [PATCH 28/56] Rename to resolve-dvs --- kernel/src/table_changes/mod.rs | 2 +- .../table_changes/{physical_to_logical.rs => resolve_dvs.rs} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename kernel/src/table_changes/{physical_to_logical.rs => resolve_dvs.rs} (100%) diff --git a/kernel/src/table_changes/mod.rs b/kernel/src/table_changes/mod.rs index f045da90f..86ba0cc49 100644 --- a/kernel/src/table_changes/mod.rs +++ b/kernel/src/table_changes/mod.rs @@ -16,7 +16,7 @@ use crate::utils::require; use crate::{DeltaResult, Engine, Error, Version}; mod log_replay; -mod physical_to_logical; +mod resolve_dvs; pub mod scan; mod scan_file; diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/resolve_dvs.rs similarity index 100% rename from kernel/src/table_changes/physical_to_logical.rs rename to kernel/src/table_changes/resolve_dvs.rs From 3e0ead65358d85852a87dcddbb29f97d4d807a0b Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 23:07:18 -0800 Subject: [PATCH 29/56] Fixup naming and docs --- kernel/src/actions/deletion_vector.rs | 2 +- kernel/src/table_changes/resolve_dvs.rs 
| 20 ++++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/src/actions/deletion_vector.rs b/kernel/src/actions/deletion_vector.rs index e418a9c02..b0e4e1716 100644 --- a/kernel/src/actions/deletion_vector.rs +++ b/kernel/src/actions/deletion_vector.rs @@ -392,7 +392,7 @@ mod tests { } // this test is ignored by default as it's expensive to allocate such big vecs full of `true`. you can run it via: - // cargo test actions::deletion_vector::tests::test_dv_to_bools -- --ignored #[test] + // cargo test actions::deletion_vector::tests::test_dv_to_bools -- --ignored #[test] #[ignore] fn test_dv_to_bools() { diff --git a/kernel/src/table_changes/resolve_dvs.rs b/kernel/src/table_changes/resolve_dvs.rs index a2311efa0..efdbb53c3 100644 --- a/kernel/src/table_changes/resolve_dvs.rs +++ b/kernel/src/table_changes/resolve_dvs.rs @@ -64,7 +64,7 @@ fn resolve_scan_file_dv( // other words the dv went from being `rm_dv` to `add_dv`. // // ===== IMPORTANT ===== - // It is important to note that `rm_dv` and `add_dv` are deletion treemaps. We specify + // It is important to note that `rm_dv` and `add_dv` are deletion treemaps. We define // two type of treemaps: // - _Deletion_ treemaps (denoted `Treemap_d`) store the indices of deleted rows. // For instance, `Treemap_d(0, 2)` means that rows 0 and 2 are _deleted_. When @@ -76,7 +76,7 @@ fn resolve_scan_file_dv( // vector pairs, we generate selection treemaps. // // We use a motivating example to explain the deletion vector resolution. We read - // `rm_dv` and `add_dv` from the [`CdfScanFile`]. They are initialized to the empty map + // `rm_dv` and `add_dv`, and they are initialized to the empty map by default. // if no deletion vector is given. 
// rm_dv = Treemap_d(0, 1) // add_dv = Treemap_d(1, 2) @@ -169,7 +169,7 @@ mod tests { use super::resolve_scan_file_dv; - fn generate_dv(map: RoaringTreemap) -> DeletionVectorDescriptor { + fn treemap_to_dv_descriptor(map: RoaringTreemap) -> DeletionVectorDescriptor { let buf = Vec::new(); let mut writer = buf.writer(); let magic: u32 = 1681511377; @@ -270,8 +270,8 @@ mod tests { std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); let table_root = url::Url::from_directory_path(path).unwrap(); - let rm_dv = generate_dv(RoaringTreemap::from([0, 1, 4, 5])); - let add_dv = generate_dv(RoaringTreemap::from([0, 5])); + let rm_dv = treemap_to_dv_descriptor(RoaringTreemap::from([0, 1, 4, 5])); + let add_dv = treemap_to_dv_descriptor(RoaringTreemap::from([0, 5])); let dv_info = DvInfo::from(add_dv); let remove_dv = Some(DvInfo::from(rm_dv)); @@ -293,8 +293,8 @@ mod tests { std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); let table_root = url::Url::from_directory_path(path).unwrap(); - let rm_dv = generate_dv(RoaringTreemap::from([0, 5])); - let add_dv = generate_dv(RoaringTreemap::from([0, 1, 4, 5])); + let rm_dv = treemap_to_dv_descriptor(RoaringTreemap::from([0, 5])); + let add_dv = treemap_to_dv_descriptor(RoaringTreemap::from([0, 1, 4, 5])); let dv_info = DvInfo::from(add_dv); let remove_dv = Some(DvInfo::from(rm_dv)); @@ -317,8 +317,8 @@ mod tests { std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); let table_root = url::Url::from_directory_path(path).unwrap(); - let rm_dv = generate_dv(RoaringTreemap::from([0, 2])); - let add_dv = generate_dv(RoaringTreemap::from([0, 1])); + let rm_dv = treemap_to_dv_descriptor(RoaringTreemap::from([0, 2])); + let add_dv = treemap_to_dv_descriptor(RoaringTreemap::from([0, 1])); let dv_info = DvInfo::from(add_dv); let remove_dv = Some(DvInfo::from(rm_dv)); @@ -349,7 +349,7 @@ mod tests { 
std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); let table_root = url::Url::from_directory_path(path).unwrap(); - let rm_dv = generate_dv(RoaringTreemap::from([0, 2])); + let rm_dv = treemap_to_dv_descriptor(RoaringTreemap::from([0, 2])); let remove_dv = Some(DvInfo::from(rm_dv)); let mut scan_file = get_scan_file(CdfScanFileType::Cdc, Default::default(), remove_dv); From 0b207c4e30e9563ddc0fdefa7dcea167466a8ed8 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Mon, 9 Dec 2024 12:44:02 -0800 Subject: [PATCH 30/56] address pr comments --- kernel/src/scan/state.rs | 5 ++--- kernel/src/table_changes/resolve_dvs.rs | 28 +++++++++++++------------ 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/kernel/src/scan/state.rs b/kernel/src/scan/state.rs index aa5fd5e9c..12bbed552 100644 --- a/kernel/src/scan/state.rs +++ b/kernel/src/scan/state.rs @@ -36,9 +36,8 @@ pub struct DvInfo { impl From for DvInfo { fn from(deletion_vector: DeletionVectorDescriptor) -> Self { - DvInfo { - deletion_vector: Some(deletion_vector), - } + let deletion_vector = Some(deletion_vector); + DvInfo { deletion_vector } } } diff --git a/kernel/src/table_changes/resolve_dvs.rs b/kernel/src/table_changes/resolve_dvs.rs index efdbb53c3..8fdadb4d4 100644 --- a/kernel/src/table_changes/resolve_dvs.rs +++ b/kernel/src/table_changes/resolve_dvs.rs @@ -64,8 +64,7 @@ fn resolve_scan_file_dv( // other words the dv went from being `rm_dv` to `add_dv`. // // ===== IMPORTANT ===== - // It is important to note that `rm_dv` and `add_dv` are deletion treemaps. We define - // two type of treemaps: + // Both `rm_dv` and `add_dv` are deletion treemaps. We define two types of treemaps: // - _Deletion_ treemaps (denoted `Treemap_d`) store the indices of deleted rows. // For instance, `Treemap_d(0, 2)` means that rows 0 and 2 are _deleted_. When // converted to a vector of bools, it is equivalent to a deletion vector [1, 0, 1]. 
@@ -83,8 +82,8 @@ fn resolve_scan_file_dv( // // The result of this commit is: // - row 0 is restored - // - row 1 is unchanged - // - row 2 is deleted + // - row 1 is unchanged (previously deleted) + // - row 2 is newly deleted // Thus for this commit we must generate `Treemap_s(0)` for the added rows, and // `Treemap_s(2)` for deleted rows. // @@ -136,18 +135,21 @@ fn resolve_scan_file_dv( deletion_treemap_to_bools }; - let rm_scan_file = CdfScanFile { - scan_type: CdfScanFileType::Remove, - ..scan_file.clone() - }; - let adds = add_dv.map(treemap_to_bools).map(|sv| ResolvedCdfScanFile { + let resolve = |scan_file, sv: Vec| ResolvedCdfScanFile { scan_file, selection_vector: (!sv.is_empty()).then_some(sv), + }; + + let removes = rm_dv.map(treemap_to_bools).map(|sv| { + let scan_file = CdfScanFile { + scan_type: CdfScanFileType::Remove, + ..scan_file.clone() + }; + resolve(scan_file, sv) }); - let removes = rm_dv.map(treemap_to_bools).map(|sv| ResolvedCdfScanFile { - scan_file: rm_scan_file, - selection_vector: (!sv.is_empty()).then_some(sv), - }); + let adds = add_dv + .map(treemap_to_bools) + .map(|sv| resolve(scan_file, sv)); Ok([removes, adds].into_iter().flatten()) } From 3259769d6b6e650a8c0267229f4a34b3853933d4 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Mon, 9 Dec 2024 16:28:14 -0800 Subject: [PATCH 31/56] Add test comment, ignore sv to bools test --- kernel/src/actions/deletion_vector.rs | 3 +++ kernel/src/table_changes/resolve_dvs.rs | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/src/actions/deletion_vector.rs b/kernel/src/actions/deletion_vector.rs index b0e4e1716..44efb083a 100644 --- a/kernel/src/actions/deletion_vector.rs +++ b/kernel/src/actions/deletion_vector.rs @@ -414,7 +414,10 @@ mod tests { assert_eq!(bools, expected); } + // this test is ignored by default as it's expensive to allocate such big vecs full of `false`. 
you can run it via: + // cargo test actions::deletion_vector::tests::test_sv_to_bools -- --ignored #[test] + #[ignore] fn test_sv_to_bools() { let mut rb = RoaringTreemap::new(); rb.insert(0); diff --git a/kernel/src/table_changes/resolve_dvs.rs b/kernel/src/table_changes/resolve_dvs.rs index 8fdadb4d4..c8ca2f97c 100644 --- a/kernel/src/table_changes/resolve_dvs.rs +++ b/kernel/src/table_changes/resolve_dvs.rs @@ -23,7 +23,7 @@ struct ResolvedCdfScanFile { /// types of `CdfScanFile`s: /// 1. The first case is a [`CdfScanFile`] paired with a remove deletion vector. The `scan_type` /// must be [`CdfScanFileType::Add`]. In this case, both the add and remove deletion vectors are -/// read if they exist. Then, we find the set of rows in the that have been added, and rows that +/// read if they exist. Then, we find the set of rows that have been added and rows that /// have been removed. The set of removed rows (if any) will be represented by a /// [`ResolvedCdfScanFile`] with `scan_type` = [`CdfScanFileType::Remove`]. The set of added rows /// (if any) will be represented by a [`ResolvedCdfScanFile`] with `scan_type` = [`CdfScanFileType::Add`]. @@ -75,7 +75,7 @@ fn resolve_scan_file_dv( // vector pairs, we generate selection treemaps. // // We use a motivating example to explain the deletion vector resolution. We read - // `rm_dv` and `add_dv`, and they are initialized to the empty map by default. + // `rm_dv` and `add_dv`, and they are initialized to the empty map by default // if no deletion vector is given. 
// rm_dv = Treemap_d(0, 1) // add_dv = Treemap_d(1, 2) From 2e9c29e1fd553f850b8a5135f26ea9e2cb3626d0 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Mon, 9 Dec 2024 18:21:17 -0800 Subject: [PATCH 32/56] remove ignore from test --- kernel/src/actions/deletion_vector.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/src/actions/deletion_vector.rs b/kernel/src/actions/deletion_vector.rs index 44efb083a..953b73d24 100644 --- a/kernel/src/actions/deletion_vector.rs +++ b/kernel/src/actions/deletion_vector.rs @@ -414,10 +414,10 @@ mod tests { assert_eq!(bools, expected); } - // this test is ignored by default as it's expensive to allocate such big vecs full of `false`. you can run it via: - // cargo test actions::deletion_vector::tests::test_sv_to_bools -- --ignored + // Unlike [`test_dv_to_bools`], this test is not ignored because the large zero-initialized selection vector is fast to allocate. + // It just gets a bunch of empty pages from the OS. [`test_dv_to_bools`] is slow because we must + // set every element to `true`.
#[test] - #[ignore] fn test_sv_to_bools() { let mut rb = RoaringTreemap::new(); rb.insert(0); From 70fe573ed7eee6a63649fbf86984d93ce2e16814 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Fri, 6 Dec 2024 09:25:21 -0800 Subject: [PATCH 33/56] initial schema and expr for phys to log --- kernel/src/scan/mod.rs | 5 +- kernel/src/table_changes/data_read.rs | 98 +++++++++++++++++++++++++++ kernel/src/table_changes/mod.rs | 1 + 3 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 kernel/src/table_changes/data_read.rs diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 4785173f8..f03d62cc9 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -384,7 +384,10 @@ pub fn scan_row_schema() -> Schema { log_replay::SCAN_ROW_SCHEMA.as_ref().clone() } -fn parse_partition_value(raw: Option<&String>, data_type: &DataType) -> DeltaResult { +pub(crate) fn parse_partition_value( + raw: Option<&String>, + data_type: &DataType, +) -> DeltaResult { match (raw, data_type.as_primitive_opt()) { (Some(v), Some(primitive)) => primitive.parse_scalar(v), (Some(_), None) => Err(Error::generic(format!( diff --git a/kernel/src/table_changes/data_read.rs b/kernel/src/table_changes/data_read.rs new file mode 100644 index 000000000..aa94ec306 --- /dev/null +++ b/kernel/src/table_changes/data_read.rs @@ -0,0 +1,98 @@ +use std::collections::HashMap; +use std::iter; + +use crate::expressions::{column_expr, Scalar}; +use crate::scan::{parse_partition_value, ColumnType}; +use crate::schema::{ColumnName, DataType, SchemaRef, StructField, StructType}; +use crate::{DeltaResult, Error, Expression}; + +use super::scan_file::{CDFScanFile, CDFScanFileType}; + +pub(crate) fn get_generated_columns() -> DeltaResult> { + // Both in-commit timestamps and file metadata are in milliseconds + // + // See: + // [`FileMeta`] + // [In-Commit Timestamps] : https://github.com/delta-io/delta/blob/master/PROTOCOL.md#writer-requirements-for-in-commit-timestampsa + let timestamp = 
Scalar::timestamp_from_millis(self.scan_file.timestamp)?; + let commit_version: i64 = self + .scan_file + .commit_version + .try_into() + .map_err(Error::generic)?; + let cols = ["_change_type", "_commit_version", "_commit_timestamp"]; + let expressions = match self.scan_file.tpe { + CDFScanFileType::Cdc => [ + column_expr!("_change_type"), + Expression::literal(commit_version), + timestamp.into(), + ], + CDFScanFileType::Add => [ + "insert".into(), + Expression::literal(commit_version), + timestamp.into(), + ], + + CDFScanFileType::Remove => [ + "delete".into(), + Expression::literal(commit_version), + timestamp.into(), + ], + }; + let generated_columns: HashMap = cols + .iter() + .map(ToString::to_string) + .zip(expressions) + .collect(); + Ok(generated_columns) +} +fn get_expression(scan_file: &CDFScanFile) -> DeltaResult { + let mut generated_columns = get_generated_columns()?; + let all_fields = global_scan_state + .all_fields + .iter() + .map(|field| match field { + ColumnType::Partition(field_idx) => { + let field = self + .global_scan_state + .logical_schema + .fields + .get_index(*field_idx); + let Some((_, field)) = field else { + return Err(Error::generic( + "logical schema did not contain expected field, can't transform data", + )); + }; + let name = field.physical_name(global_scan_state.column_mapping_mode)?; + let value_expression = + parse_partition_value(scan_file.partition_values.get(name), field.data_type())?; + Ok(value_expression.into()) + } + ColumnType::Selected(field_name) => + // We take the expression from the map + { + Ok(generated_columns + .remove(&field_name) + .unwrap_or_else(|| ColumnName::new([field_name]).into())) + } + }) + .try_collect()?; + Ok(Expression::Struct(all_fields)) +} +fn read_schema() -> SchemaRef { + if scan_file.tpe == CDFScanFileType::Cdc { + let fields = self + .global_scan_state + .read_schema + .fields() + .cloned() + .chain(iter::once(StructField::new( + "_change_type", + DataType::STRING, + false, + ))); + 
StructType::new(fields).into() + } else { + global_scan_state.read_schema.clone() + } +} diff --git a/kernel/src/table_changes/mod.rs b/kernel/src/table_changes/mod.rs index 86ba0cc49..a675a803e 100644 --- a/kernel/src/table_changes/mod.rs +++ b/kernel/src/table_changes/mod.rs @@ -15,6 +15,7 @@ use crate::table_properties::TableProperties; use crate::utils::require; use crate::{DeltaResult, Engine, Error, Version}; +mod data_read; mod log_replay; mod resolve_dvs; pub mod scan; From 876dd1531850a59187c18e0920058db372955289 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 15:04:27 -0800 Subject: [PATCH 34/56] physical to logical transform --- kernel/src/expressions/scalars.rs | 10 +++ kernel/src/table_changes/data_read.rs | 98 --------------------------- kernel/src/table_changes/mod.rs | 2 +- kernel/src/table_changes/scan.rs | 15 ++++ 4 files changed, 26 insertions(+), 99 deletions(-) delete mode 100644 kernel/src/table_changes/data_read.rs diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs index a923f5e47..fe9ac3fd3 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -151,6 +151,16 @@ impl Scalar { pub fn is_null(&self) -> bool { matches!(self, Self::Null(_)) } + + /// Constructs a Scalar timestamp from an `i64` millisecond since unix epoch + pub fn timestamp_from_millis(millis: i64) -> DeltaResult { + let Some(timestamp) = DateTime::from_timestamp_millis(millis) else { + return Err(Error::generic(format!( + "Failed to create millisecond timestamp from {millis}" + ))); + }; + Ok(Self::Timestamp(timestamp.timestamp_micros())) + } } impl Display for Scalar { diff --git a/kernel/src/table_changes/data_read.rs b/kernel/src/table_changes/data_read.rs deleted file mode 100644 index aa94ec306..000000000 --- a/kernel/src/table_changes/data_read.rs +++ /dev/null @@ -1,98 +0,0 @@ -use std::collections::HashMap; -use std::iter; - -use crate::expressions::{column_expr, Scalar}; -use 
crate::scan::{parse_partition_value, ColumnType}; -use crate::schema::{ColumnName, DataType, SchemaRef, StructField, StructType}; -use crate::{DeltaResult, Error, Expression}; - -use super::scan_file::{CDFScanFile, CDFScanFileType}; - -pub(crate) fn get_generated_columns() -> DeltaResult> { - // Both in-commit timestamps and file metadata are in milliseconds - // - // See: - // [`FileMeta`] - // [In-Commit Timestamps] : https://github.com/delta-io/delta/blob/master/PROTOCOL.md#writer-requirements-for-in-commit-timestampsa - let timestamp = Scalar::timestamp_from_millis(self.scan_file.timestamp)?; - let commit_version: i64 = self - .scan_file - .commit_version - .try_into() - .map_err(Error::generic)?; - let cols = ["_change_type", "_commit_version", "_commit_timestamp"]; - let expressions = match self.scan_file.tpe { - CDFScanFileType::Cdc => [ - column_expr!("_change_type"), - Expression::literal(commit_version), - timestamp.into(), - ], - CDFScanFileType::Add => [ - "insert".into(), - Expression::literal(commit_version), - timestamp.into(), - ], - - CDFScanFileType::Remove => [ - "delete".into(), - Expression::literal(commit_version), - timestamp.into(), - ], - }; - let generated_columns: HashMap = cols - .iter() - .map(ToString::to_string) - .zip(expressions) - .collect(); - Ok(generated_columns) -} -fn get_expression(scan_file: &CDFScanFile) -> DeltaResult { - let mut generated_columns = get_generated_columns()?; - let all_fields = global_scan_state - .all_fields - .iter() - .map(|field| match field { - ColumnType::Partition(field_idx) => { - let field = self - .global_scan_state - .logical_schema - .fields - .get_index(*field_idx); - let Some((_, field)) = field else { - return Err(Error::generic( - "logical schema did not contain expected field, can't transform data", - )); - }; - let name = field.physical_name(global_scan_state.column_mapping_mode)?; - let value_expression = - parse_partition_value(scan_file.partition_values.get(name), field.data_type())?; - 
Ok(value_expression.into()) - } - ColumnType::Selected(field_name) => - // We take the expression from the map - { - Ok(generated_columns - .remove(&field_name) - .unwrap_or_else(|| ColumnName::new([field_name]).into())) - } - }) - .try_collect()?; - Ok(Expression::Struct(all_fields)) -} -fn read_schema() -> SchemaRef { - if scan_file.tpe == CDFScanFileType::Cdc { - let fields = self - .global_scan_state - .read_schema - .fields() - .cloned() - .chain(iter::once(StructField::new( - "_change_type", - DataType::STRING, - false, - ))); - StructType::new(fields).into() - } else { - global_scan_state.read_schema.clone() - } -} diff --git a/kernel/src/table_changes/mod.rs b/kernel/src/table_changes/mod.rs index a675a803e..b5cf6f695 100644 --- a/kernel/src/table_changes/mod.rs +++ b/kernel/src/table_changes/mod.rs @@ -15,8 +15,8 @@ use crate::table_properties::TableProperties; use crate::utils::require; use crate::{DeltaResult, Engine, Error, Version}; -mod data_read; mod log_replay; +mod physical_to_logical; mod resolve_dvs; pub mod scan; mod scan_file; diff --git a/kernel/src/table_changes/scan.rs b/kernel/src/table_changes/scan.rs index 87fd918f9..d137d8e7c 100644 --- a/kernel/src/table_changes/scan.rs +++ b/kernel/src/table_changes/scan.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use itertools::Itertools; use tracing::debug; +use crate::scan::state::GlobalScanState; use crate::scan::ColumnType; use crate::schema::{SchemaRef, StructType}; use crate::{DeltaResult, Engine, ExpressionRef}; @@ -188,6 +189,20 @@ impl TableChangesScan { let schema = self.table_changes.end_snapshot.schema().clone().into(); table_changes_action_iter(engine, commits, schema, self.predicate.clone()) } + + /// Get global state that is valid for the entire scan. This is somewhat expensive so should + /// only be called once per scan. 
+ #[allow(unused)] + fn global_scan_state(&self) -> GlobalScanState { + let end_snapshot = &self.table_changes.end_snapshot; + GlobalScanState { + table_root: self.table_changes.table_root.to_string(), + partition_columns: end_snapshot.metadata().partition_columns.clone(), + logical_schema: self.logical_schema.clone(), + read_schema: self.physical_schema.clone(), + column_mapping_mode: end_snapshot.column_mapping_mode, + } + } } #[cfg(test)] From 35b38d47e790e2a60d235f6e0744794881f17e99 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 15:32:22 -0800 Subject: [PATCH 35/56] logical to physical --- .../src/table_changes/physical_to_logical.rs | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 kernel/src/table_changes/physical_to_logical.rs diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs new file mode 100644 index 000000000..db607f4d9 --- /dev/null +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -0,0 +1,76 @@ +use std::collections::HashMap; +use std::iter; + +use itertools::Itertools; + +use crate::expressions::{column_expr, Scalar}; +use crate::scan::state::GlobalScanState; +use crate::scan::{parse_partition_value, ColumnType}; +use crate::schema::{ColumnName, DataType, SchemaRef, StructField, StructType}; +use crate::{DeltaResult, Error, Expression}; + +use super::scan_file::{CdfScanFile, CdfScanFileType}; + +#[allow(unused)] +fn get_generated_columns(scan_file: &CdfScanFile) -> DeltaResult> { + let timestamp = Scalar::timestamp_from_millis(scan_file.commit_timestamp)?; + let version = scan_file.commit_version; + let change_type: Expression = match scan_file.scan_type { + CdfScanFileType::Cdc => column_expr!("_change_type"), + CdfScanFileType::Add => "insert".into(), + + CdfScanFileType::Remove => "delete".into(), + }; + let expressions = [ + ("_change_type".to_string(), change_type), + ("_commit_version".to_string(), Expression::literal(version)), + 
("_commit_timestamp".to_string(), timestamp.into()), + ]; + Ok(expressions.into_iter().collect()) +} + +#[allow(unused)] +fn get_expression( + scan_file: &CdfScanFile, + global_state: GlobalScanState, + all_fields: &[ColumnType], +) -> DeltaResult { + let mut generated_columns = get_generated_columns(scan_file)?; + let all_fields = all_fields + .iter() + .map(|field| match field { + ColumnType::Partition(field_idx) => { + let field = global_state.logical_schema.fields.get_index(*field_idx); + let Some((_, field)) = field else { + return Err(Error::generic( + "logical schema did not contain expected field, can't transform data", + )); + }; + let name = field.physical_name(); + let value_expression = + parse_partition_value(scan_file.partition_values.get(name), field.data_type())?; + Ok(value_expression.into()) + } + ColumnType::Selected(field_name) => { + // Remove to take ownership + let generated_column = generated_columns.remove(field_name); + Ok(generated_column.unwrap_or_else(|| ColumnName::new([field_name]).into())) + } + }) + .try_collect()?; + Ok(Expression::Struct(all_fields)) +} +#[allow(unused)] +fn read_schema(scan_file: &CdfScanFile, global_scan_state: GlobalScanState) -> SchemaRef { + if scan_file.scan_type == CdfScanFileType::Cdc { + let change_type = StructField::new("_change_type", DataType::STRING, false); + let fields = global_scan_state + .read_schema + .fields() + .cloned() + .chain(iter::once(change_type)); + StructType::new(fields).into() + } else { + global_scan_state.read_schema.clone() + } +} From dfdc49186632381043d484692005fa117fae10f2 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 15:35:06 -0800 Subject: [PATCH 36/56] remove to_string from generated columns --- kernel/src/table_changes/physical_to_logical.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index db607f4d9..644087df1 100644 --- 
a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -12,7 +12,7 @@ use crate::{DeltaResult, Error, Expression}; use super::scan_file::{CdfScanFile, CdfScanFileType}; #[allow(unused)] -fn get_generated_columns(scan_file: &CdfScanFile) -> DeltaResult> { +fn get_generated_columns(scan_file: &CdfScanFile) -> DeltaResult> { let timestamp = Scalar::timestamp_from_millis(scan_file.commit_timestamp)?; let version = scan_file.commit_version; let change_type: Expression = match scan_file.scan_type { @@ -22,9 +22,9 @@ fn get_generated_columns(scan_file: &CdfScanFile) -> DeltaResult "delete".into(), }; let expressions = [ - ("_change_type".to_string(), change_type), - ("_commit_version".to_string(), Expression::literal(version)), - ("_commit_timestamp".to_string(), timestamp.into()), + ("_change_type", change_type), + ("_commit_version", Expression::literal(version)), + ("_commit_timestamp", timestamp.into()), ]; Ok(expressions.into_iter().collect()) } @@ -53,7 +53,7 @@ fn get_expression( } ColumnType::Selected(field_name) => { // Remove to take ownership - let generated_column = generated_columns.remove(field_name); + let generated_column = generated_columns.remove(field_name.as_str()); Ok(generated_column.unwrap_or_else(|| ColumnName::new([field_name]).into())) } }) From 020a19d460c2e52fed7e1fcacfc53a89ad0daf21 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 17:27:19 -0800 Subject: [PATCH 37/56] Add read phase and a test --- kernel/src/scan/mod.rs | 4 +- .../src/table_changes/physical_to_logical.rs | 135 +++++++++++++++++- kernel/src/table_changes/resolve_dvs.rs | 8 +- kernel/src/table_changes/scan.rs | 38 ++++- .../_delta_log/00000000000000000000.json | 3 + .../_delta_log/00000000000000000001.json | 2 + .../_delta_log/00000000000000000002.json | 2 + ...r_61d16c75-6994-46b7-a15b-8b538852e50e.bin | Bin 0 -> 45 bytes ...4e51-827b-c3d5516560ca-c000.snappy.parquet | Bin 0 -> 635 bytes 9 files changed, 181 
insertions(+), 11 deletions(-) create mode 100644 kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000000.json create mode 100644 kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000001.json create mode 100644 kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000002.json create mode 100644 kernel/tests/data/table-with-cdf-and-dv/deletion_vector_61d16c75-6994-46b7-a15b-8b538852e50e.bin create mode 100644 kernel/tests/data/table-with-cdf-and-dv/part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index f03d62cc9..41f65e619 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -125,7 +125,7 @@ pub struct ScanResult { pub raw_data: DeltaResult>, /// Raw row mask. // TODO(nick) this should be allocated by the engine - raw_mask: Option>, + pub(crate) raw_mask: Option>, } impl ScanResult { @@ -160,7 +160,7 @@ impl ScanResult { /// store the name of the column, as that's all that's needed during the actual query. For /// `Partition` we store an index into the logical schema for this query since later we need the /// data type as well to materialize the partition column. 
-#[derive(PartialEq, Debug)] +#[derive(Clone, PartialEq, Debug)] pub enum ColumnType { // A column, selected from the data, as is Selected(String), diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index 644087df1..6a2ecee1c 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -2,13 +2,16 @@ use std::collections::HashMap; use std::iter; use itertools::Itertools; +use url::Url; +use crate::actions::deletion_vector::split_vector; use crate::expressions::{column_expr, Scalar}; use crate::scan::state::GlobalScanState; -use crate::scan::{parse_partition_value, ColumnType}; +use crate::scan::{parse_partition_value, ColumnType, ScanResult}; use crate::schema::{ColumnName, DataType, SchemaRef, StructField, StructType}; -use crate::{DeltaResult, Error, Expression}; +use crate::{DeltaResult, Engine, Error, Expression, ExpressionRef, FileMeta}; +use super::resolve_dvs::ResolvedCdfScanFile; use super::scan_file::{CdfScanFile, CdfScanFileType}; #[allow(unused)] @@ -32,7 +35,7 @@ fn get_generated_columns(scan_file: &CdfScanFile) -> DeltaResult DeltaResult { let mut generated_columns = get_generated_columns(scan_file)?; @@ -61,7 +64,7 @@ fn get_expression( Ok(Expression::Struct(all_fields)) } #[allow(unused)] -fn read_schema(scan_file: &CdfScanFile, global_scan_state: GlobalScanState) -> SchemaRef { +fn read_schema(scan_file: &CdfScanFile, global_scan_state: &GlobalScanState) -> SchemaRef { if scan_file.scan_type == CdfScanFileType::Cdc { let change_type = StructField::new("_change_type", DataType::STRING, false); let fields = global_scan_state @@ -74,3 +77,127 @@ fn read_schema(scan_file: &CdfScanFile, global_scan_state: GlobalScanState) -> S global_scan_state.read_schema.clone() } } + +pub(crate) fn read_scan_data( + engine: &dyn Engine, + resolved_scan_file: ResolvedCdfScanFile, + global_state: &GlobalScanState, + all_fields: &[ColumnType], + predicate: 
Option, +) -> DeltaResult>> { + let ResolvedCdfScanFile { + scan_file, + mut selection_vector, + } = resolved_scan_file; + + let expression = get_expression(&scan_file, global_state, all_fields)?; + let schema = read_schema(&scan_file, global_state); + let evaluator = engine.get_expression_handler().get_evaluator( + schema.clone(), + expression, + global_state.logical_schema.clone().into(), + ); + + let table_root = Url::parse(&global_state.table_root)?; + let location = table_root.join(&scan_file.path)?; + let file = FileMeta { + last_modified: 0, + size: 0, + location, + }; + let read_result_iter = + engine + .get_parquet_handler() + .read_parquet_files(&[file], schema, predicate)?; + + let result = read_result_iter.map(move |batch| -> DeltaResult<_> { + let batch = batch?; + // to transform the physical data into the correct logical form + let logical = evaluator.evaluate(batch.as_ref()); + let len = logical.as_ref().map_or(0, |res| res.len()); + // need to split the dv_mask. what's left in dv_mask covers this result, and rest + // will cover the following results. we `take()` out of `selection_vector` to avoid + // trying to return a captured variable. 
We're going to reassign `selection_vector` + // to `rest` in a moment anyway + let mut sv = selection_vector.take(); + let rest = split_vector(sv.as_mut(), len, None); + let result = ScanResult { + raw_data: logical, + raw_mask: sv, + }; + selection_vector = rest; + Ok(result) + }); + Ok(result) +} + +#[cfg(test)] +mod tests { + + use std::{collections::HashMap, error, sync::Arc}; + + use arrow::compute::filter_record_batch; + use arrow_array::RecordBatch; + use arrow_cast::pretty::pretty_format_batches; + use itertools::Itertools; + + use crate::engine::arrow_data::ArrowEngineData; + use crate::engine::default::executor::tokio::TokioBackgroundExecutor; + use crate::engine::default::DefaultEngine; + use crate::{DeltaResult, Error, Table}; + + #[test] + fn basic_cdf() -> Result<(), Box> { + let table = Table::try_from_uri("tests/data/table-with-cdf-and-dv")?; + let options = HashMap::from([("skip_signature", "true".to_string())]); + let engine = Arc::new(DefaultEngine::try_new( + table.location(), + options, + Arc::new(TokioBackgroundExecutor::new()), + )?); + let table_changes = table.table_changes(engine.as_ref(), 0, None)?; + + let x = table_changes.into_scan_builder().build()?; + let batches: Vec = x + .execute(engine)? + .map(|scan_result| -> DeltaResult<_> { + let scan_result = scan_result?; + let mask = scan_result.full_mask(); + let data = scan_result.raw_data?; + let record_batch: RecordBatch = data + .into_any() + .downcast::() + .map_err(|_| Error::engine_data_type("ArrowEngineData".to_string()))? + .into(); + if let Some(mask) = mask { + Ok(filter_record_batch(&record_batch, &mask.into())?) 
+ } else { + Ok(record_batch) + } + }) + .try_collect()?; + let formatted = pretty_format_batches(&batches)?.to_string(); + let expected = concat!( + "+-------+--------------+-----------------+--------------------------+\n", + "| value | _change_type | _commit_version | _commit_timestamp |\n", + "+-------+--------------+-----------------+--------------------------+\n", + "| 0 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", + "| 1 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", + "| 2 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", + "| 3 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", + "| 4 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", + "| 5 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", + "| 6 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", + "| 7 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", + "| 8 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", + "| 9 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", + "| 0 | delete | 1 | 1970-01-21T01:35:06.498Z |\n", + "| 9 | delete | 1 | 1970-01-21T01:35:06.498Z |\n", + "| 0 | insert | 2 | 1970-01-21T01:35:06.498Z |\n", + "| 9 | insert | 2 | 1970-01-21T01:35:06.498Z |\n", + "+-------+--------------+-----------------+--------------------------+" + ); + assert_eq!(expected, formatted); + Ok(()) + } +} diff --git a/kernel/src/table_changes/resolve_dvs.rs b/kernel/src/table_changes/resolve_dvs.rs index c8ca2f97c..6f79df000 100644 --- a/kernel/src/table_changes/resolve_dvs.rs +++ b/kernel/src/table_changes/resolve_dvs.rs @@ -8,15 +8,15 @@ use crate::{DeltaResult, Engine, Error}; /// A [`CdfScanFile`] with its associated `selection_vector`. The `scan_type` is resolved to /// match the `_change_type` that its rows will have in the change data feed. #[allow(unused)] -struct ResolvedCdfScanFile { +pub(crate) struct ResolvedCdfScanFile { /// The scan file that holds the path the data file to be read. The `scan_type` field is /// resolved to the `_change_type` of the rows for this data file. 
- scan_file: CdfScanFile, + pub(crate) scan_file: CdfScanFile, /// Optional vector of bools. If `selection_vector[i] = true`, then that row must be included /// in the CDF output. Otherwise the row must be filtered out. The vector may be shorter than /// the data file. In this case, all the remaining rows are *not* selected. If `selection_vector` /// is `None`, then all rows are selected. - selection_vector: Option>, + pub(crate) selection_vector: Option>, } /// Resolves the deletion vectors for a [`CdfScanFile`]. This function handles two @@ -34,7 +34,7 @@ struct ResolvedCdfScanFile { /// read the deletion vector (if present), and each is converted into a [`ResolvedCdfScanFile`]. /// No changes are made to the `scan_type`. #[allow(unused)] -fn resolve_scan_file_dv( +pub(crate) fn resolve_scan_file_dv( engine: &dyn Engine, table_root: &Url, scan_file: CdfScanFile, diff --git a/kernel/src/table_changes/scan.rs b/kernel/src/table_changes/scan.rs index d137d8e7c..247483f26 100644 --- a/kernel/src/table_changes/scan.rs +++ b/kernel/src/table_changes/scan.rs @@ -4,11 +4,14 @@ use itertools::Itertools; use tracing::debug; use crate::scan::state::GlobalScanState; -use crate::scan::ColumnType; +use crate::scan::{ColumnType, ScanResult}; use crate::schema::{SchemaRef, StructType}; use crate::{DeltaResult, Engine, ExpressionRef}; use super::log_replay::{table_changes_action_iter, TableChangesScanData}; +use super::physical_to_logical::read_scan_data; +use super::resolve_dvs::resolve_scan_file_dv; +use super::scan_file::scan_data_to_scan_file; use super::{TableChanges, CDF_FIELDS}; /// The result of building a [`TableChanges`] scan over a table. 
This can be used to get a change @@ -203,6 +206,39 @@ impl TableChangesScan { column_mapping_mode: end_snapshot.column_mapping_mode, } } + + pub fn execute( + &self, + engine: Arc, + ) -> DeltaResult>> { + let scan_data = self.scan_data(engine.clone())?; + let scan_files = scan_data_to_scan_file(scan_data); + + let global_scan_state = self.global_scan_state(); + let table_root = self.table_changes.table_root().clone(); + let all_fields = self.all_fields.clone(); + let predicate = self.predicate.clone(); + let dv_engine_ref = engine.clone(); + + let result = scan_files + .map(move |scan_file| { + resolve_scan_file_dv(dv_engine_ref.as_ref(), &table_root, scan_file?) + }) // Iterator-Result-Iterator + .flatten_ok() // Iterator-Result + .map(move |resolved_scan_file| -> DeltaResult<_> { + read_scan_data( + engine.as_ref(), + resolved_scan_file?, + &global_scan_state, + &all_fields, + predicate.clone(), + ) + }) // Iterator-Result-Iterator-Result + .flatten_ok() // Iterator-Result-Result + .map(|x| x?); // Iterator-Result + + Ok(result) + } } #[cfg(test)] diff --git a/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000000.json b/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000000.json new file mode 100644 index 000000000..4ab50e731 --- /dev/null +++ b/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}} +{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableChangeDataFeed":"true","delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}} 
+{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000001.json b/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000001.json new file mode 100644 index 000000000..6ddbf539f --- /dev/null +++ b/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000001.json @@ -0,0 +1,2 @@ +{"remove":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","deletionTimestamp":1677811194426,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":635,"tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":false}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}} diff --git a/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000002.json b/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000002.json new file mode 100644 index 000000000..8e39274e6 --- /dev/null +++ 
b/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000002.json @@ -0,0 +1,2 @@ +{"remove":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","deletionTimestamp":1677811194426,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":635,"tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}} +{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":false}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/table-with-cdf-and-dv/deletion_vector_61d16c75-6994-46b7-a15b-8b538852e50e.bin b/kernel/tests/data/table-with-cdf-and-dv/deletion_vector_61d16c75-6994-46b7-a15b-8b538852e50e.bin new file mode 100644 index 0000000000000000000000000000000000000000..f1a01e661cdcca08ff5d67e7d2de53381980735a GIT binary patch literal 45 lcmZQ%U|>+Wc-b+>@u6oHnRZaa}xk;gAOoS z#<6}t%0rSCNt@&m3E|*~#Cc*mq6q?Bjf0sEC&T3yyEjN{9KCms%M}*aS7r27rN~Tj zzA#1W7Im$F+m7qFUCSlpIkYWjA7Cc8`45BN8(r(ouK2DKhm&oqokhKfIaLJUzYWIu zPlHKl678-<$u_Z>3nwp@5?4qBej=(UKN%Q>#iA`8S!W3Kv+Rn6JI+Zl%15 zS5`$GRbJ1F6QviWH~GBwGEAG$c3l+NBa^IBOI45~tF^{Z6NZvi&--82o2)mRFB=fg z4w&qQd5e|0q&+nA%=X0kY0=qF(izCXnGE_3#gJBXUG{z7KkJ-?b)pv?9F9Xjj^mLU lg%~U_#xTV3XgCPMU~!JGNsB^M@u}mw^bwBeg)ZpZ{R3QVoUi}@ literal 0 HcmV?d00001 From 8a923791517a40a80f94bfa76f7b7a3002e606a1 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 18:27:40 -0800 Subject: [PATCH 38/56] factor out test 
--- .../src/table_changes/physical_to_logical.rs | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index 6a2ecee1c..9d5746052 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -144,18 +144,21 @@ mod tests { use crate::engine::arrow_data::ArrowEngineData; use crate::engine::default::executor::tokio::TokioBackgroundExecutor; use crate::engine::default::DefaultEngine; - use crate::{DeltaResult, Error, Table}; - - #[test] - fn basic_cdf() -> Result<(), Box> { - let table = Table::try_from_uri("tests/data/table-with-cdf-and-dv")?; + use crate::{DeltaResult, Error, Table, Version}; + + fn read_cdf_for_table( + path: impl AsRef, + start_version: Version, + end_version: impl Into>, + ) -> DeltaResult { + let table = Table::try_from_uri(path)?; let options = HashMap::from([("skip_signature", "true".to_string())]); let engine = Arc::new(DefaultEngine::try_new( table.location(), options, Arc::new(TokioBackgroundExecutor::new()), )?); - let table_changes = table.table_changes(engine.as_ref(), 0, None)?; + let table_changes = table.table_changes(engine.as_ref(), start_version, end_version)?; let x = table_changes.into_scan_builder().build()?; let batches: Vec = x @@ -177,6 +180,12 @@ mod tests { }) .try_collect()?; let formatted = pretty_format_batches(&batches)?.to_string(); + Ok(formatted) + } + + #[test] + fn basic_cdf() -> Result<(), Box> { + let cdf = read_cdf_for_table("tests/data/table-with-cdf-and-dv", 0, None)?; let expected = concat!( "+-------+--------------+-----------------+--------------------------+\n", "| value | _change_type | _commit_version | _commit_timestamp |\n", @@ -197,7 +206,7 @@ mod tests { "| 9 | insert | 2 | 1970-01-21T01:35:06.498Z |\n", "+-------+--------------+-----------------+--------------------------+" ); - assert_eq!(expected, formatted); + 
assert_eq!(expected, cdf); Ok(()) } } From 3d2e53a9cf766a6b70f3304c810cd33ea0305fdf Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 18:40:11 -0800 Subject: [PATCH 39/56] Add cdf test and text --- .../src/table_changes/physical_to_logical.rs | 37 +++++++++++++++++- ...-bf66-fc2a968c4feb.c000.snappy.parquet.crc | Bin 0 -> 28 bytes ...-9f56-d45f47d5dea5.c000.snappy.parquet.crc | Bin 0 -> 28 bytes ...-ae43-5bf5b4c36a3d-c000.snappy.parquet.crc | Bin 0 -> 24 bytes ...-b674-deb4d1b82aee-c000.snappy.parquet.crc | Bin 0 -> 24 bytes ...-a847-cc5d1415f35d.c000.snappy.parquet.crc | Bin 0 -> 28 bytes ...-bf76-1987f87901f1-c000.snappy.parquet.crc | Bin 0 -> 24 bytes ...-a1bc-7a81e4a5ddce-c000.snappy.parquet.crc | Bin 0 -> 24 bytes ...-be02-282f3629694c.c000.snappy.parquet.crc | Bin 0 -> 28 bytes ...-89d4-0d73a5d5b971.c000.snappy.parquet.crc | Bin 0 -> 28 bytes ...-8ea5-ddd162a84e94.c000.snappy.parquet.crc | Bin 0 -> 28 bytes ...-a2df-a50985051257-c000.snappy.parquet.crc | Bin 0 -> 24 bytes ...-a93f-c1bf180e7008-c000.snappy.parquet.crc | Bin 0 -> 24 bytes ...-94e2-e7987a580b6a-c000.snappy.parquet.crc | Bin 0 -> 24 bytes ...-be61-76396629a546-c000.snappy.parquet.crc | Bin 0 -> 24 bytes ...-acd1-d2642d1a778c-c000.snappy.parquet.crc | Bin 0 -> 24 bytes ...-8dbc-812b7274a4e5-c000.snappy.parquet.crc | Bin 0 -> 24 bytes ...-9288-5a6503bac41b-c000.snappy.parquet.crc | Bin 0 -> 24 bytes ...-931d-168b2821adca-c000.snappy.parquet.crc | Bin 0 -> 24 bytes ...-8239-f53ca8761e0e.c000.snappy.parquet.crc | Bin 0 -> 28 bytes ...-9a42-c551810ffef9.c000.snappy.parquet.crc | Bin 0 -> 28 bytes ...-b0ab-5b9937a8bd12.c000.snappy.parquet.crc | Bin 0 -> 28 bytes ...-9053-48dc314be509.c000.snappy.parquet.crc | Bin 0 -> 28 bytes ...-8e90-e33319ebf581.c000.snappy.parquet.crc | Bin 0 -> 28 bytes ...-aefb-cd68c1672e02.c000.snappy.parquet.crc | Bin 0 -> 28 bytes ...-a938-58954786d23f.c000.snappy.parquet.crc | Bin 0 -> 28 bytes ...4651-8239-f53ca8761e0e.c000.snappy.parquet | Bin 0 -> 2459 
bytes ...4ee9-9a42-c551810ffef9.c000.snappy.parquet | Bin 0 -> 2242 bytes ...4293-b0ab-5b9937a8bd12.c000.snappy.parquet | Bin 0 -> 2442 bytes ...4ccc-9053-48dc314be509.c000.snappy.parquet | Bin 0 -> 2451 bytes ...4dbf-8e90-e33319ebf581.c000.snappy.parquet | Bin 0 -> 2444 bytes ...46d1-aefb-cd68c1672e02.c000.snappy.parquet | Bin 0 -> 2437 bytes ...47f1-a938-58954786d23f.c000.snappy.parquet | Bin 0 -> 2444 bytes .../_delta_log/.00000000000000000000.json.crc | Bin 0 -> 68 bytes .../_delta_log/.00000000000000000001.json.crc | Bin 0 -> 36 bytes .../_delta_log/.00000000000000000002.json.crc | Bin 0 -> 36 bytes .../_delta_log/.00000000000000000003.json.crc | Bin 0 -> 16 bytes .../_delta_log/.00000000000000000004.json.crc | Bin 0 -> 24 bytes .../_delta_log/00000000000000000000.json | 13 ++++++ .../_delta_log/00000000000000000001.json | 10 +++++ .../_delta_log/00000000000000000002.json | 10 +++++ .../_delta_log/00000000000000000003.json | 3 ++ .../_delta_log/00000000000000000004.json | 3 ++ ...4b6f-bf66-fc2a968c4feb.c000.snappy.parquet | Bin 0 -> 2188 bytes ...41b6-9f56-d45f47d5dea5.c000.snappy.parquet | Bin 0 -> 2175 bytes ...456d-ae43-5bf5b4c36a3d-c000.snappy.parquet | Bin 0 -> 1958 bytes ...4613-b674-deb4d1b82aee-c000.snappy.parquet | Bin 0 -> 1965 bytes ...4ae7-a847-cc5d1415f35d.c000.snappy.parquet | Bin 0 -> 2182 bytes ...4166-bf76-1987f87901f1-c000.snappy.parquet | Bin 0 -> 1958 bytes ...43be-a1bc-7a81e4a5ddce-c000.snappy.parquet | Bin 0 -> 1951 bytes ...40da-be02-282f3629694c.c000.snappy.parquet | Bin 0 -> 2175 bytes ...41f0-89d4-0d73a5d5b971.c000.snappy.parquet | Bin 0 -> 2175 bytes ...4459-8ea5-ddd162a84e94.c000.snappy.parquet | Bin 0 -> 2168 bytes ...47af-a2df-a50985051257-c000.snappy.parquet | Bin 0 -> 1958 bytes ...4703-a93f-c1bf180e7008-c000.snappy.parquet | Bin 0 -> 1958 bytes ...4462-94e2-e7987a580b6a-c000.snappy.parquet | Bin 0 -> 1965 bytes ...4dc4-be61-76396629a546-c000.snappy.parquet | Bin 0 -> 1958 bytes ...445d-acd1-d2642d1a778c-c000.snappy.parquet | 
Bin 0 -> 1971 bytes ...4891-8dbc-812b7274a4e5-c000.snappy.parquet | Bin 0 -> 1972 bytes ...4ab2-9288-5a6503bac41b-c000.snappy.parquet | Bin 0 -> 1951 bytes ...4a23-931d-168b2821adca-c000.snappy.parquet | Bin 0 -> 1958 bytes ...-a497-6969cdf3966c.c000.snappy.parquet.crc | Bin 0 -> 20 bytes ...-90bd-9416b10ba6a6.c000.snappy.parquet.crc | Bin 0 -> 20 bytes ...-bf85-3b9f5027578c.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...4fc0-a497-6969cdf3966c.c000.snappy.parquet | Bin 0 -> 1028 bytes ...41e1-90bd-9416b10ba6a6.c000.snappy.parquet | Bin 0 -> 1028 bytes ...42c6-bf85-3b9f5027578c.c000.snappy.parquet | Bin 0 -> 1021 bytes ...-8c41-71e38c07fdc2.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...-b7e4-90bf8d04898e.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...-aa88-7d5f5228d781.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...4b10-8c41-71e38c07fdc2.c000.snappy.parquet | Bin 0 -> 1021 bytes ...4f3e-b7e4-90bf8d04898e.c000.snappy.parquet | Bin 0 -> 1021 bytes ...4713-aa88-7d5f5228d781.c000.snappy.parquet | Bin 0 -> 1014 bytes ...-a62e-2ecc8dc24035.c000.snappy.parquet.crc | Bin 0 -> 20 bytes ...-af76-3b32bab79832.c000.snappy.parquet.crc | Bin 0 -> 20 bytes ...-a9c8-05c1d4f79d6a.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...470a-a62e-2ecc8dc24035.c000.snappy.parquet | Bin 0 -> 1034 bytes ...406f-af76-3b32bab79832.c000.snappy.parquet | Bin 0 -> 1028 bytes ...4533-a9c8-05c1d4f79d6a.c000.snappy.parquet | Bin 0 -> 1021 bytes ...-9739-fc9d4db24308.c000.snappy.parquet.crc | Bin 0 -> 20 bytes ...-b2cf-91e882f4c500.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...-b0ac-e70b1e2115b1.c000.snappy.parquet.crc | Bin 0 -> 20 bytes ...-99ed-062c0a337c29.c000.snappy.parquet.crc | Bin 0 -> 20 bytes ...4d37-9739-fc9d4db24308.c000.snappy.parquet | Bin 0 -> 1041 bytes ...44af-b2cf-91e882f4c500.c000.snappy.parquet | Bin 0 -> 971 bytes ...4227-b0ac-e70b1e2115b1.c000.snappy.parquet | Bin 0 -> 1035 bytes ...4198-99ed-062c0a337c29.c000.snappy.parquet | Bin 0 -> 1028 bytes 
.../_delta_log/.00000000000000000000.json.crc | Bin 0 -> 44 bytes .../_delta_log/.00000000000000000001.json.crc | Bin 0 -> 40 bytes .../_delta_log/.00000000000000000002.json.crc | Bin 0 -> 40 bytes .../_delta_log/.00000000000000000003.json.crc | Bin 0 -> 20 bytes .../_delta_log/00000000000000000000.json | 13 ++++++ .../_delta_log/00000000000000000001.json | 13 ++++++ .../_delta_log/00000000000000000002.json | 13 ++++++ .../_delta_log/00000000000000000003.json | 3 ++ ...-9c61-120d006eb3b8.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...-8fba-035e60e71ab2.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...-bc83-487a583eb01b.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...-b334-3a50c28185bb.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...4236-9c61-120d006eb3b8.c000.snappy.parquet | Bin 0 -> 694 bytes ...4ac9-8fba-035e60e71ab2.c000.snappy.parquet | Bin 0 -> 904 bytes ...42c8-bc83-487a583eb01b.c000.snappy.parquet | Bin 0 -> 904 bytes ...45ff-b334-3a50c28185bb.c000.snappy.parquet | Bin 0 -> 897 bytes ...-9db1-e985867a1a6c.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...-bee3-df831f4abf3c.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...-ac4f-56538beeddae.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...4acc-9db1-e985867a1a6c.c000.snappy.parquet | Bin 0 -> 680 bytes ...4e3b-bee3-df831f4abf3c.c000.snappy.parquet | Bin 0 -> 687 bytes ...43f8-ac4f-56538beeddae.c000.snappy.parquet | Bin 0 -> 687 bytes ...-b7bf-93f2f37c0cb9.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...-9a85-e4f8d4501a67.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...-adb9-21feeeee2c31.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet | Bin 0 -> 694 bytes ...4083-9a85-e4f8d4501a67.c000.snappy.parquet | Bin 0 -> 687 bytes ...45ad-adb9-21feeeee2c31.c000.snappy.parquet | Bin 0 -> 700 bytes ...-8bb3-721fa82961c6.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...-bcc3-5df022ec6b35.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...-93f1-6dc27cd2e980.c000.snappy.parquet.crc | Bin 0 -> 16 bytes 
...4bbc-8bb3-721fa82961c6.c000.snappy.parquet | Bin 0 -> 701 bytes ...4b3b-bcc3-5df022ec6b35.c000.snappy.parquet | Bin 0 -> 680 bytes ...4cb2-93f1-6dc27cd2e980.c000.snappy.parquet | Bin 0 -> 687 bytes ...-9c5b-b99e676ddd06.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...-8377-aa36cfe5762f.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...-b729-42b7d7d7f5ca.c000.snappy.parquet.crc | Bin 0 -> 16 bytes ...46a8-9c5b-b99e676ddd06.c000.snappy.parquet | Bin 0 -> 917 bytes ...4d88-8377-aa36cfe5762f.c000.snappy.parquet | Bin 0 -> 911 bytes ...44f0-b729-42b7d7d7f5ca.c000.snappy.parquet | Bin 0 -> 904 bytes 127 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00000-3cb5dee7-9ab2-4b6f-bf66-fc2a968c4feb.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00000-47bab5c8-9d0c-41b6-9f56-d45f47d5dea5.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00000-94321f1e-f3e8-456d-ae43-5bf5b4c36a3d-c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00000-a9118234-f574-4613-b674-deb4d1b82aee-c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00001-69317b6f-2e84-4ae7-a847-cc5d1415f35d.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00001-75bdbc7a-6029-4166-bf76-1987f87901f1-c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00001-db3fa6b7-6267-43be-a1bc-7a81e4a5ddce-c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00001-df7ec87d-5e3a-40da-be02-282f3629694c.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00002-05c18098-92f8-41f0-89d4-0d73a5d5b971.c000.snappy.parquet.crc create mode 100644 
kernel/tests/data/cdf-table-non-partitioned/.part-00002-a34b39da-a6ce-4459-8ea5-ddd162a84e94.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00002-fce8caa1-4b6f-47af-a2df-a50985051257-c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00003-7920983b-c6d6-4703-a93f-c1bf180e7008-c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00004-5de927bd-552f-4462-94e2-e7987a580b6a-c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00005-9443cba0-84c4-4dc4-be61-76396629a546-c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00006-a200a57d-283c-445d-acd1-d2642d1a778c-c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00007-e6053215-0e6f-4891-8dbc-812b7274a4e5-c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00008-6fdf199c-d268-4ab2-9288-5a6503bac41b-c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00009-24d335c6-4da8-4a23-931d-168b2821adca-c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-4a1d2de0-dda2-4651-8239-f53ca8761e0e.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-a0f26ad2-e42f-4ee9-9a42-c551810ffef9.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-cb45a6ca-4b46-4293-b0ab-5b9937a8bd12.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00001-3083ab22-0fc6-4ccc-9053-48dc314be509.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00001-c07209de-cdb1-4dbf-8e90-e33319ebf581.c000.snappy.parquet.crc create mode 100644 
kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00002-67b4ede6-b16d-46d1-aefb-cd68c1672e02.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00002-d6ab6a1a-7a19-47f1-a938-58954786d23f.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00000-4a1d2de0-dda2-4651-8239-f53ca8761e0e.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00000-a0f26ad2-e42f-4ee9-9a42-c551810ffef9.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00000-cb45a6ca-4b46-4293-b0ab-5b9937a8bd12.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00001-3083ab22-0fc6-4ccc-9053-48dc314be509.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00001-c07209de-cdb1-4dbf-8e90-e33319ebf581.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00002-67b4ede6-b16d-46d1-aefb-cd68c1672e02.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00002-d6ab6a1a-7a19-47f1-a938-58954786d23f.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000000.json.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000001.json.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000002.json.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000003.json.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000004.json.crc create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/00000000000000000000.json create mode 100644 
kernel/tests/data/cdf-table-non-partitioned/_delta_log/00000000000000000001.json create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/00000000000000000002.json create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/00000000000000000003.json create mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/00000000000000000004.json create mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00000-3cb5dee7-9ab2-4b6f-bf66-fc2a968c4feb.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00000-47bab5c8-9d0c-41b6-9f56-d45f47d5dea5.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00000-94321f1e-f3e8-456d-ae43-5bf5b4c36a3d-c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00000-a9118234-f574-4613-b674-deb4d1b82aee-c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00001-69317b6f-2e84-4ae7-a847-cc5d1415f35d.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00001-75bdbc7a-6029-4166-bf76-1987f87901f1-c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00001-db3fa6b7-6267-43be-a1bc-7a81e4a5ddce-c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00001-df7ec87d-5e3a-40da-be02-282f3629694c.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00002-05c18098-92f8-41f0-89d4-0d73a5d5b971.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00002-a34b39da-a6ce-4459-8ea5-ddd162a84e94.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00002-fce8caa1-4b6f-47af-a2df-a50985051257-c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00003-7920983b-c6d6-4703-a93f-c1bf180e7008-c000.snappy.parquet create mode 100644 
kernel/tests/data/cdf-table-non-partitioned/part-00004-5de927bd-552f-4462-94e2-e7987a580b6a-c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00005-9443cba0-84c4-4dc4-be61-76396629a546-c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00006-a200a57d-283c-445d-acd1-d2642d1a778c-c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00007-e6053215-0e6f-4891-8dbc-812b7274a4e5-c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00008-6fdf199c-d268-4ab2-9288-5a6503bac41b-c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00009-24d335c6-4da8-4a23-931d-168b2821adca-c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-22/.cdc-00000-59fa51a4-edbb-4fc0-a497-6969cdf3966c.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-22/.cdc-00001-308c0cab-92b2-41e1-90bd-9416b10ba6a6.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-22/.cdc-00002-ea0bad63-f199-42c6-bf85-3b9f5027578c.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-22/cdc-00000-59fa51a4-edbb-4fc0-a497-6969cdf3966c.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-22/cdc-00001-308c0cab-92b2-41e1-90bd-9416b10ba6a6.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-22/cdc-00002-ea0bad63-f199-42c6-bf85-3b9f5027578c.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-23/.cdc-00000-fb59d34a-5bd7-4b10-8c41-71e38c07fdc2.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-23/.cdc-00001-985fd824-b34a-4f3e-b7e4-90bf8d04898e.c000.snappy.parquet.crc create mode 100644 
kernel/tests/data/cdf-table/_change_data/birthday=2023-12-23/.cdc-00002-831078a2-a13d-4713-aa88-7d5f5228d781.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-23/cdc-00000-fb59d34a-5bd7-4b10-8c41-71e38c07fdc2.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-23/cdc-00001-985fd824-b34a-4f3e-b7e4-90bf8d04898e.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-23/cdc-00002-831078a2-a13d-4713-aa88-7d5f5228d781.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00000-4beb5c26-e34a-470a-a62e-2ecc8dc24035.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00001-a5f1d5a2-e308-406f-af76-3b32bab79832.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00002-ddca9e04-03ef-4533-a9c8-05c1d4f79d6a.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/cdc-00000-4beb5c26-e34a-470a-a62e-2ecc8dc24035.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/cdc-00001-a5f1d5a2-e308-406f-af76-3b32bab79832.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/cdc-00002-ddca9e04-03ef-4533-a9c8-05c1d4f79d6a.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00000-e8760032-5a99-4d37-9739-fc9d4db24308.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00000-ed223ebe-3b27-44af-b2cf-91e882f4c500.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00001-1aa06a1f-c45f-4227-b0ac-e70b1e2115b1.c000.snappy.parquet.crc create mode 100644 
kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00002-97dc4c5b-3806-4198-99ed-062c0a337c29.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00000-e8760032-5a99-4d37-9739-fc9d4db24308.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00000-ed223ebe-3b27-44af-b2cf-91e882f4c500.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00001-1aa06a1f-c45f-4227-b0ac-e70b1e2115b1.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00002-97dc4c5b-3806-4198-99ed-062c0a337c29.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/_delta_log/.00000000000000000000.json.crc create mode 100644 kernel/tests/data/cdf-table/_delta_log/.00000000000000000001.json.crc create mode 100644 kernel/tests/data/cdf-table/_delta_log/.00000000000000000002.json.crc create mode 100644 kernel/tests/data/cdf-table/_delta_log/.00000000000000000003.json.crc create mode 100644 kernel/tests/data/cdf-table/_delta_log/00000000000000000000.json create mode 100644 kernel/tests/data/cdf-table/_delta_log/00000000000000000001.json create mode 100644 kernel/tests/data/cdf-table/_delta_log/00000000000000000002.json create mode 100644 kernel/tests/data/cdf-table/_delta_log/00000000000000000003.json create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00000-592a7e14-f790-4236-9c61-120d006eb3b8.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00000-cd6a8496-3a3c-4ac9-8fba-035e60e71ab2.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00001-96c64ea1-3383-42c8-bc83-487a583eb01b.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00002-93942e85-bb5c-45ff-b334-3a50c28185bb.c000.snappy.parquet.crc create mode 100644 
kernel/tests/data/cdf-table/birthday=2023-12-22/part-00000-592a7e14-f790-4236-9c61-120d006eb3b8.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-22/part-00000-cd6a8496-3a3c-4ac9-8fba-035e60e71ab2.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-22/part-00001-96c64ea1-3383-42c8-bc83-487a583eb01b.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-22/part-00002-93942e85-bb5c-45ff-b334-3a50c28185bb.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00001-723d68a5-94eb-4acc-9db1-e985867a1a6c.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00002-7c6f102f-6ad1-4e3b-bee3-df831f4abf3c.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00003-98b8082f-db4e-43f8-ac4f-56538beeddae.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-23/part-00001-723d68a5-94eb-4acc-9db1-e985867a1a6c.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-23/part-00002-7c6f102f-6ad1-4e3b-bee3-df831f4abf3c.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-23/part-00003-98b8082f-db4e-43f8-ac4f-56538beeddae.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00004-218c1bff-cde9-44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00005-8aeab9bc-7a46-4083-9a85-e4f8d4501a67.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00006-53327328-4603-45ad-adb9-21feeeee2c31.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-24/part-00004-218c1bff-cde9-44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet create mode 100644 
kernel/tests/data/cdf-table/birthday=2023-12-24/part-00005-8aeab9bc-7a46-4083-9a85-e4f8d4501a67.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-24/part-00006-53327328-4603-45ad-adb9-21feeeee2c31.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-25/.part-00007-8cd4b5a3-b4dd-4bbc-8bb3-721fa82961c6.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-25/.part-00008-436dbf31-f213-4b3b-bcc3-5df022ec6b35.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-25/.part-00009-685aacbb-c7ac-4cb2-93f1-6dc27cd2e980.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-25/part-00007-8cd4b5a3-b4dd-4bbc-8bb3-721fa82961c6.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-25/part-00008-436dbf31-f213-4b3b-bcc3-5df022ec6b35.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-25/part-00009-685aacbb-c7ac-4cb2-93f1-6dc27cd2e980.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00000-1ca113cd-a94c-46a8-9c5b-b99e676ddd06.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00001-8334a9a7-7041-4d88-8377-aa36cfe5762f.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00002-7dd6bbed-a0c1-44f0-b729-42b7d7d7f5ca.c000.snappy.parquet.crc create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-29/part-00000-1ca113cd-a94c-46a8-9c5b-b99e676ddd06.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-29/part-00001-8334a9a7-7041-4d88-8377-aa36cfe5762f.c000.snappy.parquet create mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-29/part-00002-7dd6bbed-a0c1-44f0-b729-42b7d7d7f5ca.c000.snappy.parquet diff --git a/kernel/src/table_changes/physical_to_logical.rs 
b/kernel/src/table_changes/physical_to_logical.rs index 9d5746052..9f94ed825 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -184,7 +184,7 @@ mod tests { } #[test] - fn basic_cdf() -> Result<(), Box> { + fn cdf_with_deletion_vector() -> Result<(), Box> { let cdf = read_cdf_for_table("tests/data/table-with-cdf-and-dv", 0, None)?; let expected = concat!( "+-------+--------------+-----------------+--------------------------+\n", @@ -209,4 +209,39 @@ mod tests { assert_eq!(expected, cdf); Ok(()) } + #[test] + fn basic_cdf() -> Result<(), Box> { + let cdf = read_cdf_for_table("tests/data/cdf-table", 0, None)?; + let expected = r#" + +----+--------+------------------+-----------------+-------------------------+------------+ + | id | name | _change_type | _commit_version | _commit_timestamp | birthday | + +----+--------+------------------+-----------------+-------------------------+------------+ + | 7 | Dennis | delete | 3 | 2024-01-06T16:44:59.570 | 2023-12-29 | + | 3 | Dave | update_preimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-23 | + | 4 | Kate | update_preimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-23 | + | 2 | Bob | update_preimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-23 | + | 7 | Dennis | update_preimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-24 | + | 5 | Emily | update_preimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-24 | + | 6 | Carl | update_preimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-24 | + | 7 | Dennis | update_postimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-29 | + | 5 | Emily | update_postimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-29 | + | 6 | Carl | update_postimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-29 | + | 3 | Dave | update_postimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-22 | + | 4 | Kate | update_postimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-22 | + | 2 | Bob | update_postimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-22 | + | 2 | Bob | insert | 0 | 
2023-12-22T17:10:18.828 | 2023-12-23 | + | 3 | Dave | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-23 | + | 4 | Kate | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-23 | + | 5 | Emily | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-24 | + | 6 | Carl | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-24 | + | 7 | Dennis | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-24 | + | 1 | Steve | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-22 | + | 8 | Claire | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-25 | + | 9 | Ada | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-25 | + | 10 | Borb | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-25 | + +----+--------+------------------+-----------------+-------------------------+------------+ + "#; + //TODO + Ok(()) + } } diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00000-3cb5dee7-9ab2-4b6f-bf66-fc2a968c4feb.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00000-3cb5dee7-9ab2-4b6f-bf66-fc2a968c4feb.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..a1dea1429683aa46ab2f44d32556edbfa031778a GIT binary patch literal 28 kcmYc;N@ieSU}Es9FO%5DYd1&HN-!=*W0CnmKumAu6 literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00000-47bab5c8-9d0c-41b6-9f56-d45f47d5dea5.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00000-47bab5c8-9d0c-41b6-9f56-d45f47d5dea5.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..e26d3add58bd1d8aa84bf9c20cbf9b398aef21bc GIT binary patch literal 28 kcmYc;N@ieSU}BJ1Kbt|&)I2M2yYzv{H)geItYQ5C0B=zVy8r+H literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00000-94321f1e-f3e8-456d-ae43-5bf5b4c36a3d-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00000-94321f1e-f3e8-456d-ae43-5bf5b4c36a3d-c000.snappy.parquet.crc new file mode 100644 index 
0000000000000000000000000000000000000000..5ec7564c505cfc172a06e21a987a803fe047ba96 GIT binary patch literal 24 fcmYc;N@ieSU}7i`l>hQi*T3@cquP6G+4>{_UknKx literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00000-a9118234-f574-4613-b674-deb4d1b82aee-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00000-a9118234-f574-4613-b674-deb4d1b82aee-c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..77e39b81d4e27661219185bdbbe580dfbd31da24 GIT binary patch literal 24 gcmYc;N@ieSU}EsPDtBR*yT*Z&+xcH)f6s0L0Asuf_W%F@ literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00001-69317b6f-2e84-4ae7-a847-cc5d1415f35d.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00001-69317b6f-2e84-4ae7-a847-cc5d1415f35d.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..997e9db293a7e5e51a8ac707196e2cd6286ac674 GIT binary patch literal 28 kcmYc;N@ieSU}AW@Q0C$Zw~mUQA_I5%IUj06qZUsG0FFBgKmY&$ literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00001-75bdbc7a-6029-4166-bf76-1987f87901f1-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00001-75bdbc7a-6029-4166-bf76-1987f87901f1-c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..9c7d70c58ac9ef5a4d2bcf7e0aae10e13a656e5d GIT binary patch literal 24 gcmYc;N@ieSU}8vgdCu@{v3Kd=N459Xvh_&<0AvdZdH?_b literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00001-db3fa6b7-6267-43be-a1bc-7a81e4a5ddce-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00001-db3fa6b7-6267-43be-a1bc-7a81e4a5ddce-c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..ef39d10bbd547a5ef5b591ff49217ab62bb84812 GIT binary patch literal 24 gcmYc;N@ieSU}Dg{-Iuw`WJB6+hObEzw){H=0AmUYSO5S3 literal 0 
HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00001-df7ec87d-5e3a-40da-be02-282f3629694c.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00001-df7ec87d-5e3a-40da-be02-282f3629694c.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..1303651224a1838491b966cedcd26cd67941019c GIT binary patch literal 28 kcmYc;N@ieSU}CtnM@QsL&r_F$&$V|>zA>v!V-4#E0GEagQUCw| literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00002-05c18098-92f8-41f0-89d4-0d73a5d5b971.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00002-05c18098-92f8-41f0-89d4-0d73a5d5b971.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..7526ee54eda1b51950905f7c825c2b4b93698468 GIT binary patch literal 28 kcmYc;N@ieSU}ESuX3Nab2s!?CN<+@%8?)Lp*06p60DZ9v*Z=?k literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00002-a34b39da-a6ce-4459-8ea5-ddd162a84e94.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00002-a34b39da-a6ce-4459-8ea5-ddd162a84e94.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..b2547772ce81e13f55794138ebd4df957f98c916 GIT binary patch literal 28 jcmYc;N@ieSU}6wuH{?yqSW%H`KXdN2uhO;a8nZG1Yz7Iw literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00002-fce8caa1-4b6f-47af-a2df-a50985051257-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00002-fce8caa1-4b6f-47af-a2df-a50985051257-c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..b5de62a3de667a8490a9034859c38595b4194138 GIT binary patch literal 24 fcmYc;N@ieSU}CW37IBtNDla_zsP^7kwmwM!N|pze literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00003-7920983b-c6d6-4703-a93f-c1bf180e7008-c000.snappy.parquet.crc 
b/kernel/tests/data/cdf-table-non-partitioned/.part-00003-7920983b-c6d6-4703-a93f-c1bf180e7008-c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..14e9fdf09f3a05ec7abb97b0edac743377c75053 GIT binary patch literal 24 gcmYc;N@ieSU}C70u!>rIY%=5FN459Xvh_&<09tGaw*UYD literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00004-5de927bd-552f-4462-94e2-e7987a580b6a-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00004-5de927bd-552f-4462-94e2-e7987a580b6a-c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..40542b6be889719ecf700cfc1a7b4458bd8758d1 GIT binary patch literal 24 gcmYc;N@ieSU}89XyOwu7BOl+%?ffsYzh^fA09?fg{{R30 literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00005-9443cba0-84c4-4dc4-be61-76396629a546-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00005-9443cba0-84c4-4dc4-be61-76396629a546-c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..37d56abc452027394a4a87d85a4c6d6cfb98b72f GIT binary patch literal 24 gcmYc;N@ieSU}Bh8683(|YA^A_k81C&W$Tj!0B1-Ei~s-t literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00006-a200a57d-283c-445d-acd1-d2642d1a778c-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00006-a200a57d-283c-445d-acd1-d2642d1a778c-c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..5c6926f37065b51786fff2bff0109b4068611f58 GIT binary patch literal 24 gcmYc;N@ieSU}A9T_$+Ji=Z;K(@M`aA{13MQ09#@R&;S4c literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00007-e6053215-0e6f-4891-8dbc-812b7274a4e5-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00007-e6053215-0e6f-4891-8dbc-812b7274a4e5-c000.snappy.parquet.crc new file mode 100644 index 
0000000000000000000000000000000000000000..38b817e1ef42aec86dace3a147545cfbbb72b532 GIT binary patch literal 24 gcmYc;N@ieSU}89|Hs|lM^Qxlf*e-VL`KWdS0B8CNL;wH) literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00008-6fdf199c-d268-4ab2-9288-5a6503bac41b-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00008-6fdf199c-d268-4ab2-9288-5a6503bac41b-c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..2cf62caf4bef9e0e46340dd4083a6429a4344387 GIT binary patch literal 24 gcmYc;N@ieSU}9K4$?>OCb=<4n3}2HbZ25N#0BBkY%>V!Z literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00009-24d335c6-4da8-4a23-931d-168b2821adca-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00009-24d335c6-4da8-4a23-931d-168b2821adca-c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..9b93f676dc8138675f0d4957294518aa4a14a3ee GIT binary patch literal 24 gcmYc;N@ieSU}Bi-K3#YI_D9BtAJyJl%ho3e0A@W2r~m)} literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-4a1d2de0-dda2-4651-8239-f53ca8761e0e.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-4a1d2de0-dda2-4651-8239-f53ca8761e0e.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..1da6ec9e30c8d83c2e3d88d45b7297ff43e1bbb8 GIT binary patch literal 28 kcmYc;N@ieSU}E?%^V)jBnOSw~k8e}R4D8`xs7!4I0F>1V*Z=?k literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-a0f26ad2-e42f-4ee9-9a42-c551810ffef9.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-a0f26ad2-e42f-4ee9-9a42-c551810ffef9.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..ae94613f9fe4305b7ff4c038ffc444943bbca9de GIT binary patch literal 28 
kcmYc;N@ieSU}6xvZ@a|VMB;TpV)D_Iid~b%70z%10D1cfcK`qY literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-cb45a6ca-4b46-4293-b0ab-5b9937a8bd12.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-cb45a6ca-4b46-4293-b0ab-5b9937a8bd12.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..796311786525ecf4b0dc5d1c9dc85fb850c80e64 GIT binary patch literal 28 kcmYc;N@ieSU}6xC6uFhQ=GoLa)#kocX(9{PZ1D>K0DmP4Pyhe` literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00001-3083ab22-0fc6-4ccc-9053-48dc314be509.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00001-3083ab22-0fc6-4ccc-9053-48dc314be509.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..20ffcc417dfe2d6861030d5346f012b9c5927ddc GIT binary patch literal 28 kcmYc;N@ieSU}D%4bL;HZiEADCKHxyk@$XIz0I2H?xBvhE literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00001-c07209de-cdb1-4dbf-8e90-e33319ebf581.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00001-c07209de-cdb1-4dbf-8e90-e33319ebf581.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..90d40e3abf3db72f5f71bde46eca768272da1d76 GIT binary patch literal 28 kcmYc;N@ieSU}7-zn)K{-b*b&%b`^exE%Pp9C}`RP0E0;h&j0`b literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00002-67b4ede6-b16d-46d1-aefb-cd68c1672e02.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00002-67b4ede6-b16d-46d1-aefb-cd68c1672e02.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..287fdeea8cd34f2abd0155c45c8c89ffb15158b2 GIT binary patch literal 28 kcmYc;N@ieSU}7+EHM_EAbK})k=7=e;G#h7pxfFa70EjLN@c;k- literal 0 
HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00002-d6ab6a1a-7a19-47f1-a938-58954786d23f.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00002-d6ab6a1a-7a19-47f1-a938-58954786d23f.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..47f198ed0d28bc984aeea421c6aa03ce6fecf7fe GIT binary patch literal 28 kcmYc;N@ieSU}9ieV0&$|;*Z2r@3QwXY?*f^(8KsgtvgIUAEei$20#%_3EcwV9d#1L<_|us& zP$iL26uRny4PSKuU64)a%0i_pgQ_l2E3vBB(M4B)ga9cEng#dH-;Jd@ay;*y``)?d z-On3uz5HgGvXrA)`qi)B9OP;=L0LlB>>MG45}K!r^yYuxf1ZLLM2+DpY476?vxr@iopgP4iMc)|`0Z4BmuXo^BOjF}lm6c{E3a6x=veK3rfSquPpapMr_ z$_EDu4MB=OT6*vwNCH^)O4v!gn(MR`_A$m7SH+*!=hUh324v$iF2NwYO z_1CiYK(pSf+s(Go)o32TrG${^=!9lr!1lDRu|zf{cR2uDqhmR3UAwZB-82zo0nC{F zUaNkoW036tbf4Y`xjm zdM%^wTp0u!cvh6eYf4oD?W<{)6HzKn@GNigj#5=P$twcyn^%F;B zqKsLIW_;ljRS`-bv&vN#qzAnejZpeYbc*yZ(BC6h`Y08Z%M&H%dc@KhrIfuC6c4tj z_l-G-W>Y-eL=QLSF%?JP0gDHdJ5jbS|5XOEag?g9k>w}*TR3qC#A9{`J^V6H-y!^+ zG|ws=|9g57E`g#T^sSbv4K(Ol)ooz)eZ#Ul{hpAjsH@m42+m1DCI&x8Vo7tADZ*R+Z1%Roe1MDw@mNi<21we0Tni{mV zT!o!RQ4UDu>h|t#G2HBOgxkSOr`MW&cW~!K0`NsJ`~K*9iWm@^onhiWi%5KUcAoeh z7}0p>>@1VlX2fH+`xYi|#-zmW*BHr6$$58|2p9ZnI@`4he)$#3YPq783(JjR$J7gY zW2L-aDVNUctBrG&6{ER&zEaZH&y||GW|Y^;#!6{j*Bjay@fy5grhdq`@GjnhpJo38 DonIWd literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00000-a0f26ad2-e42f-4ee9-9a42-c551810ffef9.c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00000-a0f26ad2-e42f-4ee9-9a42-c551810ffef9.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..30adb9956b391702af138b3dfafcbe64cd46f4b2 GIT binary patch literal 2242 zcma)8O=ufO6rNqLq>ZY&PU0DML9}9tMFP$uAv*=%&4Nb6RIkZ{ru09FUyk3?2_ODI_7Z}pTR$V`sZf_Nv&k0 zY6|Q#nje1qbw}FHSYG?+?{~g^_0$@V=l&;mpS~oiW$`E?wMP#g-In8#b91RrV``pO zO+#bV;4iG21QrY~rs?N`w5746!6M0BSm>;1>|+|aC|N0rX();W=vhjjpNZ18#!>+< 
zNk+m;k%w+@fm>9Gj;})O$gGksUgy%Gl7kNw+~w$zir2V!qT&u0k7sh-0}XF56u7^- zR*ANq(T&ZhzeR@*o&#_}HB`?n48&X==i=D#3{H?>wI%|?lfa%zAKykEhiYz z&3Jb^i-D+S)x1^Lp*76qq7DUwP_ar>@+P@j*W2lFr%!^;Cx@S|pmNY#kRC$haS6|p z2O73fy<5c<&XnYuaCOLfz1z-uBzprN@0tveIiE7lg95&U>W7L5FRxb~slvffIc zEK0GG89&!CA>S2Pln;SX<6%n0PB^cVF6_5OQwj;V^Hvq8?Bfpe@mB%Aj_OA$6o~tn zJio+Eo2-FK8+shRK%Bf~s$)j=brD;regafwPBn4t3DY&w0SyQ^n+hSZfS*TYFTXP{ zKNVTP$Lzx5UY_Zsyx7MxP;540>5~;(tYqHDjQ8UbEB1jVdSapn6V@)-01ar4bWpQ5 zFh7IH>>r*2_Io904sur`YF#J4X!9`4i-r-72DY>9!1&q8B-%IWDDuLGbyEo)HWT zo1S6fViF`Ao}Nd(%mht}PR}xV{|TOGD;%-IoAOZPi&7$)>TurMN5*yDO&5ow8gIW^ z({8rxX6<}$?D=lZ?JYG|TFu54ce!`DwM6^NS6U5s<#MC%I<$GINtYTcuG@1in781k P;)P!n!Vl;={CoKy0~PDI literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00000-cb45a6ca-4b46-4293-b0ab-5b9937a8bd12.c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00000-cb45a6ca-4b46-4293-b0ab-5b9937a8bd12.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..447df150cbd749981590d7ceb93f0e1e41b3886e GIT binary patch literal 2442 zcma)8O^6#+9DkWiva>O6w|3uS1_liSJ8U5x60+N-OORDONLkD3(GS8TGfkqCkIqb5 z+U*`I9*T$;1y>JVdI=)3NE9g^qz4yo_9iGETU_vi9=416|Ic^2fhHs~@BQBY{r>PDkim$jSt<}B$Q&%D(5aGZ#fJHKVTA z=@uP-@Xe)!tTSz2)(M~(4h-Ej>jT^DboG|GNTgG8-*LT(bdr-q9#a&&C?_$PcreBu zU@rv;7()p#Mm1qAgpriQXqZ?|r6e(xN&rF_8?X?&#LgGP!$b!VPlQl# zzJNq#Rr(?vg-9~nOGU$nDS9zLkGKfElcpO1I_4vEl!=B18F~W^fPDAmyfe`4cj`{F zZFY6K2;ed&avg)vJPi1r-ZhuWrtFymAT&C*+cxxT%lS^jsfYE^LlYEuKVH^qsFa+j_5M*4=A^Py^#7 zS-PcGWzfEsFh*X58;VFmxfiJ>#b99N_&r*%3 zd(a5^fYUp*$Lg{hu%ccETC5!ZiL~)-3hkFU%Gk$qNc#|IQK|s>k7DOhUtkwuwdU8J z$YUWQF;=2KgJgqHNC+(4MaH`x;}Hj~5eQ8L-s7~{b@hl-f8gY}ZsQTqN7WpoQ}q9W z-u{`R&lB}d8bk#8C$s7~tg#I+sX;fu7x=w*Qutp;^_4&sqKurR|WQab_sgFD2jc% zrRf75npN}qR=Z-_PN&}!GZk%3D-rSZF$?Fir9q2d9-6LZ_pfSQ)79Y))x|x2Tfx>+ z))mLKhfP;0Dqase%K97I3U)p$bqs(5Gw4*aW!eDrhL)v6H!JI~(<~|>sj|Mkvr~*V zdmPbr@Y3nEX5SrNHL(D^`)A)D-%2q9VzV<$Tw^gw0ME{oy!v7qADx|L^0th5oOa*F zZ`h&2^Zj;5$k7hTO 
zr)?!pduWCK3E@$4~NKEloaxLO__IG-LOGvUapPBi5 zKl7c>cartzU#$>K1d=EJT)Xivt`m-6gm9im2oXdIWS)F=^v%b!GR(pxLj;5bnS;S} z8l?ewics?F-Gd*vVC77at$eX-*&8P>M?@w?^3l$RpUg3SN+=)<=fPOD>fby5IuffG5)KqLP~#Yv^_iePbcNW-|~2 zn6vx6cH^>T+OP}(@MN>!w@tklVmZL_Mt{__%@AT?W}U8X+g8u@!G+@5tIP#}TWMDjgOmu|th z`XeXD-i=#8A2~WkC+Po;Mf)#HoN|lb zNIk$Z$pym43=v5`!0LM#q=#h`x0v+f=mhB>JOT0U3rzYRA3NnSw|NnnYxa2cT^@YH zs?+~wXAt)$MEH_L*qvcVaR?EZZ`k3EIo9KUKLw|8$k)%I#m7rqJXZe6oE@+TKh2Og z5dA(wu`1BtGxJdYB~k1T+nPSmp%+~USYS~yhUv3$?SvglRm&z-K(bUgK*Wt%y;v0RIL1-QR Gj{OgPbQ)&> literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00001-c07209de-cdb1-4dbf-8e90-e33319ebf581.c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00001-c07209de-cdb1-4dbf-8e90-e33319ebf581.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..809770b5fdbe4aa2f77781ad0daf851077098cc1 GIT binary patch literal 2444 zcma)8PiP}m7=KA#(n*ZntlKxtz@TBELl?3`LfYMK2vR-CT9@k4KZHqUHVIDt?94=2 z+CmW&4>@?!O9k13dnwXGDOGxqB0Vg4RJ^&T^&klLAjK8+`{sYPflWwe-uu1p{eIv3 z{=9bMmA5L4GLGfh>2Kq4zRr@25<>IvAdIjATVUV(efY(^1Xh$~7)J;vC1Ej@A}K)7 z;Om{!pR@i>PQsn9YmV{k-GoW3$bS0oyS*&Z6OVp=*`s&fAjF2BFY@X6!AT|*Nr4D{ zW)YcZFR({Xe%(vNMCJrfBsjv4|Li_UhwLwX{QH|nFJ{-C&F+Qn{(~of=EBe=X7uZA zw!w}*`}R)Et!o|6txG^L8tM?m#?UglJ*{mNhZf=G%&IE*bP5~7eu!~iac4cG_6*fPb#gNhr2NW2gnCOm*}DS-U* z`6LowW3R$dh$!)giEwzHV7GmACl#V6DYoXL$23Iu)8X(e&9=}0$ai1M+e6KIzhSpJ zMo(h}02h-&va1u8hXL(tJ!6rqORhNpT(fIA9bLP(m|r&$qyXm3LBHL&(=|-kMgVxe zIT)CR)(@~8V0nEoYMMp>q0GG9(@eA5cRX;hxW0F#a2Z)qJKtz^w0_%YIQNEu4?Gnm zajeuN@O>piIT2lDNuKg1?@U1SrYpLUxfcE#}@cMGE61P zAv8kX=X6i)vO4@0tSDPRit5{% z6bliFu@b!*BpV!sguuc>WPIo{?xdhKd`Dw}58d5KDW^#(s@rG;^kFqe=oJ4Sq1yQz zdzmOFphkTEC$s7b*7-FssY5rw7x>+G-0{DV>X2hHQI4s^a=zme#SprFOqGu)xE{1n zG(y)eqElQSKL>ihaOnDeI;xVVD$P~IirG&q|AM01ncfp~fPL!CrYPJ;h0n8?+%r(X zn!)5wRIAJXNdk)JEM31&mY!~F;X>=jV|IiJKV;dvg#SLnQ-$LnXBMFMOM)=4+Nw6x zpjlP7Z`BVB%kB>PLb|H1sAVEty>HlPNO6Tq;hR@d%F~Fb~(cB 
zpwjKPXWt!MHIV~&_s_mRx|JdZ#AauhxW*z9AD*2je)UB(9y&YAlI_UysGO>?W#D2KP*$f literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00002-67b4ede6-b16d-46d1-aefb-cd68c1672e02.c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00002-67b4ede6-b16d-46d1-aefb-cd68c1672e02.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2059c5f06e0fafb23d204ef5ae5848d064369f34 GIT binary patch literal 2437 zcma)8PiWg#7=M;!Tk$;CUGszlVl#q>bhyICNt+rovYtlCO19(vU}Q_R&1B1!Am0g-tYUp z@6Ss%-+Z@5S<2A@J^u37M_iLKlqG~MFoY0FXo)V;$-m!!QIud6OS6{kE4%ILQgV6FdR3|MYzKUU&jddhPSS-nswk!usj#e&oKn_w1j1 z6gkv(`c6XMqnq^pogc0zWL@a_vd#d-c%*Bt(HhxCf1q`Z5)scymg9O8@hl?>EUpb+ zlo5HDAe@HS`_BZ(VkBWL1~mb5VH`yv8YaxClqjT934jYD1M92ldM!YYnFxKDrt1NEz((jK6Ah0tbQ>1{`R-c< zXQbI5wVY1R7-+Nv;1VM+eVxz(4A`MIFqX)M!8GtK_c^}(g`#@@NoS!6}ug;uAh4ZB9m zy*Uas@T@3_x0Qwj+E=qIC!$o!@GNiguF_CA$xpxkKGFXN9BK+Mv zUF84HQk5wC&<2Ho(>t}tYV)6hMcJ0T^6?%(ilyX@XHZV^l;`i}=<7sz3X~|5gWN}P z^JuT~*TCBJ>rU{B5Vc|z`ZGv0h=gQ-$U|g2%%R9*2AU!enFzYa>GIoPR}L8|uGx45 z^id_p=nVaj(A$$7eS;_u(;ybec@7@EuJoUn1ME?M zHp9a%dbqQIsW<`;STC5|sY>1m`N15hDvixF83n$t>8ME)u!-EC-0pTBHc~;^0 zKeCI^_Z2~~?5?VfG-y=S>sj?&T7ww`A;kF!gnSgJsaAM%S-R-gHDzku+yl> zA*tNh+S#c@n>~(bJ9z02yYuf3ubEf?-umUe?U#hPdoz=_rs=juq+R-(mcCluxRM&L9t(_Nd!ylEIU-DI!kWKgp?0F1wBX(UaSZaYA^LrG++r)B1jbv1y5cC5ACTJK`;l2R>bd{|Jww*VKejI?|tw0 z``-8G?XJD}dWBKOvpoCi+wXqK)>(>CLTElq2w{X3*c|)vpS@3JWU!(v%XmU~IR%U9 zG)V)320z^V;=@zHPF{xXmPqQi}b{&pIP+jN3Rm%z<(g}_vc^# z#zi6-5W&AmL}u8t?BU}FcesSel<13uNBH^Qo!gm+{rUHQe{KJ{>6NFm`=Psa`|+Q- zD0JL}e!aog*#1Y~TuHcfuH(Bk3lyWFp}S^dXq(-h-Zl$FIwKDp*PBSEQ<6wAN5PAw zBmpKKjKK%k%RvIhSOSb;O<40`BqcE#CYCu)5;-mb@L_DgJ{%^NDIOkFycopdh43)Z z0mRu53eFdh$U=?11VJ!p$6; z6aL6Dji`Ij2>F21JGIB^3LCJZZU8M-j{ij3cyNSUhoFEpgUKDMR*(M!4vOb6Q$J4@o@{IJSnDTawvP%wPP4a&@O@UGDla_B&Oz^& zL~&rZHGQZFRKY>f&9x zu3&2^HN|o5QOi|Iir2%AQhRep*ViIn2dQt<8 literal 0 HcmV?d00001 diff --git 
a/kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000003.json.crc b/kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000003.json.crc new file mode 100644 index 0000000000000000000000000000000000000000..a4a1dee7a0ac284130dffb7957d2f95726ee6606 GIT binary patch literal 16 XcmYc;N@ieSU}8vXTYh}~YF$?VCT|5{ literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000004.json.crc b/kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000004.json.crc new file mode 100644 index 0000000000000000000000000000000000000000..f559c66ea438096c1bb9bf7ffab9243b9fe8e159 GIT binary patch literal 24 gcmYc;N@ieSU}DI#oyz@c>!QE^mu#KL_V5OJtP_i+4*v7t3dXk>k9UP4O^nc&` zec$`u_dRFx&C4}P=mOQ~-_QTJ_ezP%@S3A4A%)X2A!J6%!xOYJJx#xP@bjKvG2e_W za$xbcmNOR@2gWUZSZ&=eEj7*L=la7~Tr9V~cy0b9QqgyDtGjLa zJ!fmUx0A%cWWA){tf)`~`btiP3_++=W-@z=U8tzd_=w{tBWI!mkKZ73P(`pkLfAM@ z7ua_SHA$siqD4-M_8PHTY_rmCCUuc~1CLiqqKM3&koioZtE6%$W#OgyN`pL6=M%Dz?>pjnqRZ4mTKYsZ52GNVp2uR1~K4t#C4ZmyVBqMd!gBHZ;% zXv*D@Gc<$2HM8#wE!!Ge+I?l;z-BNSMmP*c-J!8$L|qpejraEr>{Xa@ZIHPc^p?|e z0ucO>=ULEyMgvwlOGW}U8v6$aOX=c>kuC=>uHTz}b#ggm0YKc&^y{;GBqN|UJz?VF z$sk;wo+DmX875Yzr%c{&87JHx1bBE;9*TGo3XrJ|=gI-mF7ax5qaT!c`IT#Ct!~!J zXF4O-v&(j8xwclXRnOZiopbeNr@L~#UbWZGRlBz3)K+WGa&^tNJJwnKCj1oq?}xk# KKZ=|1f5^X_k=gVB literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00000-47bab5c8-9d0c-41b6-9f56-d45f47d5dea5.c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00000-47bab5c8-9d0c-41b6-9f56-d45f47d5dea5.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..dc29aa1f310c4d0b4a6ea58ad5b240886d09b6f7 GIT binary patch literal 2175 zcma);L1-IC6ozM4Yk8xp#z{QGE{H@dwMf8OBxEO!Fvcw<2|*QUucerkb|Z_fBzsp% z3C0&gX(&1L6nYRXE)+@$Ap{?C@S&xYme50cDwH0I3ndT;A%PxC-kY7>9W_2kh}F!m zH{W}2=Dm@wz4=~+5;{i>diRen_ml!vVVb2nA-OkHLP$o*z!S6rEz;e8_P&-)W)>q8 
z4NTs4Zg_(9;H&3%GDncYcM{T1-~7HeC7Hw|1`$-Vzxlg#4Gl*mz@&eDHh{PK5BGRe+yN5nj?nudm3AU>#S zGMI1~QwEu5F3FfC$UNIr*5lMnBN<9CMKKLUkpw+X0_@|V^tFb;gBPBI@B#>s2wJJc zEHIs3f>?3Xpbaq^DT#QfB3Z{|Uf|{+=*>EN|$G+33O-hXd%?lFG zm9wt0MyUZTEQaWbtaK)`en!?)&KhAm30SgENpK?th*(F_6N%|eCjL2185>PubcaCH z1X0@LB*;TxCIy&|fPI82`*@B%9!=3zQo66g#t0wd=f>Tvur)AQfqI5FfcdA4x}^i^ zR+d_%^bAlrff_q*PP)SCU;zQg9gvPN;8#`I%NOY7+bjlrfY*`m^1lj7Hv1q0n~m)h zJ#n+snatfs?&B|8%0EFJq_Z{dvs|G8*1uKSHwe6jq zg=8^jB+J1|r`Mi*b$ltL0YKc|BPF0VIbrO=Ng+a>oFiXSDJD`Ur;OigDJR_M z2Y7fB9*TScN|1>T=Yt($Tou)Ht{W6Z`4ua6rD|7-XPU!~?-t$WVr98nDPMG#nir~z qUTf)Mwd^imD7Re4tDLWRi{)k4Z8~So+wc=`!FkqL-i_BHrcTaZjyF4QA=A8 zK?Lc+Q%^m45u`}*ARc;A5yVryh#(Z{)gr~CLh;S+>_=@W31Kty-#5SapP4t4>G7L6 zjBq!W@y92x4o;`A2;DfA5bC@nB7`D>08h|TIE}yVeE%B{g(M4ZNEAZGP5OY7UO)f6 z772xUB>TEgI{38xCY;P8v1IH6q$ECvKRo^MEEXbl$lMb*^zO^{Lg=+ex%l|!jkgz0 zjd4FG%}nVM6*D}@6JQQ0DN@2E-dY!YaaZuwO%adrfPE1cd2=h`#zLOe$%H+xTX$#d${elf zIEk>>Bg$gcKsW)7XkD+-9#l-R-C%E|a@A_i8Tw*RqG&QP3Sy_(sL#&asZtY$ArQ`# z8x51{bx-RAEnzfTWs`amgC*^nZkpA4(-nKt#f|f4yMRhWC8aE9B-mADATGf=gCi^w zkz6Aa8EMqrEB8rIud_;S4^{>C8@vMKTO4GZ6TSd;e#58@hWZ@CD$48ws*|G*?W#kX zBt?h-!(KTDEoS(r8qV^bW!ZBKZ?VgRXZf#C{Kejiv|9>{`!WutKz1TT%t+)3xs;hG<7&2>IjUa{i+95 zN0wGr`u)KU!ygV=s`c6SM|-C?FaW-3`{RRy9uSz;KH<>K5J0%xJ|}#e1Q=IspK|yD z3OM#$!(y9v;-=hQLGl!@gUdB5%?l@;({gz&m+mdMs-}@P%7xreK9?Of2Fio^0<8=T l=d;GpV76lDGR7GTgF9zvx8_p=tPs{s;M?qtE~V literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00000-a9118234-f574-4613-b674-deb4d1b82aee-c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00000-a9118234-f574-4613-b674-deb4d1b82aee-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e4afd94337d5b1553ec8202a7a5dcfa733619db7 GIT binary patch literal 1965 zcma)-&ubGw6vt;bNxRk42)zjP9}o|{2_AA2r1)lcHoLW@B!u0W@4WfEH}mGrE?&Q# z!w3&x8Gn8H{@`>Pi_q=C5<-2KM1)XG5a0>gAXf3WKRXAhh{@SW-y{JhcU-zgIjZ%h z{VEo*3sBBY50%Cle6YLsG3qTf7J363^O(Zt@t0>mUnC-=UYVQ0IKzDYzPB0~4k#BN 
z|GM>Q@YF1i=cJ`6{h}hr*&_K1iwc1blM~*ef_G#f&Q=2U-hx7ViDB4Jkvq-EEJ09Y zL5K)a5GU9N&B*XEQKgB5=O8?!q)8Dk^47M{5%&dLWl2cx$H3O#*7KcPxteOZXp%JeeH9CX}Y4GQbNTsUnE}O>sP_ke#F%DLJR-;~7 zx?iOhOhX`?DK#1vHR^%Z2U^l>G)oo@B<3dN)C|k2)?Hs5$`syRIM)wUA}VQRQ!h!hImd zd5Dqy^q32zsO0q?LYgnRk#uyi(iC)Vw8 zVB8Nqa2&|b7!fsz{EZDj8I8(v!>;JYiUBpCd*z_7QQN6D>T)8lkLy_^j~uaZRxKSW zXSGRP-EKV8Yt%JN!!_jJu?^K-U!m%>>bQ2Z?5d-xR~n8weRo4;1qDM@6J%}%1wbp* z2BF@xECY%}oraloR1Ki&^v2fKXlJs==u8JM)q17-*+B(|0YKce`}tw9hXkf|PdIct zgb*%w&xzhBA;wp`ryRb7LXNZCuvzj>Ov=42BrouDaJgn@_`%8K^ju!gWrj=5s%2)( z(pYXXpUX~}}Z~srfu$!z+^@=riE14b)W8TDQtTh``zgzo!dUx80qeNzx(`t zclZ09z5LnNC5-R{*6{6nPyR{eungTaRuRgak`Y29QVO1+O=1H-es%qa5y9lxTw?Nh z+x0d$YH#=GC($lK1z$2$4(IW&Zw?>H0%~j~L@D5net1 z{=%|OphM*pTXXal1{AtbuxIN{?A2&-x)wKuS(2n{k|YRfv<$G1n!>O`!v`;H72FFT zKtj|?MtY8@cnPAVtvX)jtsN;AUrM;n)jb(6^5&6@YrJ_d65DPoc!8?GzH=!XtXclG zm7uxmv@JXi;*_k(u8nXO8cN4%J5#8ZW9A^J4cG6j+SbNYw&qb$0WsrsJFS&(T*rfH z2!u}>-LB_Yok+`ombJV6hUY{Qbu$*UEzfg1y-=LW*Y2MjKTfICcXp+@YIR!9N^fH= zih;;_PQOu5;e7SkvQ(G*g41{SRO)Te-uxU-xX}4 zLOqAGoW$13ST(X-s8@Mb@E6d+Ea82T_muKpEBG8L>_|yy@m|FN?nn?J&86^3BTRZC z@x>@6WH&`4dW^WO7Qw7w`G&WG-2v8Z08aQ%Bx?;#csMDI(^;@R3W>v+eXO+v-``LuJ#TgSpy&6SJ!9Hn6&DzlE1L$bQy6k>khvMumD6&35ITL& zv!M2j3e0q-jRPX{k&ZfpB?v zj(AZem{1*_GI+NooM5%<)59C`P{d16fDCmwSGSOMnP=14wx8$emoJ&6vRTTXZuDKx z&fATd(pF|9LL|x{&BQz%kn*DV5w!uMZtz$R`U-M+CfCN#kxkw?9QZ1vpYIFTZjsg zL=Zxqg1~za5*{Qx2vQM5Cp!e5M7(rJ!gB}d_vY_2C03R=(VD_MEaO-}sO=h$5Q=bg1TBTrczOTl%X5rLvez|^+#w*%7g*PRb%kvJgHKOiOXCH(ot{;L>6Y7t$BoEOl??>lqMYd@Z=&wkz6ywW=2 z#q;I)`(xIb9}jgVweyk~rBv^Md5QQkJrl^z(HNPY17znop)B~ZOJXo(U>qlj9LETP zA0^oPQ|5KliB5DLo}3~A9)sYJlp;l3^k%DEARcgmy3XSf&)|s11#h|)acu#(GM=c< zs@B7)dU-~#syK---^q)7MMF3NlW0w?>Yb>NqS>Htgi^(_XEb%bGf^<87zMG-Fl*CO z4=TC=+YkunOQvb)YR%W$Kuc(5qh#p5L}AH#RW*!C&347kbYboC#de?)UP?)e83D2? 
z^~D9aXNU;%geSMiSVkCjbLEZ%bvc=HS8ywkZ-^BjyK&q>o<^~PGQ|{5cqHdY4r_uG zGeuz0%(^3Gg)5eINLlL+Yn2Pb3a%pvZj}QfEedy}#BjwDpR`~?_8j5?5Tjmr%v}aS z9so0_rf>%AGgQ{cYwBYqhHs+GIu94)`8b&#$IUPq0h3`UPWS=Xd9tWAiuxAAGRhnQ zs?9^4$f`pcCk3#8fcXM0ycG$r`JX}wE_*8imrYy59qDC4gQL^_~_dPLE*rWJKoh5A>VR!|=4R=r}@#8_VGQ?f|xK4np#GZg5W zxrS~lmibtz>b9z>wkrOPEXel!tS%49b=zu`ZMjEw+M+HG-CK}pGr_Bh2C`=c#jj86 z76`S5VW`j`@(}FQdt@If4=pS$^#q$8Mz9^cRBF@B@AkVl3;^QRo8KQcdPtyJ^Mn&$ zh7jV(&2!kVNr-XP<|!vXKq03-V_GzM|4qvMB_vLGb#T3ErM<#Q=agJt$)&qWjf$bA zwNh_xAfL+)YJH{te6L>a8_Z|5f&Oe+Q}x`9oZg!q(6o};EiJ-V_Vf?EjUhA%|Dpc^ DUD2YI literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00001-db3fa6b7-6267-43be-a1bc-7a81e4a5ddce-c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00001-db3fa6b7-6267-43be-a1bc-7a81e4a5ddce-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e78a4c29def15715c0bbf5fa7ff6ffceb1d535dc GIT binary patch literal 1951 zcma)-O=#3W6vrpY?l#tK{pt*fEV=|6TB%`|ZMBvn)q_Y`7xgMqvQ2Ckvzs=XRiqXK zQ9(R-wKq{bNlzkD1QiuLsvvmq(2HJ#o<&4_Gnph)719D zz#6{$^xDTQ1uVm34yy?D?voKhibO}ycH=$x#l4?jWWl7b6qzVsa?G!D(DUbyA8!+^ zVo-RJfeN^Yf4zP8WUFA(KN^7~0MQ?i`teTu>el+5tYFroaWhz_n3wP0EegvqW$*QG zr=RZGGR5P0b942V=q!$hI@5Q^x|X3-e#5d(xU9e{=^7mcF+<-1Y}w1CwrU)?t|e1p zmn7+$Bng5V#|ZZER9Mci*fe(Ia)GG$IE04u0;%CDpRGuVxGv!uS65{`#itu`V!Nh9 zwg9|+BHv!Hg9|h5*?G5V<9>wYL0OX<4#IhuWLkF99Yoav%>`Xi>y03sckIi9`Km|7 z42Zp+-?E${f9y646ch4r_aw&yikVI&R~tB-bV-vLx2 z>jnLasX|Wmv78F$3=v_O$mAF~ZmN?}rlOIcA(pr32~GvF4Y2~`LkMS^6Xrf8;PB zpIBt;K+N#agmYxfX;L%SftgfNIs^7URME$s9`JD^i;p046?zqMADdId+)R=wFqwqH zgbTpzE}|Y%)axuZkhuY!{1)i#+muJ+=R~GP^k{irNGh-y%!Aqkx*ZuCebJGAI zZnyjWX_Kb}s&!A;^i@b9T<)GDeo0bHq;^l){P9US?Rh_-$@_0o&MhNtn%BXBW>DmX zQ!HEMid8NS)jJK(DLVDh@_40OI_!+q4^>9p*|Ecwk~4m&H0#)I`C!={EsZ-)-5%Dj Rz!&!K3*FBmbPE2Oe*^i7pa}o~ literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00001-df7ec87d-5e3a-40da-be02-282f3629694c.c000.snappy.parquet 
b/kernel/tests/data/cdf-table-non-partitioned/part-00001-df7ec87d-5e3a-40da-be02-282f3629694c.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..6fc4437eea121e659f5095870be83edd31aace51 GIT binary patch literal 2175 zcma);L5LGq7{}jCX0y{6msR&YW+2!wbc4b=BxG0Duq;~-R%{H~YmsG=d1gZ=$?nX= zBFjRtN<}>MP{czJ^eAnSvIh@)sFczoJ=lW>p$Aznf*@j{^i=%bn>RBr)kBsrdG9~p z`+eX0-uJ!i*^fV~V1#F|f$#kG^y|F^tim*lb%b&ssR*HrqQMii0xsfvKV1JRFPTiu zrzW2|Ar+*b?!LL1k<4OJa3&#LdGhafvPn$JfHU?7q$#`~|8o7=w-XX+(uf=?BA|zV zKHiXC$CMARzC8Ef;0G%ro?pND@xMKiSsafNIm+W{{LkNiJkLoc*%|H#nfp}J&{zt@ z7gkLI6E3qYgUoj>$!wb-^WCPh9&gPwlA#1s6w^=?Nzmg>fPFlaUTZ9P@WS^XyZ{0u z#H>_g7Kn~dLac1mzzs1PDv7wOB-XBL7dq5~We9}5 z&3@mbPA}GSpyl2Epy|zjFUtbnVRw!2^1lj7lD(6G zWV3dPo-EnvOy+Jg?u#r-_GK-tg^}9h)EZd^i>0Hi8IwYK#8%&�L?%r4hBHlLy)i zH0hjS^!>K&tUJ(FHt#U|BK3n#zh_KT?FGAxjKl9(1RI_W&A2h3q3!p-w7WEPTqksl zZEe$HO<>imAoK^V(3-P&w*^-1^G%EOD7@;pAPY0-DcYt!2)%*lInZ}j4OY@QD~4LN z&8@AuWHDzX%fU;h*B*a$d?};>K-_fv_38bQ5|}nVVdTO|AwnLXBVSS}CQ`?zjNWT0 zCs^zI?C{1s6!`*_AY&cQ`7LCe7S(j5>la1&6)Se7YFCPfnuCt#7TxB2WwBZ*A9oj; t$Ex$RwQ#&zb{CJ8TdqSZM=Nx`yy&`3=dgJLegfY6L7!w1It%}Lyalgf(K7%5 literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00002-05c18098-92f8-41f0-89d4-0d73a5d5b971.c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00002-05c18098-92f8-41f0-89d4-0d73a5d5b971.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..805de42f30f413008e3ba11337ada86bd7d44594 GIT binary patch literal 2175 zcma);L1-IC6ozNlD`}&s#z{THE{IetwMf8OBxEO!Fve{m4M7!YuOXO~b|Z_fBzsp% z3C5Q|X($f#6iN{}^jJy>#fKbxXeosd3T^K`6gP)JAcU0m*!I2I+1*i+gN#_s{QKs6 zZ{EB&@|AZ!s!~GdsX-rp_2e*Dq$*6aR3{|=wn_+@QL^v^tw>As*E^37XC;%_rN~4B zllPp!7o_jMesOn3GK)#!2MOu#z5niLl1WU;015wqG)qs?pFaQNi;QHJGa`qI2WaqgfVxCY$+%6B`EFBLkGEzT$xwnRifJf{Bu|c}a#n{7L8w$^Dtnh*EbFV$@kUQZPV>W#W{^3kAlM!uY(Gcm*$*1ENx501 z1wrC_<*duBS#AOgiy?XuTf)vOp_gy681MmJN5acX6_h0VUue1yR?lHGCWZ6}SKrOiQ-pn~F}19-2iiO| 
z>7ro_f{yKMInY)%?=bs@7lhqG-^kSLWxGO*Gp|^L+rACWxIOZQb};zF?s-GUb%u_y zukBda1XkS&hry^lv=%MiZJ|}ax?^FF!mF+evM__5@;Y7sLVx7@4)mQ>hn3!<6+^B1 z&hGAFvY0cH<=~~;?@Yfsz7)~`AZ~B^_38bQ5>T6-Fmd6e5Ft;`kuRwf6RFcvChxVB z6K)OyJiI9nMZN$f$W(`OZI>9AMKzu61tn2_rK(-6+11kN)~M^dCAYOyU8z+o7u@C6 t`P!1#UcOMPxGU!?ZP)Rt=c?XPWyN({&KYwLegaJ|9c<9|zp(tH2_ literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00002-a34b39da-a6ce-4459-8ea5-ddd162a84e94.c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00002-a34b39da-a6ce-4459-8ea5-ddd162a84e94.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..cbff95901324050e4cfe4a48f5c363ed1e3e2287 GIT binary patch literal 2168 zcma);O^72!6vwMOJ(=E^9cFc^=$0^s#tFMJHVK*8oeg0aS6G%MhSjUc&`GzMU?)A( zohZvNu&jt6d)U*UgBQi4xuurB@BtOIK!T|J-WG@ z#K2UuWZtRhaJS}qUWXHcP^rpPc7yB&nWszaD~;Nu z(k{`WAo0C&)-~3uw3|F5cmpKN4&IkJ-y-L+MqeP6BP9zdJ)<j67W$xM}fJVqZ6=IAD=OrZLS@G*U7+)a~h zfk_i`8NLAKzcT8!4yfCCYLUt*pb7#tb=B#Ko&qV&{;Y=qCpD3&vX>9h%V&8E_!u7| z;pJc1lOFbQ4h|bjDQ@wwvysf5AorI%9`-FQ%Y&)Bm*BM5u8v+F=f*}S^!x4kgx2Sa17Zm-)_VqEx-MYQMJP>6eDZ)AtT zJNCdEIj%Euj4!o)3k$$%SkWjPcSqK$#Vakc8n5qLSf4P}cR?0rP)}aZ3qcr;ecyqa zvl_6{TeT9X)!08cSWOpmM!Fol^oPCKS0@ib768QU&AvW+J2C=lvlFHsn+zi4**Wq9 zm0@CacFOddmT{t;AjHd?@lxc+PlC*JId2>gqF)=5^OE*Q@TvrE1r8yxPT@w^rS7-Hvm?yaUhw|9;8Kc|vZ$e~y0u D6@kuz literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00002-fce8caa1-4b6f-47af-a2df-a50985051257-c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00002-fce8caa1-4b6f-47af-a2df-a50985051257-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5bad38065767d8aacaa058f0c3d49407e0950088 GIT binary patch literal 1958 zcma)-Pe>F|9LL|x?&@TlW%(X6u+%c-qF}==yPJQIum};^7V8=zv(8JGW_NXV)({m! 
zh#)+4>Li4RC_=)6hYo^NM9`^DNe2lJU6SzJA^N?UH-AitmF3NQ-}$}I@AuyC_Z!Er z-pXKvJF$${zdc(M5?F*~3ziUSyC@=rB0>a4&=NR_mw&8151X{-T$3o6T-WC)Cv9(U zZk|4dWZ&>fA76f5i-k-)663KyAhqMO`2FMWPr_!cvg=Ub0loe5dG;8}g@-?HzBzwl zn8)+s(d$1aLuTH|1aPEuh>9F#XXFnoD#Vl7nLu`q7LwUHKz5E3$dVVkBKy-2Ob`@V z5JH0FMG5xaH1s;^L?=5BE+>eDM<6()BuEh#`D|72#XZ4SH$*(l4Gu(H;M1*$YYV`Y z(Rh7Ex9&~UC#Pve$L$DDhXvV zB|*BBe2WD43=v_Gh~yd>Nl8O)uH2EJZYOi@3T_4R4Y2}brv-PC?&ENJ95+K`7)*wsIN=9i=NU$=Gt{RT zR#EBzP;DG_D60->loY@M0`|%|n6ZG5s$nnhn3sJg;H^k-&HogNf7x3RxNO!U?#M1X zScSPejQgquN`dS~h?tVd??@-KP_Hc4tSL>O(V_k|rxmn0YSqiNsvOH|c`c3Po+B3Z zSyO|qnQc&8vugLX3bl1Zw{`hfWI?s(XQ(=;)@`dXX{)`e(-w7g@b-ernh9Q&4UoAR z6hED!76{dbY3k4*>LBc-y{ZRQ2N#x>di~7~!`}{G%GIgncYEC%1ORc<=JyAU9uSz; zJmJunA%JkXc~1Ci5@1}ldCK7rP{65A*DRL2|0d70siHIh literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00003-7920983b-c6d6-4703-a93f-c1bf180e7008-c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00003-7920983b-c6d6-4703-a93f-c1bf180e7008-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06067bbfef819108d5fc05dff5330ba513e798b8 GIT binary patch literal 1958 zcma)-O=uHA6vt;bNxQ}vzdFM%1PzM~Ez~6;sn$@WdJxeV)T>C@WXCkPN!rauky;Qx z5K0do#8WSxgrd@e2M+~N5y6ul1igs#sz{IGL4312yF0Za3Crxvf8YGxn|W_0XOEpP zV1$RTiXXoHd}~Jr%P<_k3PQ;PGD0XO#i0c)gR^+$$>NQaV3ICHCZ|o8a?*O5;8F_2Soi3Bhbo^&Lt)pqC%s&k3Ic%Kkgw&OP6^ zZHmY9=I+zq+Xb^A9tj+2yJby{vo-P;mNgQ{Y)v40OA}AHl9bz9CkCd>X(*r)xHCF`|%dForRx}b>-x+JwV=YTjSmAbr!7WQbWJTe(j2MkX;*CK}$QO^e4#YSQ zPuR;a$aP?b)fDxB{SQ_2@qqbQO5md?w<^QNa37n~#=)2qAsnC=~mu3o+{ud~AhpPClP z+zg7JHmCzayJy=b91y(>GwGNfK=tzc!opZM*<*y$!ArB<=zn%_dZPd!ZrcC+=%7ag zru9$QbTdQ{F89w7Z<7e)tNl|pUqBJ3JJWGk^8TBYiwj6S#p~c;%gOS>$rg-4(I{j` zYQ3gyWvyDNFi|Yzk67ci!^IM~5RgRZ82N zwop8Xc$D4*K@sUeJQbxBJk~$J(nD`T!Ct(G_+~QMOl^@}mdwm|-hAH6ym^!Ht9Nr4 z;cl$qm#;T}CDT}jVH;Kv>NqDOgc4E`o}l$$18?jtFC_$%lS8q|^|I&Ban#E9H=o-D zy9m{|%}{Ba!5@EaeUb%}&i)XkfH(RBQYSu*H&%9@i$S$(p_v5p6!YcV*6b0Kvrm8A 
z`P6f4jK{ON{Ql2z!7PeL0!Mn6tZNE=N#0;tCtOalm(sPc6($t=CScEAD7IIl=yk2x z6;?@-u1S(0s8O6?A9aOkg~g|_hRbQ9;u{bi($l1f$9Zp6YKprOE^>8E#$&v@FXIC5 zt|vlU06x2!s?FHJ!-?ADv|F}uC&F^CtjV5(a0)s~#V)(OsF0@lpfhU83+mI3J=dEm z_*7It?C`6Vsfh=k>%%ex!iiF~>brI&(mFs(In_qVcO!|yI%{Ry_q|Fz6nis;cV|v@ z0hP#lT3;|#NSHp*roui$L|7&=xkhf7>S&m)@JY~lmb-8Sy8;=9SOK!#hP%lV1zX50 zrg4gs*efwsfsC8QQJz}f04+>4AB((w%3D+LC1kEjacIpgX$H6|L4@>Rgijh_JQ9gZ z?U;~V7U3=s6&|9npNx5n6wO^=HcwRC1NITB=;Iaj@gj+@Aaf1M4RId_Q^eeik})tD zg#!wI0JBy^tx?og5*x_e2UG_~9mp&r-6RFDfPkYT9rtL!hgH$bSL$V-1-wqL8t&zv z!_wUB^#p7-EtBv`H#;7Q+-=JJ&<4kW>?DY6s^oW~8_K9p)2hLgWzX190~RX>>yaDO zylO>D=B)uMi?j=eENZj91(h?~aO+l3eQcH8y6xC?TiZ*_8}+#v*BCKs^`J3XH~I`# z8Z~3&-n>Bz3Z^^L?9PvhpF`?Qz<=`b0b86GofF|$1Nx8p>v|Icf zTr39}esD55E0?!&ne(NF=Q|mv)Sny5=d#1jKxr`F?@kU3=d;exV0O~6-Q4Az+n*hB WoRWP(Ux1tT@E^T}uhTeusQ&^3tftNY literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00005-9443cba0-84c4-4dc4-be61-76396629a546-c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00005-9443cba0-84c4-4dc4-be61-76396629a546-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f41d4ef4baf8a12f267daab98fc7f6f80554a6c1 GIT binary patch literal 1958 zcma)-O=uHA6vt;bA6;V%Ry)Hk1PzM`EyN`uiETrXs!&Rap?Vc5o9vn{ZjyGBD5Wik zAVO)ulLzTtks>{M=)n&J@uJ>(s3OIKV3A(Dcu?Q$cWjY_Fgx>~H@|r^@4emW3s%uih$GW8fovYlC)0C)Y#zsxImdT_ce_3q!w5XX z_yo@J66~F>Z#rmt$D0pZjuQ?~fOANQlQJ&by>-SFcNtgRWbwFdu+QR>-Q5aUwg6n6 zjJD=vwOTKqagY7nV{S zgi9!dIk;!=2(yGG7s*758?$0%wFC{Ck+Vi{D-dt+6(HMTJV5RSv4m3PIF8yR^GGIZ zij-4jVA050EoFHlKWmS&Hci$#R!fQA$WOe}hY9&?5_f?ZwB7UF zrR(G_Fx_nOdcZzJ_3!bJ?y(xe=TT~tg^RKG=njv$%@`R6lQBq6_ygE{`cWGc^)7@( zl-dVW#71>R)g(=l5?Fx4PC9!%>hM9;zsoDS%bw}*R=~UFe-g#L?5zM?HZ2jWrI+oE z{M>EIeHw;TAfE$-O>yK`U;s)e&GSv8CdqR$E|fJJLw zm!N9qJ8D}pnzyC8+Ljf$E%QGDi(-3WP8Ew{t8H|uZ80sHWziCg*A_)uOfaP>Alqh; z{Ax`#KxlMyU4{Y?i?C8niw;yQE-o*p-Nh!uT@Jfwjau)ko$B=*0Ke7V*L#KT5vbNX zq3g}yL2S8qj{j}qVJx+GO7{!MhPnpBYhiF=4y7xrP$7*2EUoYHevZ 
z#W=pT6ig32_0WUKF^AHC38hdl4IwulQph#I7#|8LIhbO6XiMLWMyqig!Y)D@{pZc^ zy?Jk*cH#Q1Dn@t|>-gW}zaM^B!YaJxv4&9LqKXj8C_``ut&A_?-=4i%lPu2FV~adk zd`g4Biv()(<-^~HB*O&N`;4PX0NefR_{V}|GCYy80LMHI4bMS<$S9E}UJ#=# zr7s>RxGB_a70-&{p^6(~xRZ%(0r=f~FCRn``mhXvaJJRy_|yp!tpHll?etncO(f=K zIBGk-?*-jh94j||I)82is6;hN#(hhJk{OeE4U!2FVU?)l2AQ+8dR(vgPS7P@z4#5X z0yT$N0kWURqvY!>wvp8=;i4e%Epb+ZELhEYQrqMSJW=gLk(oz~xt+yR$l6lU@cL=# z2f3v{lAH_4z9GrvN$;9s$H$w)$*lQ&-2}~+nT6O*Q%9k?qus~ZGtXNUaM8y j>8q7R*P+#^DxIiIyKc+5Y}|*h?dc0W$|1A>f7ky5OlPXN literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00007-e6053215-0e6f-4891-8dbc-812b7274a4e5-c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00007-e6053215-0e6f-4891-8dbc-812b7274a4e5-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..071189c3bc74eabad3f6d9104d95391e2dfafae6 GIT binary patch literal 1972 zcma)-O=uHA6vt;bn{=(Q_|X}5A!u0Ir9xd2(r_1uj=S+x01DefqlgJt;Uw zti|gAD+9DgPd>j)3oboFA*%~m><>si_%wd^@XKS7)TM=P5{xs<+ue@~|Dc>(`+oEF z*<+JDpZDuqzmE%UQ9cq(q@R*?O<}9#FD&bX%c*cxy2cG*NMUP%h+s)^gc{|qYi(1A zk|bS|BtcN4Ou;`gg<&PgPzzpM&JYz}g#?kFA!R(xtxc&d?n}7L)omG1a`QmOC2sB{ zLSF!GTuV3RZU5e##>^b8+PDW{xnI^~*FiW9hElVuv>%l+tRiegt+;-3&as#J(jgW$mBA)YO2MsUg49VfuMTf7wihu9C8K7UJCb;2MV^3S-XiKXLw!nO1DWhD=;o+H)eT73NC`Y3;pkY$7EAc3D$eqO&GIWqcqb8W^Ir$1 zz1cen*lgA+;gfB4{3UYt8242Qjs*FdAhM~F--%vmr9n-r`?HokZ$lqgK}%SR)Ni=; znwBhB!&VMy=Z|%ainrRw?M$jG& zW8&7b!I}z&TnA+C22DU`sSiS}<#{$ViZKC^bkK;P#>DdK>R>xKV6?+w7PmIrd34mp zaRQJx?L0ni_?W=7&IyNZh#11<&NOL#pe~WT^6=_ra99*dS zS$=S`c`IMA^4Wn(%k`YBQyI#S74o@pXSgz27@{-7d*)1lm$<5cYP S`U>2(M}O#L5}_OL|N0N|aH<3V literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00008-6fdf199c-d268-4ab2-9288-5a6503bac41b-c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00008-6fdf199c-d268-4ab2-9288-5a6503bac41b-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f285aedb14ccaad38f75eda3b3abeb273d1c641e GIT 
binary patch literal 1951 zcma)-O=uHA6vtu--ag(&0L@8~- zOBKXJ&%OGA2gzBA7r*Y>gNS-4B3^{xQ9X!nX1}H?lCaFq{`bxAy?O7=W^VRk9wR(} z75w?bxA*A`7U404WrTW*o<4pQ;zKH}Tvz8fv-hSHnn&hlP z(Dm~Sl)+j2=>DVUGH231=7Yd3`vIvRAHpy0>~8X;eTr`;zOoOe5_F5f4T~xXWC^yUDs<$;1lM|!Y)gp5T>+N^wJqYAVA>HQ+il6W z1>oJY>Bf@jUAxklU$mUUzM?7HYkqrLB6a;@Y7^-SjuArt3PjrY{axc#g$KyZ1U8Uf&fs)F zVq0RYv!tY#i>$D001_=SoAaD6l+#Y&<4E5U;*g?LM1gGy5E*RI5Lhl>w7#P=Wui?~dO0r;# z8#$zm{AJNtb`3Z&%PqTUc=a1b)oz-W*))}ow5l~%mTYZWYc##qd{Z0M*pX;x)0b8? zdXV6iV}Tr)LF3yCwg*D3<+>&u4s9A%+M`+s)uvb1*GHqpj1esdFHUWt`_gWR<#nW-3;N(h)yoEr zlg%6Xf|1V-S6Ys1Wv$9sezK6yOD>w5P z;VvxWjc;FmCR131VF#8Fik}w|LJ=VjEodp6#?Mw?uSB_!WUm_%g^*F(DSM>PJ1e^} zF3cy*Uk^z82j4e3xe%Z9`}d>Cdsiz0kdpW`e)st6Do^T^-GuZ#HT34w$GLw{&aLd; z*f@J^#Pjq1;j4q=T$t}i0NctbQIVr`jot%`3h`ySCXm_EIGL^kWVW18mVEDu9CSID zASkjRaDwE!gm}Npy^b<>GJAM(ib(hxm_tg6lyJ%GtqFm+BLwQYh)2ADA0jS#-OY#_ z3&1C1iN>t%+?i@rX3Uz7lL(95qAXSogcHz-*7cg%jfyGA$KFWgs?(e?^o8z3(V}7$ zqT-fapPss1H7%HiKsZshZOhc_z7_{9Vc4y*W%?3LlWf#<%c|C!uGpO}zC3gC6i|t% zq?DzM1jnWHbx5$!;0TLEB%|b7MjCd{%54eiVJGK~U{~P0!7D(vI&c?x5XBnGlu|h1 zk=T+LYn+raC1BB$bz8~`M?7nfvep@EO$coT+Yy*s6M#sI!fh!r9Pz~aotThqM%)2n z)U(I!Wngj#m_ap#Jz)Ps<#RlxIiAGuWt3SLVPm`;hmVJ)873nTG7QBDKLE2Qk9tW_ zA7WTVnIC|Pd#FRF%1C3R2oYe|FXymF4IfqcEN^L+J!W__5^nQfg%WJ`W&}2yUJ9ac2zs9bDHg7tOs$G|31+7oZA^F0Q zh{l|y!L6BVnN7{H?`k!(sT+Dzm-i!!YI9-MREN|?(`i+jYF1^}qM;7mT2$#}f>%`o zWG@Vg-<&ob5b7<<(&2)rLon0Ksyvl^oiY&4$W$`LnCY@m6{3+3-`-s+ z2Eni_5f!1^_c21KpcG&P%^+2B@$2&are&*{T8dzK!p;TQnxsv>{C)fJ&IaioPi@;^ zn@u^3MSnVpVsFewLcfc3ttj`%UIgnD7KtpGASdXxyq#lcIVqzUR3(RiL%KnGq?gT> z3K_~Nn2w*8qbUnt^`pV57&B4=aR+PIp~;l(3f2uodjTE$OQnC53iP6;X_Ihdvnhk3+e!WHa}h?~B+za(?3P_e+Fs5Po%uGj>TD)r zJDj|>$0BBMmzeflnVYg3X2*=;a5jj|nmP1@&m*(*d~V7P!Y7Y|oY}k}79$aY5X^j^ zxjq;mQq;^9)a=X`i`sQ?!nh8HUA$m)^XSbhvrgh>lot!69c9OUFb=C(ovM!QG;F83 z>&`r%SGn7C+6||Ez+3KqqbUZhgGQaV_v-`Bg!9l5&3c=2m+k3YxXI~*{=uy~f&c#s DSH1eV literal 0 HcmV?d00001 diff --git 
a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-22/cdc-00001-308c0cab-92b2-41e1-90bd-9416b10ba6a6.c000.snappy.parquet b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-22/cdc-00001-308c0cab-92b2-41e1-90bd-9416b10ba6a6.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9420cd532b0856f38fdc2c35142f7443bd5c4c01 GIT binary patch literal 1028 zcma)*L5tHs6vrpiT7p(tp)(|sLkVu!g&G>#y4})?9u`@bvUn9Olg>07Ow(?XDpCrf zH@|?FUc3qmegN+tymTQe=Dk)_cK{H65d^mgis=8sh(Oz2~vP6KbO}gaEpLb{1Hf*F?oWxa#ciZ0^`rkLKR2bS8;_Lq$m?WDWSR!|r8400kwKb1(0zQ)8x*1#auoRk)j zvJUCM8URanBO4`az1n1bm8_2`Yo)AV{}rD?T{^Z_O262^q5|0pZ7#YttN2+Wl>Wfv zDWbm=iY=A?R%&n-4NaRyp3PQCen5bj`%dN;iyjMBC(_C z3wt6G26u^R-<5?aPs8k)aT3i(iPO1b- z4|tusZKvCEn)|%t?zP%t)Y)$}d3Ucl;!HSq9no%fId|Ev-iMov1!P~9!ilAidWGvo0+7Gn@zeOiiCjR zj}UwF2k22ede*bxMQ`5x0YW|a?nm2|y%fTmcb<9PXWp+pdhp05h}4Nf-hO)h?Yc%V z9IHe|=*C@)5UNOt1#E-dA)nv<{e=sZTUJJ?0A+vf0IN;9A)EF)0U(Ll#=$Oeo^O3qVoZhd&oGIE`|O}JQZ~41Q`8t^U4UkRcdjWRm|dNg;4q( zlc$LOmMAu4`bVn69GZ$U52hwtFle}$_TGFhg3z7&%C(l+F)gHQml)xRXTq3HViB3a z{DnCa5rZva$`5I&CzH^7dKd-qIMSQ?#1$S7_1@#9o=gv}Tn=zfbNxt6MF4;wdmeK< zAV8#O>gS?*Z@F4EFM?Bzi*R_w^`}>lF0RXE5?51svO>zkyzHG>(8&ALuuZ#V+KnwI zc0JzUPTTIbY-^8qoZVJij5~WRi+6XeF=xWQZ;Q6o<=kQ0Y9F>Z`=K+~xFh)R{{hanzyiNsR(iU0q-Z-iYr2BGpkS~kLiAIP6zMBW7;@ScDP*K1 zfrNEP+fEx;sut;}Sc}y%>#JhDPgpbP)*SX<@(J{W3TLLZ*6Oz=z+S60Uh^zx@H0m! 
z{f^0FM1N}(J39TN6`>Dh!x)Br#z!0q&XU@*a~Ve7FfguFS)Emov9sog&U^-KIvdNF zg~JzYAY%@fh#Nn&iLE+eH|;16$GzAt+s{1di^y(1nb@j(@W~S(XEra0WnYFM1Y_Ul zT^|e(Da-a8YBwj-Y56iZVO)m8E?&^Tdi3&jStM~a%F`(_4zpw58HA;*P9>MQRpyqq zyJOE6CDE<9jjCJO7xnI5wI+M@{c1%t_9{KWrMv6OTBRXGm+zP@xW(y%{=$trh9CbQ Dzc=>Y literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-23/cdc-00001-985fd824-b34a-4f3e-b7e4-90bf8d04898e.c000.snappy.parquet b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-23/cdc-00001-985fd824-b34a-4f3e-b7e4-90bf8d04898e.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..fe6fc81be2e26c6bb3119d40edd220d7c6c2ba60 GIT binary patch literal 1021 zcma)*L5tHs6vrpiT7py;p)(|qLkVu!g*r60-R+iM^svafl*OYW-Dzj+hE3CMk}6UP z3*P(;y?7NA@!-vaC&6zZsGqh?Iyy-hBQ3 z>V`!yEDJrN~|cn-ms_tdJll;nv$7VVHuHk)fi zs418Z9~7f84^O*M|3nTsDS)_%4eSX-iZICsd?+{3p{49fD0cfNJQ&Ds{BnHpfnw7# z+l~%xF&juPm@1xNis^m2@96Dxcj-vcR{#V|(UfblVhhO?yi?70jhZxk5UYOKMk$k<+UL}xyOV>+A2n1!Pk zY$#(6mxvobwW+O6!fx4798UVNU9q2e(if54dNQ@u>A@#YfSlR9AeI9ef)Gr6pZ9z) zK%}hL7f`!3oy{tj!3pCs9I|-9;Of!K*JV+})hJJA$T-NheP?*v}wr29!ilAibqA#*~~Ou+-%bQP$cB! 
zudp}o9*QR~f`WfR@#4{oKR_=czWdR3O)iD-=ACDr_nG%=4<01?_@25u#@$Ql)<-Vj`ojkxH(VCXj0F z)1J}qrVU9AC?(BF{i4q2MCUCgcad=_T?+k|cq({EIWW3Qy;}m{rN#a(af%syD-lY+ zV)7W#pAyA}On*sLm_c1pCc(&LQwGg8)4rR}MG(3ZU%A#WTc(AS%_T;7;+Zg_lUPJ% zFnM8)MZ{oznDSkk>B$uIjvhuqJdE_Ze&h;|hkED9Oiv~US1t!Qr@4M4Mj`;fk3Ek$ z9uOc>)b(>wy)&E7>leYP#zi>1;`*bjM;BM+GKs6HJe?!uVP5vVaZt4U4z8ts!T^-m*p0YIE+eO|=Ufoc+*m*tP@s;{O4P C)%2GD literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00000-4beb5c26-e34a-470a-a62e-2ecc8dc24035.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00000-4beb5c26-e34a-470a-a62e-2ecc8dc24035.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..918300ab2b1e7ea3b4e42473bcb3a7cc133188b7 GIT binary patch literal 20 ccmYc;N@ieSU}8|<{+N7bgUep~*NZj+06bv_v;Y7A literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00001-a5f1d5a2-e308-406f-af76-3b32bab79832.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00001-a5f1d5a2-e308-406f-af76-3b32bab79832.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..e8e5b3d53e230762c4b02bf23f74db4f35f928ad GIT binary patch literal 20 bcmYc;N@ieSU}8x0_R28K=4Nwp;#dj*Edc~~ literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00002-ddca9e04-03ef-4533-a9c8-05c1d4f79d6a.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00002-ddca9e04-03ef-4533-a9c8-05c1d4f79d6a.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..7a2a49872da09d39de424c2608896318f644853c GIT binary patch literal 16 XcmYc;N@ieSU}Dhzuy-~a$8JXeBd`Rx literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/cdc-00000-4beb5c26-e34a-470a-a62e-2ecc8dc24035.c000.snappy.parquet 
b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/cdc-00000-4beb5c26-e34a-470a-a62e-2ecc8dc24035.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4298341a2afb6d6fe625be454d762f3f5d47d467 GIT binary patch literal 1034 zcma)6O^ee&7@kbmZjee9Izs|El&~9ip@znOY`5g1$Rbio@u*05+L`SJ)3lqUD@*Cc zAF^kUi&qi!;6L!HNAcvzg9w6eKH81#Wg*PG^UU);@7Ivt!)F#jq(l_*_xG>2MU7xM z=825Z?H!B|%1PU>0#+tFs5@f*p=X0gNq=jm0kNXc0XE3jN_J^G2c{UQCx46swwQ z8#2_aHsn0i0{#S3OdruBL$;F&la->qxZdOhc|p@4CPMUMj+E#dOiW~SHBv~G;sj!? z4(%CTd)b!d4ti;$mPl6fOQd;)NfQ~D(zWP+Ma1IAlmo-w=;H&UGe@YY^GW{c!U>+4knfN1ZHq0tg_BR;8nQOwh&SoMs z{mE-{EJ6kkiYZ^ExgJeKZ|gzm&xWC1(N7%V@<4CDnCsCL;mY9vr!>b4#Yp%7cr({! z0~ZM3DJuF>RBz80i^|G5)>yei6vrFg+&lYhu9LW#%F6{(o~C8rAN%FBPi4!rYNl1* z8_XP+m-(P>wQ5%NkT(VgwYnHK4r^83I;ajg6IRm_^=gar0ozw?c+JU&ayf*0@E7<8 D^o;$F literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/cdc-00001-a5f1d5a2-e308-406f-af76-3b32bab79832.c000.snappy.parquet b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/cdc-00001-a5f1d5a2-e308-406f-af76-3b32bab79832.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3f6d0df77d0b4ad352512302f91f68e13f7ed889 GIT binary patch literal 1028 zcma)5&x_MQ6rN1CB}kp{f3|9}_qyjQ`CzWnGmwwHx4Gw+-4eecJ-^dCI62qI;okoO$#KKb(b^5g9cPd!d}C6Jm; zISa*L>I>UtW1-%`sC23PuFyW6kBBxhSoRDC5c^dUVmtgaw%+FrPBw#IIYQ}A zOr9ZnB~fh1^p8}Ax2P)0#2=e%%Ao0H+<)_x@B@3|DfznDG;2uN-CzV~jtNh7HWQ)g zPhOj@2pNnLQ+`NuJ$enjtp}k$8-;pRe`yPc2YUPYT#sH5u51o)O0&ICjD-(?H**{| zbbtV!qN*=N_4a(Rs9rnA8rSX+#rDQG_s)*YDv6t^yjURRNm}-O*RP~~s#vB~H?7Lf zaArHa!iNp3Rkvz~yg59mH^iuUSg-NcL2bmDu=Xv{sI@pBvR$E9sY!J7wf;>CYp_2j{We}Le{mmj;W?PVcMX1@8p&%F0$a(wTR0|ZK-!I!tc z&aWFlaajPBkQ=utAta|1a74|3GJJdU^L%5;vQbYhd9*y>frqxcunV6*z5j4?$)=yA zwoSB6#)3z(I}W5fVb%u*Xj*%!Q%8u=$u{!|kN^_6r$=?#S1!)$E2{CECuKBRunLJf^dW zjO<|a(jLl)<0azScV%kECtY(1Hp@#*0wSD>8PTtAWn8KB@# zJdgK0G+?Bxm={p9HJ!~Wm%$0+G8}ht{lV3vm#@nriK|hb&WLuHm3?Ozl(RmS9ownd zPIG$FQ literal 
0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00000-e8760032-5a99-4d37-9739-fc9d4db24308.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00000-e8760032-5a99-4d37-9739-fc9d4db24308.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..ea2c2b42234071eea2f00c6bda9ef050c478d8e0 GIT binary patch literal 20 bcmYc;N@ieSU}E_C<<1ok;a-uMj*G(pNR|hq literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00000-ed223ebe-3b27-44af-b2cf-91e882f4c500.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00000-ed223ebe-3b27-44af-b2cf-91e882f4c500.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..60e5f4b8ba53c3e6ee5a057068461b51adbc033c GIT binary patch literal 16 XcmYc;N@ieSU}Bio*&5lI!0iD5BUl7< literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00001-1aa06a1f-c45f-4227-b0ac-e70b1e2115b1.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00001-1aa06a1f-c45f-4227-b0ac-e70b1e2115b1.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..75deae3c0d39cb59dd46a8ddd5e0faed4ed7f7c2 GIT binary patch literal 20 bcmYc;N@ieSU}D(TyU3TM literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00002-97dc4c5b-3806-4198-99ed-062c0a337c29.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00002-97dc4c5b-3806-4198-99ed-062c0a337c29.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..9acb3a1d1b2864693dc208c112bd1bb19379bea7 GIT binary patch literal 20 bcmYc;N@ieSU}A_~a$v50fze$jCyu26JT(S( literal 0 HcmV?d00001 diff --git 
a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00000-e8760032-5a99-4d37-9739-fc9d4db24308.c000.snappy.parquet b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00000-e8760032-5a99-4d37-9739-fc9d4db24308.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0169eb7b25f12b0dc95de5372959343a4a62c560 GIT binary patch literal 1041 zcma)6O^ee&7@kbmZqR}iIzu8kl&~9ip@xRGyWJ%hMHZ1#idWGx=}fy})3lqU%2Ims zKX}`-7cX8t>&1T{=-HEa@F1w*n~!!Qxh#a4cb<9P=lvQoxc|f=h}4Ke{{H&;=C(#K z9Lq#T=;jVa2$iI5SOKe(9rFIe*NYsZYNw1cFdhoucSGQ{Ntb;3{rc^#9PutsiB%vj z=A1=hIG=@)J7p80-odJ}6$i;Sf>jEK7%r!(8zXmBE$Ao(D5GmsCMO_?RE_q@AhRwd zGLB6k^&VBiISbB);pkLM87Tv}ixup0L@Hox`D`k7QBRBMF;E$fPMJRu!{}nZzMxpu zRNIiDY_%hop&IZfm}2^Xo*1&7)|jpo?IjhbC&&x>1~Cz$A4{Z0UtwY*qpy)lrj#U* zX!Yp8=-cbYtas4MTD3;n+a literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00000-ed223ebe-3b27-44af-b2cf-91e882f4c500.c000.snappy.parquet b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00000-ed223ebe-3b27-44af-b2cf-91e882f4c500.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ff7097554e42a7a8c015442c56cc6df254c343b7 GIT binary patch literal 971 zcma)5O>5gQ7?z#RGbOMMB4L9M!L+7@OvE^8(~!%?C}V^WavdWmk(Qb|PHHOL*Ua~7QsxT81A8Jw-jr~K{8TVSPS3*STq!+3s?{ZZr`IV=C}z;hZ7Gm# z&9&5AtM~{xYvo$>MiZ%cj|_kT>Y07TEY)u!+v@L2n2=9WoqSP<>geQ$QUR}PhB1pK zl+78KN>hL7D-p&1EHv(Tv`Op8IQ)kZUj!7c;$k5Z8qHqQsYn=S2{XPbOIx~ax9vEI z7URUO*)M$&@YrrYU)u79Amwv_bDAF}Vj>~{!bK3UQ2+#}6g9gLwcE?psgDIE|{gPgR$?9(Ah+qlF*vDjzl6mgm+_cyn~@HN?1i j;?;TUxIX4gxJRyN)LWd7*rC~hmzsU(JG`ns{KkI)8^hyO literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00001-1aa06a1f-c45f-4227-b0ac-e70b1e2115b1.c000.snappy.parquet b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00001-1aa06a1f-c45f-4227-b0ac-e70b1e2115b1.c000.snappy.parquet new file mode 100644 index 
0000000000000000000000000000000000000000..9d860c43c2d88334ffbfc4f9f85757b6bfa57471 GIT binary patch literal 1035 zcma)6O^ee&7@kbFOVENVbcRHNl;DP4sG+f~+bz8)xQLWeyo#1-XW9*$q}?P{meQN( z9}v9l&7&88g5t@$H*cOih&K;<@XbfJkz5wS%)HM$@AG~Q8Qgnp6GSRRC2!t+{$0`u z26LMz2wk~_5kj(5f)%iv#2}x3zJ4X=DA(E>$^*}F&w$q?ZSv*M+xJ&9kJ<+38#& zllTav-otV@XTixZ93Kmhk!=8Xv5FmzNEr@WKJ&yb>gh2(2Fkk20yymzY?{?CYeQDJ2Oc zT0J^2``xT9*#>H9n=(7E_chl0fXN+Xo=Y3S{|ZQiM|1*=Zn23)1mYE1U1a66_#qQY zzhm+Y(O(k9rb7Qp6_`g&RcFD}Vsi$Kw~`rH&qWYAGhZz=td>VAO{p1Kx{>vSO^ zE112oJP|Q?QB3_NEsc00M#l)FU@?x2nlW*N%R{5{WNE}xge!*woY5RV5>pWX;4fU4 zja(prr>GfgQKPe5t!fv}iN=LHL~;D-<-POIW|PF_RGzMo`Y0>=wii^hK2>eYZdi78 zZ?te+Uge{v-EP?RL*5!4G@4@EI&9Q=`=CDNOxSmA(X6*QAF+L{3$Ho-kSrrKfZxDB DVWa)g literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00002-97dc4c5b-3806-4198-99ed-062c0a337c29.c000.snappy.parquet b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00002-97dc4c5b-3806-4198-99ed-062c0a337c29.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f126fc421b7c282ce0041194faa6c464480935f0 GIT binary patch literal 1028 zcma)*&x_MQ6vrpiT7#Blh0c&j4kfr@7wXW^*6o&FR9IwL%HmbDOghtU*fi}XsUoEp z@16zezaaPr_y-6c^bc4)?ZMOj0qa3ue(biA%R-pUeDi&udGF2S=<#!#AW|V3`TFtq z*=>VhSeA&2(47YuAyiOGFoI@~D*68Y{A^>*ve8T}MX)^Ofd{rGX_Ie%K76{nX1gy^ z+a}m%Q^6zIpH9Na9rKaY?_pgl#(QKhf^`auM2?vtC+L;DnPX@0EQy!f3!@;o}b5a6v3v1XFh?HSc^!ZqBp+h6K$3nS3IOhIH_M^Ac zs}B_GhTgSQC|Pf*C8z}A38t7nq5GEFP3ucXinfz#(-q_eJ%c?FqF)N6Lf>J+kkvCt zIU^+rB&-A4vwFaayOEADYqeTu{fJqg6V_5$!Tw7=fjV?(EtPJ*fq4aX%e6W0TF>IV zKq&o%$ty%J6pAgCUMdwhi<+iQf)V3W4ozoC|JiF9gzm)GZZ%ko)seQ7bA+cJgQGf~ z$%qA$H*72;4tI%bKb5%|pN82n!zh>yBC}==UFnI??7Wzp@d?5wSAd+^TtAW{8Gzu= zJdZmb7$8#C%qytbnJ*T#>)?cO9S*y={^;h>>sMx##LXx#7f3tE%6>Qws#%|^HnSVd zu5LRs*ArFYH0^f7uJ4JKv)gFOL2Iv37wz5pKyYb4vSqX07Q*2>dKYeT`k=pX>yF_6 F{|6~W`@H}F literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_delta_log/.00000000000000000000.json.crc 
b/kernel/tests/data/cdf-table/_delta_log/.00000000000000000000.json.crc new file mode 100644 index 0000000000000000000000000000000000000000..966584a04c4328886ab83c78df27572a9006e49d GIT binary patch literal 44 zcmV+{0Mq|ta$^7h00IDUk%Z=vu(=k31Mq)Kxwuo5lbU3kNk9|X9#G3K!g{%EbSK1$ CdlMG` literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_delta_log/.00000000000000000001.json.crc b/kernel/tests/data/cdf-table/_delta_log/.00000000000000000001.json.crc new file mode 100644 index 0000000000000000000000000000000000000000..557f7ea9da9b9895baf1fbd4db3b402667c4e415 GIT binary patch literal 40 wcmYc;N@ieSU}8wSJ}ctDoL>TlPIdBetu57WuC8cdeqS?(DXgfz=iTe~06g;&RsaA1 literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_delta_log/.00000000000000000002.json.crc b/kernel/tests/data/cdf-table/_delta_log/.00000000000000000002.json.crc new file mode 100644 index 0000000000000000000000000000000000000000..0ae38ad3321724a2861ae841d233f3ae0d674ccc GIT binary patch literal 40 ycmV+@0N4Lxa$^7h00ICkOw`gMt}PU))ua)m%SKJdhWehtli`3YjIJ3LZb0}q)e&?6 literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_delta_log/.00000000000000000003.json.crc b/kernel/tests/data/cdf-table/_delta_log/.00000000000000000003.json.crc new file mode 100644 index 0000000000000000000000000000000000000000..26b5bbb5695c3a6763b6ba56d3a645ea52eafae9 GIT binary patch literal 20 ccmYc;N@ieSU}E@u^2L;7mh27xW}65A07(i5(f|Me literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/_delta_log/00000000000000000000.json b/kernel/tests/data/cdf-table/_delta_log/00000000000000000000.json new file mode 100644 index 000000000..4c635f5dc --- /dev/null +++ b/kernel/tests/data/cdf-table/_delta_log/00000000000000000000.json @@ -0,0 +1,13 @@ 
+{"commitInfo":{"timestamp":1703265018828,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[\"birthday\"]"},"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"10","numOutputRows":"10","numOutputBytes":"6897"},"engineInfo":"Apache-Spark/3.5.0 Delta-Lake/3.0.0","txnId":"d05345ec-8304-433e-88ff-6498dc37ca19"}} +{"metaData":{"id":"d38a7090-96be-4b1b-b20f-b85ad8ae1a38","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"birthday\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["birthday"],"configuration":{"delta.enableChangeDataFeed":"true"},"createdTime":1703265016759}} +{"protocol":{"minReaderVersion":1,"minWriterVersion":4}} +{"add":{"path":"birthday=2023-12-22/part-00000-592a7e14-f790-4236-9c61-120d006eb3b8.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-22"},"size":694,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"name\":\"Steve\"},\"maxValues\":{\"id\":1,\"name\":\"Steve\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} +{"add":{"path":"birthday=2023-12-23/part-00001-723d68a5-94eb-4acc-9db1-e985867a1a6c.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-23"},"size":680,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"name\":\"Bob\"},\"maxValues\":{\"id\":2,\"name\":\"Bob\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} 
+{"add":{"path":"birthday=2023-12-23/part-00002-7c6f102f-6ad1-4e3b-bee3-df831f4abf3c.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-23"},"size":687,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"name\":\"Dave\"},\"maxValues\":{\"id\":3,\"name\":\"Dave\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} +{"add":{"path":"birthday=2023-12-23/part-00003-98b8082f-db4e-43f8-ac4f-56538beeddae.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-23"},"size":687,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"name\":\"Kate\"},\"maxValues\":{\"id\":4,\"name\":\"Kate\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} +{"add":{"path":"birthday=2023-12-24/part-00004-218c1bff-cde9-44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-24"},"size":694,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":5,\"name\":\"Emily\"},\"maxValues\":{\"id\":5,\"name\":\"Emily\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} +{"add":{"path":"birthday=2023-12-24/part-00005-8aeab9bc-7a46-4083-9a85-e4f8d4501a67.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-24"},"size":687,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":6,\"name\":\"Carl\"},\"maxValues\":{\"id\":6,\"name\":\"Carl\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} +{"add":{"path":"birthday=2023-12-24/part-00006-53327328-4603-45ad-adb9-21feeeee2c31.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-24"},"size":700,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":7,\"name\":\"Dennis\"},\"maxValues\":{\"id\":7,\"name\":\"Dennis\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} 
+{"add":{"path":"birthday=2023-12-25/part-00007-8cd4b5a3-b4dd-4bbc-8bb3-721fa82961c6.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-25"},"size":701,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":8,\"name\":\"Claire\"},\"maxValues\":{\"id\":8,\"name\":\"Claire\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} +{"add":{"path":"birthday=2023-12-25/part-00008-436dbf31-f213-4b3b-bcc3-5df022ec6b35.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-25"},"size":680,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":9,\"name\":\"Ada\"},\"maxValues\":{\"id\":9,\"name\":\"Ada\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} +{"add":{"path":"birthday=2023-12-25/part-00009-685aacbb-c7ac-4cb2-93f1-6dc27cd2e980.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-25"},"size":687,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":10,\"name\":\"Borb\"},\"maxValues\":{\"id\":10,\"name\":\"Borb\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} diff --git a/kernel/tests/data/cdf-table/_delta_log/00000000000000000001.json b/kernel/tests/data/cdf-table/_delta_log/00000000000000000001.json new file mode 100644 index 000000000..d6d9b7833 --- /dev/null +++ b/kernel/tests/data/cdf-table/_delta_log/00000000000000000001.json @@ -0,0 +1,13 @@ +{"commitInfo":{"timestamp":1703265021675,"operation":"UPDATE","operationParameters":{"predicate":"[\"id#1065 IN (2,3,4)\"]"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numRemovedFiles":"3","numRemovedBytes":"8187","numCopiedRows":"0","numDeletionVectorsAdded":"0","executionTimeMs":"808","numDeletionVectorsUpdated":"0","scanTimeMs":"639","numAddedFiles":"3","numUpdatedRows":"3","numDeletionVectorsRemoved":"0","numAddedChangeFiles":"6","numAddedBytes":"2705","rewriteTimeMs":"167"},"engineInfo":"Apache-Spark/3.5.0 
Delta-Lake/3.0.0","txnId":"afc86e2b-99e4-4dc3-bf82-2bfd5c6762bb"}} +{"add":{"path":"birthday=2023-12-22/part-00000-cd6a8496-3a3c-4ac9-8fba-035e60e71ab2.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-22"},"size":904,"modificationTime":1703265021654,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"name\":\"Dave\"},\"maxValues\":{\"id\":3,\"name\":\"Dave\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} +{"add":{"path":"birthday=2023-12-22/part-00001-96c64ea1-3383-42c8-bc83-487a583eb01b.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-22"},"size":904,"modificationTime":1703265021655,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"name\":\"Kate\"},\"maxValues\":{\"id\":4,\"name\":\"Kate\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} +{"add":{"path":"birthday=2023-12-22/part-00002-93942e85-bb5c-45ff-b334-3a50c28185bb.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-22"},"size":897,"modificationTime":1703265021655,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"name\":\"Bob\"},\"maxValues\":{\"id\":2,\"name\":\"Bob\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} +{"cdc":{"path":"_change_data/birthday=2023-12-22/cdc-00000-59fa51a4-edbb-4fc0-a497-6969cdf3966c.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-22"},"size":1028,"dataChange":false}} +{"cdc":{"path":"_change_data/birthday=2023-12-23/cdc-00000-fb59d34a-5bd7-4b10-8c41-71e38c07fdc2.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-23"},"size":1021,"dataChange":false}} +{"cdc":{"path":"_change_data/birthday=2023-12-22/cdc-00001-308c0cab-92b2-41e1-90bd-9416b10ba6a6.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-22"},"size":1028,"dataChange":false}} +{"cdc":{"path":"_change_data/birthday=2023-12-23/cdc-00001-985fd824-b34a-4f3e-b7e4-90bf8d04898e.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-23"},"size":1021,"dataChange":false}} 
+{"cdc":{"path":"_change_data/birthday=2023-12-22/cdc-00002-ea0bad63-f199-42c6-bf85-3b9f5027578c.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-22"},"size":1021,"dataChange":false}} +{"cdc":{"path":"_change_data/birthday=2023-12-23/cdc-00002-831078a2-a13d-4713-aa88-7d5f5228d781.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-23"},"size":1014,"dataChange":false}} +{"remove":{"path":"birthday=2023-12-23/part-00002-7c6f102f-6ad1-4e3b-bee3-df831f4abf3c.c000.snappy.parquet","deletionTimestamp":1703265021672,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"birthday":"2023-12-23"},"size":687}} +{"remove":{"path":"birthday=2023-12-23/part-00003-98b8082f-db4e-43f8-ac4f-56538beeddae.c000.snappy.parquet","deletionTimestamp":1703265021672,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"birthday":"2023-12-23"},"size":687}} +{"remove":{"path":"birthday=2023-12-23/part-00001-723d68a5-94eb-4acc-9db1-e985867a1a6c.c000.snappy.parquet","deletionTimestamp":1703265021672,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"birthday":"2023-12-23"},"size":680}} diff --git a/kernel/tests/data/cdf-table/_delta_log/00000000000000000002.json b/kernel/tests/data/cdf-table/_delta_log/00000000000000000002.json new file mode 100644 index 000000000..abbe569bd --- /dev/null +++ b/kernel/tests/data/cdf-table/_delta_log/00000000000000000002.json @@ -0,0 +1,13 @@ +{"commitInfo":{"timestamp":1703886093785,"operation":"UPDATE","operationParameters":{"predicate":"[\"id#39 IN (5,6,7)\"]"},"readVersion":1,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numRemovedFiles":"3","numRemovedBytes":"8268","numCopiedRows":"0","numDeletionVectorsAdded":"0","executionTimeMs":"3666","numDeletionVectorsUpdated":"0","scanTimeMs":"3237","numAddedFiles":"3","numUpdatedRows":"3","numDeletionVectorsRemoved":"0","numAddedChangeFiles":"6","numAddedBytes":"2732","rewriteTimeMs":"427"},"engineInfo":"Apache-Spark/3.5.0 
Delta-Lake/3.0.0","txnId":"179df3d2-696a-460b-bebe-eb911c63e0b8"}} +{"add":{"path":"birthday=2023-12-29/part-00000-1ca113cd-a94c-46a8-9c5b-b99e676ddd06.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-29"},"size":917,"modificationTime":1703886093724,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":7,\"name\":\"Dennis\"},\"maxValues\":{\"id\":7,\"name\":\"Dennis\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} +{"add":{"path":"birthday=2023-12-29/part-00001-8334a9a7-7041-4d88-8377-aa36cfe5762f.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-29"},"size":911,"modificationTime":1703886093724,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":5,\"name\":\"Emily\"},\"maxValues\":{\"id\":5,\"name\":\"Emily\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} +{"add":{"path":"birthday=2023-12-29/part-00002-7dd6bbed-a0c1-44f0-b729-42b7d7d7f5ca.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-29"},"size":904,"modificationTime":1703886093724,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":6,\"name\":\"Carl\"},\"maxValues\":{\"id\":6,\"name\":\"Carl\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} +{"cdc":{"path":"_change_data/birthday=2023-12-24/cdc-00000-4beb5c26-e34a-470a-a62e-2ecc8dc24035.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-24"},"size":1034,"dataChange":false}} +{"cdc":{"path":"_change_data/birthday=2023-12-29/cdc-00000-e8760032-5a99-4d37-9739-fc9d4db24308.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-29"},"size":1041,"dataChange":false}} +{"cdc":{"path":"_change_data/birthday=2023-12-24/cdc-00001-a5f1d5a2-e308-406f-af76-3b32bab79832.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-24"},"size":1028,"dataChange":false}} +{"cdc":{"path":"_change_data/birthday=2023-12-29/cdc-00001-1aa06a1f-c45f-4227-b0ac-e70b1e2115b1.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-29"},"size":1035,"dataChange":false}} 
+{"cdc":{"path":"_change_data/birthday=2023-12-24/cdc-00002-ddca9e04-03ef-4533-a9c8-05c1d4f79d6a.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-24"},"size":1021,"dataChange":false}} +{"cdc":{"path":"_change_data/birthday=2023-12-29/cdc-00002-97dc4c5b-3806-4198-99ed-062c0a337c29.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-29"},"size":1028,"dataChange":false}} +{"remove":{"path":"birthday=2023-12-24/part-00006-53327328-4603-45ad-adb9-21feeeee2c31.c000.snappy.parquet","deletionTimestamp":1703886093764,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"birthday":"2023-12-24"},"size":700}} +{"remove":{"path":"birthday=2023-12-24/part-00004-218c1bff-cde9-44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet","deletionTimestamp":1703886093764,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"birthday":"2023-12-24"},"size":694}} +{"remove":{"path":"birthday=2023-12-24/part-00005-8aeab9bc-7a46-4083-9a85-e4f8d4501a67.c000.snappy.parquet","deletionTimestamp":1703886093764,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"birthday":"2023-12-24"},"size":687}} diff --git a/kernel/tests/data/cdf-table/_delta_log/00000000000000000003.json b/kernel/tests/data/cdf-table/_delta_log/00000000000000000003.json new file mode 100644 index 000000000..26b9aa78d --- /dev/null +++ b/kernel/tests/data/cdf-table/_delta_log/00000000000000000003.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1704559499570,"operation":"DELETE","operationParameters":{"predicate":"[\"(name#40 = Dennis)\"]"},"readVersion":2,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numRemovedFiles":"1","numRemovedBytes":"917","numCopiedRows":"0","numDeletionVectorsAdded":"0","executionTimeMs":"3479","numDeletionVectorsUpdated":"0","numAddedFiles":"0","numDeletionVectorsRemoved":"0","numAddedChangeFiles":"1","numDeletedRows":"1","scanTimeMs":"3157","numAddedBytes":"0","rewriteTimeMs":"322"},"engineInfo":"Apache-Spark/3.5.0 
Delta-Lake/3.0.0","txnId":"ef48960f-ceb5-4bc2-9b59-8c947083ae58"}} +{"remove":{"path":"birthday=2023-12-29/part-00000-1ca113cd-a94c-46a8-9c5b-b99e676ddd06.c000.snappy.parquet","deletionTimestamp":1704559499540,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"birthday":"2023-12-29"},"size":917}} +{"cdc":{"path":"_change_data/birthday=2023-12-29/cdc-00000-ed223ebe-3b27-44af-b2cf-91e882f4c500.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-29"},"size":971,"dataChange":false}} diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00000-592a7e14-f790-4236-9c61-120d006eb3b8.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00000-592a7e14-f790-4236-9c61-120d006eb3b8.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..7ccb48758195a95ffceb5d0040f4b2200480acd2 GIT binary patch literal 16 XcmYc;N@ieSU}D%UTPz}^ctZ^U9;O4! literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00000-cd6a8496-3a3c-4ac9-8fba-035e60e71ab2.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00000-cd6a8496-3a3c-4ac9-8fba-035e60e71ab2.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..124dae08bffa1ba80380fc227ff8c61cbb209822 GIT binary patch literal 16 XcmYc;N@ieSU}9+fvp@9FhuO9OD`p0I literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00001-96c64ea1-3383-42c8-bc83-487a583eb01b.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00001-96c64ea1-3383-42c8-bc83-487a583eb01b.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..031ec7d55e658fc2c15070e5253061455ff037e7 GIT binary patch literal 16 XcmYc;N@ieSU}9M7HaYashuO9OCZq+x literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00002-93942e85-bb5c-45ff-b334-3a50c28185bb.c000.snappy.parquet.crc 
b/kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00002-93942e85-bb5c-45ff-b334-3a50c28185bb.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..41c3d9679b50f7a556562a8b25282ea5478092e2 GIT binary patch literal 16 XcmYc;N@ieSU}C6#wJqqTfanJRCpHC^ literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-22/part-00000-592a7e14-f790-4236-9c61-120d006eb3b8.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-22/part-00000-592a7e14-f790-4236-9c61-120d006eb3b8.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7a24bef8d2e6e3b0fd0b1594754f560924ee9bc8 GIT binary patch literal 694 zcmah{L5tHs6rPOj64s&!oneAO%HW1wh(lu9-EQec4~xiBiXKH|(iv+oO}k0DBBf_< z9z5vHpWsjM=1qT)m*Pt@E(;#K%$xV#_rCYOc{6?VEF_3@2_ru~eB0f02?lqM*a+?4 z#|WVtmIWuo?h#IY{r=%2;@y%(lJbRpPK8OPQA(4%x(;@ zHJXfno$nI{zMmF(i0DU)V$Y_ZtuCna8C&LaAy*QViCUz%)OnFCGuDp8Km>>#{evhj zQUULAvDT%?m+!siTt{yNdAD){KdhF8x7sYF58z>=)g zbDcvVTc@dvQvg7v_W55@KH6-z{#9|!aa9gol5GCJ>X%!@o|=IlEb^XF*b7A%iLiGV zuai{uRNN1TQ5c-6L3|SR^=xn&1!{N_%#_sOaj5&jP^nlRIX*19+Vx#n)f9g0AM;q8 A?f?J) literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-22/part-00000-cd6a8496-3a3c-4ac9-8fba-035e60e71ab2.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-22/part-00000-cd6a8496-3a3c-4ac9-8fba-035e60e71ab2.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f5e6762c81d7862c68259450a176dc6ead959261 GIT binary patch literal 904 zcma)5L2DC16rP>Nu#_MQdBZL&5*8a;h(nenZ5l#Bks_sp;#H(fGGn@Qvq^V16$!a` z@<$Y;2XCJBA9(U4o(i4>@h*7s;G5k|8u3zCX5ahfd*8hG?d;LRrydX}g9Y!e-d-9G zP)v(p60&}u5<=FDHCz#MpaQRdT)tbS+-_!+0#Y7}bA_}L+=1V}K7U_dQ7V>G8nN$E z+bU?i`~}ozn53|-M>v(qVZ*kNNi>TR+hHc`BV9*5=;hIc0ed<`(&3X*JQ309aXdIt zBLPJOH>gE@iEUv}2*pTkkV8k)HBdSpoQQC!j+0lD#Rj9cV|S}2wr{tHS+sRMP)6A! 
zwqG^7S^wF}$YwfCmSJBwKb}bl`%0nAt_~@~bT`PdIeQ6=}O;&HFRy~<{P>IbqS`%SOi@ans=)!%6})u6T8sLS?F eeISMMwmsFXx25chExU_1Fnh>Hye&ug5B>$xH_9Ub literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-22/part-00001-96c64ea1-3383-42c8-bc83-487a583eb01b.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-22/part-00001-96c64ea1-3383-42c8-bc83-487a583eb01b.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a6c9b9265cbdd71e2a9a8e31d1abb4a0aa0dd073 GIT binary patch literal 904 zcma)5L5tHs6rPN=gr$l?-;h9%65O&2bx3Ht+bv~54~wizS-gsrX=m&P)3lqUij;!j zMf?qW@hW)Lf1npn;zhxe7w>{6557sWZNeIE#v!GhNxzrMKQ z0L8QjCLtR)DIw&NQNR^32P*L9?W^adHOke-igHh+3TY*{2ETuO`mwP_E0!}Fv9D9x zDrmj@1=MDkWU#JBIFrd?!?ut~G>a13VI~|PT}NH$=Fyn}`#MC@(cMxq5%J?*GB{Bq z0YwBisYL^cwlF9}Vx%_7k)!DvDD?&>A{we*`fPH(!Km%novMlU?Itmcwyp=tD7(WB zs%B^5f3Y&Mm36bo&=>ZPXA;7`P$;u+25?gCI#9}$G6R{`A?sGVsHMAItXgZ8t!w?% zS}&>AIx|*c|CrB2+iV{hogF=^rISAt7<)(I24NozMyn?KY?N_^HOm^uLoOx)hsCqG z@h2)yf^lSBuJb1Mh_(G6BRLH@j`(z{QXY?=@{vk~6sfSj8#7l=(QUg)8czqQTXP=; zDwK)aelT$5Vo6c=77> zPrWj6F_njNVjbpf-x|f0d`=ag`*rSDw))c`loi=;_^rC{?a5|;x86{L=3d>CtzB;* ch4OcN)$m$U_Qkf{!5g@E$a}mk$M_Ha1%VUF6#xJL literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-22/part-00002-93942e85-bb5c-45ff-b334-3a50c28185bb.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-22/part-00002-93942e85-bb5c-45ff-b334-3a50c28185bb.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..51ff3c55b03994ff69af0002e461618b84e833ef GIT binary patch literal 897 zcma)5L5tHs6rN1il3gk&bcRIAQi2X{HqzpI2tlO z;b0U?J!8*h6#L`QxKU?K<{)F|KSX>MFtGP*DiaosU$Bu(xZnvlzH2i}IcT-5IEki% z#Hv|OeHn<@YCoP?%5~WC1(b7`A0~1rBPfK^AmCmA0O*u8Ye8zYXY+Y&*_?7L+u;;H z9A3YB^)p|IxE{-sIWivRW8WV|)!e75%iKD1tJ~hx4@6aX4YyTyoqf^t_Ua8eXzte? 
g(b{tcf=hSTl?|sQgvWQx4&1)%LGR$EoWRfb7mvipT>t<8 literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00001-723d68a5-94eb-4acc-9db1-e985867a1a6c.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00001-723d68a5-94eb-4acc-9db1-e985867a1a6c.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..7e5ff66214161ea46c5957ad547770fe96c6085a GIT binary patch literal 16 XcmYc;N@ieSU}8xBJVVH^ROdYaBb@~R literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00002-7c6f102f-6ad1-4e3b-bee3-df831f4abf3c.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00002-7c6f102f-6ad1-4e3b-bee3-df831f4abf3c.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..8a645acc6366c86d29f5327a8093871a19ee8bc0 GIT binary patch literal 16 XcmYc;N@ieSU}ET7n7>oP?f!EBB~b;o literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00003-98b8082f-db4e-43f8-ac4f-56538beeddae.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00003-98b8082f-db4e-43f8-ac4f-56538beeddae.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..2e80f1525d9978d3d40e42dfe8080216ed9cd30b GIT binary patch literal 16 XcmYc;N@ieSU}9+YNIxgxcK|7MYbTCv6(#LJw7iqNw7E5L#!K)WUI6+W}Qk zPL()tKfovO1DrW9-k7Qq2QK58H*em|?D+KAi;y7FCJwp&dii79B^bgj zVk30-F-8bAtjYrRkUS#azWn})Z&B_A8%hHxNAsB7$INL~Q2im8Q3xuyQRR)TL6_R( z7_x}DbWBE-2btkXt60eV?dxK4re~5g0o=h3P81?7aGIH%=^Zq3fle`VneW*!5bGg( zi&Zp&DW*sC*t3V5Q8t#MT{GAwf}XGuNQ1~Pi?r!yiwNY6UDB$ROdq3lLPy?sXeKuX zl&W2fe_Q7R2A%H~d4lLii(=2FpR6`)=R3|kp9;BjNib0Jp}rV{Od;zCz-zAV|TM2tm**ug)H`n(YE z7Uyf-h-OaB4Y$W?+ZFuv3rTLWCO|_miJmjx)U8tH*Uu?K}A_(OQ;GUM3jh^cY zfO1_FGAn=pnL6ZuMEP{H-GMp9CrKO~sd09gjPz`LltgNB7|oQ_@xxe;qKQ(OJa9u;vf1-JSkoDN)!$f~m;3+# literal 0 HcmV?d00001 diff --git 
a/kernel/tests/data/cdf-table/birthday=2023-12-23/part-00003-98b8082f-db4e-43f8-ac4f-56538beeddae.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-23/part-00003-98b8082f-db4e-43f8-ac4f-56538beeddae.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..357f6f12ca7274767da28ed79ec6a9695395ed84 GIT binary patch literal 687 zcmah{L2J}N6rPN+1X%@PXPiI|VX&cvI3#4Z-7dZ8p-7cbyo$(VXIz8X>^8|(WLYTm zCOryXym%G-9fJSBf8xWq7-G3PMX(8Y( zPM5k7)%?Af=|(EqNcP!U@#e!q^D(cRYB_0m$lv6;P&FUFTygV65Xu$69W5^#J=GNe z<+3PbRsaDqb;$pU^6_fD4zH41jjMF1l9$u}WxrY@cHA}mXjb{%gnlgIM8y71w#*CV wt85UDk~lh0!)!kp=*jRPiPUI6nkcE`y;u*Tky4r5bwgOP+4BQf(+PalKOA|N?f?J) literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00004-218c1bff-cde9-44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00004-218c1bff-cde9-44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..1b77d1c5e1dcb7711745c8dd891b5406489a4251 GIT binary patch literal 16 XcmYc;N@ieSU}8`bzThFGctZ^U9LfW^ literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00005-8aeab9bc-7a46-4083-9a85-e4f8d4501a67.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00005-8aeab9bc-7a46-4083-9a85-e4f8d4501a67.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..ddf9a2ea89c22c0fe5d374043a1bb54505cc63a7 GIT binary patch literal 16 XcmYc;N@ieSU}88bmv%$K?f!EBBiRLj literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00006-53327328-4603-45ad-adb9-21feeeee2c31.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00006-53327328-4603-45ad-adb9-21feeeee2c31.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..80c0c5edc43fc9276fcb9281fee3b6731b580a87 GIT binary patch literal 16 
XcmYc;N@ieSU}E_7X4`Xpj)x@xEP)0} literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-24/part-00004-218c1bff-cde9-44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-24/part-00004-218c1bff-cde9-44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5a85db4f6b9f12b50b2b4f0c437f69c974aa3cfb GIT binary patch literal 694 zcmah{L2J}N6rN0XODGE>>Fz1&c^o7Ox^Q*%{Z+B)d(rEoIrO z-aM3|H*fwD@#b&w>P>t}##->;W!}8^zW2TN&70}t7Xd+}ODuBz<=fA;Lom48#6)Q4 zK1K*_8ErTrc8@Uf^~d{nTel#0hJPVX(>S@b?_%3(RTbJ0Y*TQmaJxZhs}l}2$uS_1 z?a&FCRyW9upLL3*$WLDvv$rZ0qy@n}Y~ffU(t%4W6RFxmCl0_VwjJAZO&H4_AamQU zU<6Z4pU`9195>@MCq?`9aE%8&K^IU1$uEO+>1Ts*|Iv7*dZvyyr2O{@}Gu+hsktDXr`v4pUO#aXH5 zDu+O}N)izz0Dwy6u|K42wBBsIo8p?|rX0G&+5CUiuQteft_OaQ<~^;j7w{nDL2o}= z#fj|6s2>c&z(10M=rHW7+2APjP-;D40BY5=XkG0^z%bR(>_j|wh-n=<`@-ifd3#eP| z{(5({t}$qRtF`F0L9u7ikH!Ed^chP(gHBlX)Zsy>hrw8>MDBARmR#@oA*|^PzUpt={h0s& literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-24/part-00006-53327328-4603-45ad-adb9-21feeeee2c31.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-24/part-00006-53327328-4603-45ad-adb9-21feeeee2c31.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f716e5ddb434fb0ba69dff366c0a0f1497c9f897 GIT binary patch literal 700 zcmah{!D`z;5M6CtR}@2A@Ge>qR4j=|KwTtcCywzYrG!#~F};>jR^F{ED$9-}w*=!$ zKA^|+mUI3g=id4a{fZuXDRd-DA<#oFyEAXzyqVeA$;q35Akrlk`SbhN&10KjaNER0 z=+Q1l2<;dT;RMVfyX5x!&x?DEC!;N+4UCsMP2-$B#ms7z8TCalqu`WqyQVZo9kQuO zPJx4%O~+(XS`Zn%>f}qAeR`iyKkB(8Edckhg=2+C2aHxK=XwvF*+8e5+01iI=#2G| z*=8k;V2bGrdg_{^ZA06WqW!A9tp_z>GLQz5OH8`-+8_eCW1DnprK(P)bw(%dc+@oC z+@M#_)gbpZuS(4y2029Zt3k1A(kr72lX{l5$Yw$=B}^x(LB(gC{yNgW&py5|p+isS+FVAXG=g8xH2`2&1OlV!n!m&|+L``-87H-`E;%4yet-OfZ&2<|_LMGA9;YhF4Znvu>r^exK`^I~RB*SUv_VsHs7=m+ zgSbPdWL8B`nLOzgYn8uwS7}uMx+NpCsRvvh@Lq>r8M__Hp$rf^{)bUq 
zrV{Sta$`!FuinX}DV0{GVxO(8DBrJ*7>lCJH}g{X;#Fc&U5N4Xt*EXFN{I%zp(R;q z7A6NE+oY+AQy@U6@kJ{t#@pS_zizHIuG?Wsk}dw%{bGmM(`MlZ%iL`ox}gjs8M;UD zCP}rc<3Tuz!r)8~nARk~5tjjfXowaF1w z5qIdAj4BT*!{b)5kooDGV)9PUBxyo&2Q!=~L|Wi9Gda^cXygE#V(xI?vtc3LL-rQ0 zU<6Z4pVK4H9&T3ISc-PdVw(tNf+C;>l52~!=@*L#HwKt0 zEylmi^Bx1wH;X($^n*pQXVXts8`SxX&GV^{3kiyZ5hu=dUL^C3-HAj`1c>eZgDB2Y z0q=0O)TPMh7h zROgV$mT4;E6adhvefKY^J6NsP{!O#txM_zeNjCjo_vPhb)3RHhTm?){kXQA!}eWhZ#$9*_(b>jzcq7(Q>{{Yj@ BmF55d literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-25/part-00009-685aacbb-c7ac-4cb2-93f1-6dc27cd2e980.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-25/part-00009-685aacbb-c7ac-4cb2-93f1-6dc27cd2e980.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5cc08018718322083c38a42b097cd848df07d763 GIT binary patch literal 687 zcmah{L2J}N6rPN+gt8QbopAy=gu#Xu;*gMCyIp#*hayEn@hT#dopB9jv)d%A$g)Ro zJ$Mkj2>uNH4gLpj{sC_ueUrpm@ZjamyzhPQd+*Jgi-%8Qg2<3C^7Y5}pL-s`;O-Cy zq1}5JA#}^y1q+x*eDeAImv{CJ%HF7>*g!d5R+)PTyUaET{Xwux!D-=^z-)8$utyzo z265ziG$pCILFDAopkB%9)$@AxQZFR20o=z7=L(SlT5V zc3eXvm|}WL&jRP9Yo@al9ki`=9@K=PKpI4TS!75*Swx^9^~j)CYU{LG=QIsc&@!!e z)@c3pZfad&(E4Vy=#@or;Ls1&5GD*6TUK)+R}!WY?SSI7uIhYQvfGImiwLp9e;D;; zA>b`8*SZna@{L&NMk?7z_R-q#=Iu)JDX*JqJ!^Q#U*x(_HJ?7+aPveE$`!ypEiW5A z*A)Qex+r8;00A;}$p47)>1MkPuajGi>vX7+m-GK+KieX9-Zy-IQThFZek|fd#Qs6H z&I{$MY!pwDI67A2>?j%O+4wk#)Z{3dDXHTJu^vSer80TwhOlI_=ew|`3;3$P0WPta A!2kdN literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00000-1ca113cd-a94c-46a8-9c5b-b99e676ddd06.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00000-1ca113cd-a94c-46a8-9c5b-b99e676ddd06.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..ac22997311cf876d645c4cdfab55a05fbccb1e84 GIT binary patch literal 16 XcmYc;N@ieSU}7lJPBJh|ng0_29*hK+ literal 0 HcmV?d00001 diff --git 
a/kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00001-8334a9a7-7041-4d88-8377-aa36cfe5762f.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00001-8334a9a7-7041-4d88-8377-aa36cfe5762f.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..463fbfdfc61e31c5db4f2e1a1ca2b3d86f2cc82e GIT binary patch literal 16 XcmYc;N@ieSU}9+O4?n;%*V_O9BGd$a literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00002-7dd6bbed-a0c1-44f0-b729-42b7d7d7f5ca.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00002-7dd6bbed-a0c1-44f0-b729-42b7d7d7f5ca.c000.snappy.parquet.crc new file mode 100644 index 0000000000000000000000000000000000000000..c0fafe9ca53b8165a8ea951882c78cf053f76340 GIT binary patch literal 16 XcmYc;N@ieSU}Bj6IcN2w53_9nE3*cU literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-29/part-00000-1ca113cd-a94c-46a8-9c5b-b99e676ddd06.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-29/part-00000-1ca113cd-a94c-46a8-9c5b-b99e676ddd06.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b24ab63fdc30f3b52052f1f0eb0afedeed3c0424 GIT binary patch literal 917 zcma)5&ubGw6rP>dbtwTWWQJV`5|(ypAr4vcqiI+QiWHF$f>)6;$&BgZW|Qu2loE3F z;2+Vm|APm;2p&9o)L#4#RK$zmoBfeSycCw1_uhQ(d+&WSJGl4QC5V)XN&fu)`Q}!I zU~qL}AhflE5kec&K^WTWdhQoC1+` z8(Ze4BFr~}Eeehb>lwq!D2_DC1T<1EMplIyPS%Jhpy7;^d*Qp%aq22!pb9XNfj7`)hlFPEZ- ztaE->oUbuyBj-Z9BK>EGl)Oui0O7TjW~+`~Z4gS|V{!-4j~c~}K|gC{(6VNl(`d~2 zjDu(_)sMZ9QS47c^G1WUSRI-B{~_YDfPvI!bD6Mc`jSm#!Ua#b`BhukN=>_K$4N9F zC3ekz=F33DcK6A`RyP1gzJPEJ^TR}rWdwn69t3QKK9*7RLym&y3B1bx4Ji+`+=y6Vbkq2-1?zt4G$VkIcgm? 
h>Y{T{9|Z+Y+%vk$$4S2KVg@-O(Y(Nq8c literal 0 HcmV?d00001 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-29/part-00001-8334a9a7-7041-4d88-8377-aa36cfe5762f.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-29/part-00001-8334a9a7-7041-4d88-8377-aa36cfe5762f.c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c7717e10ca83d7fa60e1ae0f1f744ad5a1838792 GIT binary patch literal 911 zcma)5&x_MQ6rN1CB`gIM`i2C`Qi2?YrfD}xm8JCJ zLBX@rq-nf$I3)d`(Zws@bIh`4Nm2V zLk@$x)S#Zg#?aUb_(<-Oc2U7qAm1CD@?a=?@x^3$z^GX?Tegk^m~}me^)VkPqwGFA zvGvwsg2l?nUNXKD4cQhlr2Zz~ zmEwCt;V!W+wKd^C10=*vc8rMDfzoT$&C3ih_JP68-qEKldec$AKPXB3uFol36bRvovr*PnX6D2aZ}ZB*ULk*N0%t2H^OA5|-& faab7$F5LrH)+!Al`h4GP;VoQzj?DIdn8#1Eu4^i3o@4IC(X>+F;an>~7UW`*zDL;`vw)lu`DO z?N`li=095**-X31GW3Q0ShK8gG~{9;a9BK@ z8-Jmq*dK@1%?5ArIo;+^(65*WYhlv`h2!U`K1fm}x0V`F_y%Kdhv-!NXC{8sN<#>u84zFLm z`l**DuBY;BPOO8x?favslFzB)aj(I>%4UD+2eKmjO|RYX>btVl-)S_}ptakm%l1xv dAcgX_J=LtYrR<9>yNfq4d&qmdEl2nd{sk(h%O?N; literal 0 HcmV?d00001 From 8df172fc1b7a7753501ace2dc0e3e8eebde17624 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 19:45:32 -0800 Subject: [PATCH 40/56] Add tests for cdf --- .../src/table_changes/physical_to_logical.rs | 188 +++++++++++++----- 1 file changed, 134 insertions(+), 54 deletions(-) diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index 9f94ed825..c06ffa461 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -150,7 +150,7 @@ mod tests { path: impl AsRef, start_version: Version, end_version: impl Into>, - ) -> DeltaResult { + ) -> DeltaResult> { let table = Table::try_from_uri(path)?; let options = HashMap::from([("skip_signature", "true".to_string())]); let engine = Arc::new(DefaultEngine::try_new( @@ -179,69 +179,149 @@ mod tests { } }) .try_collect()?; - let formatted = pretty_format_batches(&batches)?.to_string(); - 
Ok(formatted) + Ok(batches) + } + + fn assert_batches_sorted_eq(expected_lines: &[impl ToString], batches: &[RecordBatch]) { + let mut expected_lines: Vec = + expected_lines.iter().map(ToString::to_string).collect(); + + // sort except for header + footer + let num_lines = expected_lines.len(); + if num_lines > 3 { + expected_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() + } + + let formatted = arrow::util::pretty::pretty_format_batches(batches) + .unwrap() + .to_string(); + + let mut actual_lines: Vec<&str> = formatted.trim().lines().collect(); + + // sort except for header + footer + let num_lines = actual_lines.len(); + if num_lines > 3 { + actual_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() + } + + let expected_table_str = expected_lines.join("\n"); + let actual_table_str = actual_lines.join("\n"); + + assert_eq!( + actual_lines.len(), + expected_lines.len(), + "Incorrect number of lines. Expected:\n{}\nbut got:\n{} ", + expected_table_str, + actual_table_str + ); + for (expected, actual) in expected_lines.iter().zip(actual_lines) { + assert_eq!( + expected, actual, + "Expected:\n{}\nbut got:\n{}", + expected_table_str, actual_table_str + ); + } } #[test] fn cdf_with_deletion_vector() -> Result<(), Box> { let cdf = read_cdf_for_table("tests/data/table-with-cdf-and-dv", 0, None)?; - let expected = concat!( - "+-------+--------------+-----------------+--------------------------+\n", - "| value | _change_type | _commit_version | _commit_timestamp |\n", - "+-------+--------------+-----------------+--------------------------+\n", - "| 0 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", - "| 1 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", - "| 2 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", - "| 3 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", - "| 4 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", - "| 5 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", - "| 6 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", - "| 7 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", - 
"| 8 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", - "| 9 | insert | 0 | 1970-01-21T01:35:06.498Z |\n", - "| 0 | delete | 1 | 1970-01-21T01:35:06.498Z |\n", - "| 9 | delete | 1 | 1970-01-21T01:35:06.498Z |\n", - "| 0 | insert | 2 | 1970-01-21T01:35:06.498Z |\n", - "| 9 | insert | 2 | 1970-01-21T01:35:06.498Z |\n", - "+-------+--------------+-----------------+--------------------------+" + assert_batches_sorted_eq( + &[ + "+-------+--------------+-----------------+--------------------------+", + "| value | _change_type | _commit_version | _commit_timestamp |", + "+-------+--------------+-----------------+--------------------------+", + "| 0 | insert | 0 | 1970-01-21T01:35:06.498Z |", + "| 1 | insert | 0 | 1970-01-21T01:35:06.498Z |", + "| 2 | insert | 0 | 1970-01-21T01:35:06.498Z |", + "| 3 | insert | 0 | 1970-01-21T01:35:06.498Z |", + "| 4 | insert | 0 | 1970-01-21T01:35:06.498Z |", + "| 5 | insert | 0 | 1970-01-21T01:35:06.498Z |", + "| 6 | insert | 0 | 1970-01-21T01:35:06.498Z |", + "| 8 | insert | 0 | 1970-01-21T01:35:06.498Z |", + "| 7 | insert | 0 | 1970-01-21T01:35:06.498Z |", + "| 9 | insert | 0 | 1970-01-21T01:35:06.498Z |", + "| 0 | delete | 1 | 1970-01-21T01:35:06.498Z |", + "| 9 | delete | 1 | 1970-01-21T01:35:06.498Z |", + "| 0 | insert | 2 | 1970-01-21T01:35:06.498Z |", + "| 9 | insert | 2 | 1970-01-21T01:35:06.498Z |", + "+-------+--------------+-----------------+--------------------------+", + ], + &cdf, ); - assert_eq!(expected, cdf); Ok(()) } + #[test] fn basic_cdf() -> Result<(), Box> { - let cdf = read_cdf_for_table("tests/data/cdf-table", 0, None)?; - let expected = r#" - +----+--------+------------------+-----------------+-------------------------+------------+ - | id | name | _change_type | _commit_version | _commit_timestamp | birthday | - +----+--------+------------------+-----------------+-------------------------+------------+ - | 7 | Dennis | delete | 3 | 2024-01-06T16:44:59.570 | 2023-12-29 | - | 3 | Dave | update_preimage | 1 | 
2023-12-22T17:10:21.675 | 2023-12-23 | - | 4 | Kate | update_preimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-23 | - | 2 | Bob | update_preimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-23 | - | 7 | Dennis | update_preimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-24 | - | 5 | Emily | update_preimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-24 | - | 6 | Carl | update_preimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-24 | - | 7 | Dennis | update_postimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-29 | - | 5 | Emily | update_postimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-29 | - | 6 | Carl | update_postimage | 2 | 2023-12-29T21:41:33.785 | 2023-12-29 | - | 3 | Dave | update_postimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-22 | - | 4 | Kate | update_postimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-22 | - | 2 | Bob | update_postimage | 1 | 2023-12-22T17:10:21.675 | 2023-12-22 | - | 2 | Bob | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-23 | - | 3 | Dave | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-23 | - | 4 | Kate | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-23 | - | 5 | Emily | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-24 | - | 6 | Carl | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-24 | - | 7 | Dennis | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-24 | - | 1 | Steve | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-22 | - | 8 | Claire | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-25 | - | 9 | Ada | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-25 | - | 10 | Borb | insert | 0 | 2023-12-22T17:10:18.828 | 2023-12-25 | - +----+--------+------------------+-----------------+-------------------------+------------+ - "#; - //TODO + let batches = read_cdf_for_table("tests/data/cdf-table", 0, None)?; + assert_batches_sorted_eq(&[ + "+----+--------+------------+------------------+-----------------+--------------------------+", + "| id | name | birthday | _change_type | _commit_version | _commit_timestamp |", + 
"+----+--------+------------+------------------+-----------------+--------------------------+", + "| 1 | Steve | 2023-12-22 | insert | 0 | 2023-12-22T17:10:18.828 |", + "| 2 | Bob | 2023-12-23 | insert | 0 | 2023-12-22T17:10:18.828 |", + "| 3 | Dave | 2023-12-23 | insert | 0 | 2023-12-22T17:10:18.828 |", + "| 4 | Kate | 2023-12-23 | insert | 0 | 2023-12-22T17:10:18.828 |", + "| 5 | Emily | 2023-12-24 | insert | 0 | 2023-12-22T17:10:18.828 |", + "| 6 | Carl | 2023-12-24 | insert | 0 | 2023-12-22T17:10:18.828 |", + "| 7 | Dennis | 2023-12-24 | insert | 0 | 2023-12-22T17:10:18.828 |", + "| 8 | Claire | 2023-12-25 | insert | 0 | 2023-12-22T17:10:18.828 |", + "| 9 | Ada | 2023-12-25 | insert | 0 | 2023-12-22T17:10:18.828 |", + "| 10 | Borb | 2023-12-25 | insert | 0 | 2023-12-22T17:10:18.828 |", + "| 3 | Dave | 2023-12-22 | update_postimage | 1 | 2023-12-22T17:10:21.675 |", + "| 3 | Dave | 2023-12-23 | update_preimage | 1 | 2023-12-22T17:10:21.675 |", + "| 4 | Kate | 2023-12-22 | update_postimage | 1 | 2023-12-22T17:10:21.675 |", + "| 4 | Kate | 2023-12-23 | update_preimage | 1 | 2023-12-22T17:10:21.675 |", + "| 2 | Bob | 2023-12-22 | update_postimage | 1 | 2023-12-22T17:10:21.675 |", + "| 2 | Bob | 2023-12-23 | update_preimage | 1 | 2023-12-22T17:10:21.675 |", + "| 7 | Dennis | 2023-12-24 | update_preimage | 2 | 2023-12-29T21:41:33.785 |", + "| 7 | Dennis | 2023-12-29 | update_postimage | 2 | 2023-12-29T21:41:33.785 |", + "| 5 | Emily | 2023-12-24 | update_preimage | 2 | 2023-12-29T21:41:33.785 |", + "| 5 | Emily | 2023-12-29 | update_postimage | 2 | 2023-12-29T21:41:33.785 |", + "| 6 | Carl | 2023-12-24 | update_preimage | 2 | 2023-12-29T21:41:33.785 |", + "| 6 | Carl | 2023-12-29 | update_postimage | 2 | 2023-12-29T21:41:33.785 |", + "| 7 | Dennis | 2023-12-29 | delete | 3 | 2024-01-06T16:44:59.570 |", + "+----+--------+------------+------------------+-----------------+--------------------------+"], + &batches + ); + Ok(()) + } + + #[test] + fn cdf_non_partitioned() 
-> Result<(), Box> { + let batches = read_cdf_for_table("tests/data/cdf-table-non-partitioned", 0, None)?; + assert_batches_sorted_eq(&[ + "+----+--------+------------+-------------------+---------------+--------------+----------------+------------------+-----------------+--------------------------+", + "| id | name | birthday | long_field | boolean_field | double_field | smallint_field | _change_type | _commit_version | _commit_timestamp |", + "+----+--------+------------+-------------------+---------------+--------------+----------------+------------------+-----------------+--------------------------+", + "| 1 | Steve | 2024-04-14 | 1 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 2 | Bob | 2024-04-15 | 1 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 3 | Dave | 2024-04-15 | 2 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 4 | Kate | 2024-04-15 | 3 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 5 | Emily | 2024-04-16 | 4 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 6 | Carl | 2024-04-16 | 5 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 7 | Dennis | 2024-04-16 | 6 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 8 | Claire | 2024-04-17 | 7 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 9 | Ada | 2024-04-17 | 8 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 10 | Borb | 2024-04-17 | 99999999999999999 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", + "| 3 | Dave | 2024-04-15 | 2 | true | 3.14 | 1 | update_preimage | 1 | 2024-04-14T15:58:29.393 |", + "| 3 | Dave | 2024-04-14 | 2 | true | 3.14 | 1 | update_postimage | 1 | 2024-04-14T15:58:29.393 |", + "| 4 | Kate | 2024-04-15 | 3 | true | 3.14 | 1 | update_preimage | 1 | 2024-04-14T15:58:29.393 |", + "| 4 | Kate | 2024-04-14 | 3 | true | 3.14 | 1 | update_postimage | 1 | 2024-04-14T15:58:29.393 |", + "| 2 | Bob | 2024-04-15 | 1 | true | 
3.14 | 1 | update_preimage | 1 | 2024-04-14T15:58:29.393 |", + "| 2 | Bob | 2024-04-14 | 1 | true | 3.14 | 1 | update_postimage | 1 | 2024-04-14T15:58:29.393 |", + "| 7 | Dennis | 2024-04-16 | 6 | true | 3.14 | 1 | update_preimage | 2 | 2024-04-14T15:58:31.257 |", + "| 7 | Dennis | 2024-04-14 | 6 | true | 3.14 | 1 | update_postimage | 2 | 2024-04-14T15:58:31.257 |", + "| 5 | Emily | 2024-04-16 | 4 | true | 3.14 | 1 | update_preimage | 2 | 2024-04-14T15:58:31.257 |", + "| 5 | Emily | 2024-04-14 | 4 | true | 3.14 | 1 | update_postimage | 2 | 2024-04-14T15:58:31.257 |", + "| 6 | Carl | 2024-04-16 | 5 | true | 3.14 | 1 | update_preimage | 2 | 2024-04-14T15:58:31.257 |", + "| 6 | Carl | 2024-04-14 | 5 | true | 3.14 | 1 | update_postimage | 2 | 2024-04-14T15:58:31.257 |", + "| 7 | Dennis | 2024-04-14 | 6 | true | 3.14 | 1 | delete | 3 | 2024-04-14T15:58:32.495 |", + "| 1 | Alex | 2024-04-14 | 1 | true | 3.14 | 1 | insert | 4 | 2024-04-14T15:58:33.444 |", + "| 2 | Alan | 2024-04-15 | 1 | true | 3.14 | 1 | insert | 4 | 2024-04-14T15:58:33.444 |", + "+----+--------+------------+-------------------+---------------+--------------+----------------+------------------+-----------------+--------------------------+" + ], &batches); Ok(()) } } From 7846757b0db33076b0e69c652738f32e92c01ad2 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 19:46:13 -0800 Subject: [PATCH 41/56] remove unneeded import --- kernel/src/table_changes/physical_to_logical.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index c06ffa461..f43a2b25f 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -138,7 +138,6 @@ mod tests { use arrow::compute::filter_record_batch; use arrow_array::RecordBatch; - use arrow_cast::pretty::pretty_format_batches; use itertools::Itertools; use crate::engine::arrow_data::ArrowEngineData; From 
67a9a185c4f3642aea0a3d68bb7a618f02636187 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 19:47:45 -0800 Subject: [PATCH 42/56] more formatting --- kernel/src/table_changes/physical_to_logical.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index f43a2b25f..c83500e96 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -281,7 +281,8 @@ mod tests { "| 6 | Carl | 2023-12-24 | update_preimage | 2 | 2023-12-29T21:41:33.785 |", "| 6 | Carl | 2023-12-29 | update_postimage | 2 | 2023-12-29T21:41:33.785 |", "| 7 | Dennis | 2023-12-29 | delete | 3 | 2024-01-06T16:44:59.570 |", - "+----+--------+------------+------------------+-----------------+--------------------------+"], + "+----+--------+------------+------------------+-----------------+--------------------------+" + ], &batches ); Ok(()) @@ -320,7 +321,8 @@ mod tests { "| 1 | Alex | 2024-04-14 | 1 | true | 3.14 | 1 | insert | 4 | 2024-04-14T15:58:33.444 |", "| 2 | Alan | 2024-04-15 | 1 | true | 3.14 | 1 | insert | 4 | 2024-04-14T15:58:33.444 |", "+----+--------+------------+-------------------+---------------+--------------+----------------+------------------+-----------------+--------------------------+" - ], &batches); + ], + &batches); Ok(()) } } From fa042c107aabad9c97e63794a39e3dbfc0c1c261 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 22:14:46 -0800 Subject: [PATCH 43/56] Add some docs --- kernel/src/table_changes/physical_to_logical.rs | 8 +++++++- kernel/src/table_changes/scan.rs | 4 ++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index c83500e96..1dbdeb256 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs 
@@ -14,6 +14,7 @@ use crate::{DeltaResult, Engine, Error, Expression, ExpressionRef, FileMeta}; use super::resolve_dvs::ResolvedCdfScanFile; use super::scan_file::{CdfScanFile, CdfScanFileType}; +/// Returns a map from change data feed column name to an expression that generates the row data. #[allow(unused)] fn get_generated_columns(scan_file: &CdfScanFile) -> DeltaResult> { let timestamp = Scalar::timestamp_from_millis(scan_file.commit_timestamp)?; @@ -32,6 +33,8 @@ fn get_generated_columns(scan_file: &CdfScanFile) -> DeltaResult SchemaRef { if scan_file.scan_type == CdfScanFileType::Cdc { let change_type = StructField::new("_change_type", DataType::STRING, false); @@ -78,6 +82,8 @@ fn read_schema(scan_file: &CdfScanFile, global_scan_state: &GlobalScanState) -> } } +/// Reads the data at the `resolved_scan_file` and transforms the data from physical to logical. +/// The result is a fallible iterator of [`ScanResult`] containing the logical data. pub(crate) fn read_scan_data( engine: &dyn Engine, resolved_scan_file: ResolvedCdfScanFile, diff --git a/kernel/src/table_changes/scan.rs b/kernel/src/table_changes/scan.rs index 247483f26..0fe5c989e 100644 --- a/kernel/src/table_changes/scan.rs +++ b/kernel/src/table_changes/scan.rs @@ -207,6 +207,10 @@ impl TableChangesScan { } } + /// Perform an "all in one" scan to get the change data feed. This will use the provided `engine` + /// to read and process all the data for the query. Each [`ScanResult`] in the resultant iterator + /// encapsulates the raw data and an optional boolean vector built from the deletion vector if it + /// was present. See the documentation for [`ScanResult`] for more details. 
pub fn execute( &self, engine: Arc, From 4098b6785e7882cb4867b2ffc3759338ce87333d Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Sun, 8 Dec 2024 22:22:10 -0800 Subject: [PATCH 44/56] Removed allow(unused) --- kernel/src/table_changes/physical_to_logical.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index 1dbdeb256..1b6c56758 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -15,7 +15,6 @@ use super::resolve_dvs::ResolvedCdfScanFile; use super::scan_file::{CdfScanFile, CdfScanFileType}; /// Returns a map from change data feed column name to an expression that generates the row data. -#[allow(unused)] fn get_generated_columns(scan_file: &CdfScanFile) -> DeltaResult> { let timestamp = Scalar::timestamp_from_millis(scan_file.commit_timestamp)?; let version = scan_file.commit_version; @@ -35,7 +34,6 @@ fn get_generated_columns(scan_file: &CdfScanFile) -> DeltaResult SchemaRef { +fn get_read_schema(scan_file: &CdfScanFile, global_scan_state: &GlobalScanState) -> SchemaRef { if scan_file.scan_type == CdfScanFileType::Cdc { let change_type = StructField::new("_change_type", DataType::STRING, false); let fields = global_scan_state @@ -97,7 +95,7 @@ pub(crate) fn read_scan_data( } = resolved_scan_file; let expression = get_expression(&scan_file, global_state, all_fields)?; - let schema = read_schema(&scan_file, global_state); + let schema = get_read_schema(&scan_file, global_state); let evaluator = engine.get_expression_handler().get_evaluator( schema.clone(), expression, From 610d62eea1c5d8805028a8088e4d22136cb2dfb9 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Mon, 9 Dec 2024 10:01:07 -0800 Subject: [PATCH 45/56] Remove data for next PR --- .../src/table_changes/physical_to_logical.rs | 196 ------------------ kernel/src/table_changes/scan.rs | 37 ---- 
...-bf66-fc2a968c4feb.c000.snappy.parquet.crc | Bin 28 -> 0 bytes ...-9f56-d45f47d5dea5.c000.snappy.parquet.crc | Bin 28 -> 0 bytes ...-ae43-5bf5b4c36a3d-c000.snappy.parquet.crc | Bin 24 -> 0 bytes ...-b674-deb4d1b82aee-c000.snappy.parquet.crc | Bin 24 -> 0 bytes ...-a847-cc5d1415f35d.c000.snappy.parquet.crc | Bin 28 -> 0 bytes ...-bf76-1987f87901f1-c000.snappy.parquet.crc | Bin 24 -> 0 bytes ...-a1bc-7a81e4a5ddce-c000.snappy.parquet.crc | Bin 24 -> 0 bytes ...-be02-282f3629694c.c000.snappy.parquet.crc | Bin 28 -> 0 bytes ...-89d4-0d73a5d5b971.c000.snappy.parquet.crc | Bin 28 -> 0 bytes ...-8ea5-ddd162a84e94.c000.snappy.parquet.crc | Bin 28 -> 0 bytes ...-a2df-a50985051257-c000.snappy.parquet.crc | Bin 24 -> 0 bytes ...-a93f-c1bf180e7008-c000.snappy.parquet.crc | Bin 24 -> 0 bytes ...-94e2-e7987a580b6a-c000.snappy.parquet.crc | Bin 24 -> 0 bytes ...-be61-76396629a546-c000.snappy.parquet.crc | Bin 24 -> 0 bytes ...-acd1-d2642d1a778c-c000.snappy.parquet.crc | Bin 24 -> 0 bytes ...-8dbc-812b7274a4e5-c000.snappy.parquet.crc | Bin 24 -> 0 bytes ...-9288-5a6503bac41b-c000.snappy.parquet.crc | Bin 24 -> 0 bytes ...-931d-168b2821adca-c000.snappy.parquet.crc | Bin 24 -> 0 bytes ...-8239-f53ca8761e0e.c000.snappy.parquet.crc | Bin 28 -> 0 bytes ...-9a42-c551810ffef9.c000.snappy.parquet.crc | Bin 28 -> 0 bytes ...-b0ab-5b9937a8bd12.c000.snappy.parquet.crc | Bin 28 -> 0 bytes ...-9053-48dc314be509.c000.snappy.parquet.crc | Bin 28 -> 0 bytes ...-8e90-e33319ebf581.c000.snappy.parquet.crc | Bin 28 -> 0 bytes ...-aefb-cd68c1672e02.c000.snappy.parquet.crc | Bin 28 -> 0 bytes ...-a938-58954786d23f.c000.snappy.parquet.crc | Bin 28 -> 0 bytes ...4651-8239-f53ca8761e0e.c000.snappy.parquet | Bin 2459 -> 0 bytes ...4ee9-9a42-c551810ffef9.c000.snappy.parquet | Bin 2242 -> 0 bytes ...4293-b0ab-5b9937a8bd12.c000.snappy.parquet | Bin 2442 -> 0 bytes ...4ccc-9053-48dc314be509.c000.snappy.parquet | Bin 2451 -> 0 bytes ...4dbf-8e90-e33319ebf581.c000.snappy.parquet | Bin 2444 -> 0 bytes 
...46d1-aefb-cd68c1672e02.c000.snappy.parquet | Bin 2437 -> 0 bytes ...47f1-a938-58954786d23f.c000.snappy.parquet | Bin 2444 -> 0 bytes .../_delta_log/.00000000000000000000.json.crc | Bin 68 -> 0 bytes .../_delta_log/.00000000000000000001.json.crc | Bin 36 -> 0 bytes .../_delta_log/.00000000000000000002.json.crc | Bin 36 -> 0 bytes .../_delta_log/.00000000000000000003.json.crc | Bin 16 -> 0 bytes .../_delta_log/.00000000000000000004.json.crc | Bin 24 -> 0 bytes .../_delta_log/00000000000000000000.json | 13 -- .../_delta_log/00000000000000000001.json | 10 - .../_delta_log/00000000000000000002.json | 10 - .../_delta_log/00000000000000000003.json | 3 - .../_delta_log/00000000000000000004.json | 3 - ...4b6f-bf66-fc2a968c4feb.c000.snappy.parquet | Bin 2188 -> 0 bytes ...41b6-9f56-d45f47d5dea5.c000.snappy.parquet | Bin 2175 -> 0 bytes ...456d-ae43-5bf5b4c36a3d-c000.snappy.parquet | Bin 1958 -> 0 bytes ...4613-b674-deb4d1b82aee-c000.snappy.parquet | Bin 1965 -> 0 bytes ...4ae7-a847-cc5d1415f35d.c000.snappy.parquet | Bin 2182 -> 0 bytes ...4166-bf76-1987f87901f1-c000.snappy.parquet | Bin 1958 -> 0 bytes ...43be-a1bc-7a81e4a5ddce-c000.snappy.parquet | Bin 1951 -> 0 bytes ...40da-be02-282f3629694c.c000.snappy.parquet | Bin 2175 -> 0 bytes ...41f0-89d4-0d73a5d5b971.c000.snappy.parquet | Bin 2175 -> 0 bytes ...4459-8ea5-ddd162a84e94.c000.snappy.parquet | Bin 2168 -> 0 bytes ...47af-a2df-a50985051257-c000.snappy.parquet | Bin 1958 -> 0 bytes ...4703-a93f-c1bf180e7008-c000.snappy.parquet | Bin 1958 -> 0 bytes ...4462-94e2-e7987a580b6a-c000.snappy.parquet | Bin 1965 -> 0 bytes ...4dc4-be61-76396629a546-c000.snappy.parquet | Bin 1958 -> 0 bytes ...445d-acd1-d2642d1a778c-c000.snappy.parquet | Bin 1971 -> 0 bytes ...4891-8dbc-812b7274a4e5-c000.snappy.parquet | Bin 1972 -> 0 bytes ...4ab2-9288-5a6503bac41b-c000.snappy.parquet | Bin 1951 -> 0 bytes ...4a23-931d-168b2821adca-c000.snappy.parquet | Bin 1958 -> 0 bytes ...-a497-6969cdf3966c.c000.snappy.parquet.crc | Bin 20 -> 0 bytes 
...-90bd-9416b10ba6a6.c000.snappy.parquet.crc | Bin 20 -> 0 bytes ...-bf85-3b9f5027578c.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...4fc0-a497-6969cdf3966c.c000.snappy.parquet | Bin 1028 -> 0 bytes ...41e1-90bd-9416b10ba6a6.c000.snappy.parquet | Bin 1028 -> 0 bytes ...42c6-bf85-3b9f5027578c.c000.snappy.parquet | Bin 1021 -> 0 bytes ...-8c41-71e38c07fdc2.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...-b7e4-90bf8d04898e.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...-aa88-7d5f5228d781.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...4b10-8c41-71e38c07fdc2.c000.snappy.parquet | Bin 1021 -> 0 bytes ...4f3e-b7e4-90bf8d04898e.c000.snappy.parquet | Bin 1021 -> 0 bytes ...4713-aa88-7d5f5228d781.c000.snappy.parquet | Bin 1014 -> 0 bytes ...-a62e-2ecc8dc24035.c000.snappy.parquet.crc | Bin 20 -> 0 bytes ...-af76-3b32bab79832.c000.snappy.parquet.crc | Bin 20 -> 0 bytes ...-a9c8-05c1d4f79d6a.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...470a-a62e-2ecc8dc24035.c000.snappy.parquet | Bin 1034 -> 0 bytes ...406f-af76-3b32bab79832.c000.snappy.parquet | Bin 1028 -> 0 bytes ...4533-a9c8-05c1d4f79d6a.c000.snappy.parquet | Bin 1021 -> 0 bytes ...-9739-fc9d4db24308.c000.snappy.parquet.crc | Bin 20 -> 0 bytes ...-b2cf-91e882f4c500.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...-b0ac-e70b1e2115b1.c000.snappy.parquet.crc | Bin 20 -> 0 bytes ...-99ed-062c0a337c29.c000.snappy.parquet.crc | Bin 20 -> 0 bytes ...4d37-9739-fc9d4db24308.c000.snappy.parquet | Bin 1041 -> 0 bytes ...44af-b2cf-91e882f4c500.c000.snappy.parquet | Bin 971 -> 0 bytes ...4227-b0ac-e70b1e2115b1.c000.snappy.parquet | Bin 1035 -> 0 bytes ...4198-99ed-062c0a337c29.c000.snappy.parquet | Bin 1028 -> 0 bytes .../_delta_log/.00000000000000000000.json.crc | Bin 44 -> 0 bytes .../_delta_log/.00000000000000000001.json.crc | Bin 40 -> 0 bytes .../_delta_log/.00000000000000000002.json.crc | Bin 40 -> 0 bytes .../_delta_log/.00000000000000000003.json.crc | Bin 20 -> 0 bytes .../_delta_log/00000000000000000000.json | 13 -- 
.../_delta_log/00000000000000000001.json | 13 -- .../_delta_log/00000000000000000002.json | 13 -- .../_delta_log/00000000000000000003.json | 3 - ...-9c61-120d006eb3b8.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...-8fba-035e60e71ab2.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...-bc83-487a583eb01b.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...-b334-3a50c28185bb.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...4236-9c61-120d006eb3b8.c000.snappy.parquet | Bin 694 -> 0 bytes ...4ac9-8fba-035e60e71ab2.c000.snappy.parquet | Bin 904 -> 0 bytes ...42c8-bc83-487a583eb01b.c000.snappy.parquet | Bin 904 -> 0 bytes ...45ff-b334-3a50c28185bb.c000.snappy.parquet | Bin 897 -> 0 bytes ...-9db1-e985867a1a6c.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...-bee3-df831f4abf3c.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...-ac4f-56538beeddae.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...4acc-9db1-e985867a1a6c.c000.snappy.parquet | Bin 680 -> 0 bytes ...4e3b-bee3-df831f4abf3c.c000.snappy.parquet | Bin 687 -> 0 bytes ...43f8-ac4f-56538beeddae.c000.snappy.parquet | Bin 687 -> 0 bytes ...-b7bf-93f2f37c0cb9.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...-9a85-e4f8d4501a67.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...-adb9-21feeeee2c31.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet | Bin 694 -> 0 bytes ...4083-9a85-e4f8d4501a67.c000.snappy.parquet | Bin 687 -> 0 bytes ...45ad-adb9-21feeeee2c31.c000.snappy.parquet | Bin 700 -> 0 bytes ...-8bb3-721fa82961c6.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...-bcc3-5df022ec6b35.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...-93f1-6dc27cd2e980.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...4bbc-8bb3-721fa82961c6.c000.snappy.parquet | Bin 701 -> 0 bytes ...4b3b-bcc3-5df022ec6b35.c000.snappy.parquet | Bin 680 -> 0 bytes ...4cb2-93f1-6dc27cd2e980.c000.snappy.parquet | Bin 687 -> 0 bytes ...-9c5b-b99e676ddd06.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...-8377-aa36cfe5762f.c000.snappy.parquet.crc | Bin 16 -> 0 
bytes ...-b729-42b7d7d7f5ca.c000.snappy.parquet.crc | Bin 16 -> 0 bytes ...46a8-9c5b-b99e676ddd06.c000.snappy.parquet | Bin 917 -> 0 bytes ...4d88-8377-aa36cfe5762f.c000.snappy.parquet | Bin 911 -> 0 bytes ...44f0-b729-42b7d7d7f5ca.c000.snappy.parquet | Bin 904 -> 0 bytes 128 files changed, 314 deletions(-) delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00000-3cb5dee7-9ab2-4b6f-bf66-fc2a968c4feb.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00000-47bab5c8-9d0c-41b6-9f56-d45f47d5dea5.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00000-94321f1e-f3e8-456d-ae43-5bf5b4c36a3d-c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00000-a9118234-f574-4613-b674-deb4d1b82aee-c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00001-69317b6f-2e84-4ae7-a847-cc5d1415f35d.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00001-75bdbc7a-6029-4166-bf76-1987f87901f1-c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00001-db3fa6b7-6267-43be-a1bc-7a81e4a5ddce-c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00001-df7ec87d-5e3a-40da-be02-282f3629694c.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00002-05c18098-92f8-41f0-89d4-0d73a5d5b971.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00002-a34b39da-a6ce-4459-8ea5-ddd162a84e94.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00002-fce8caa1-4b6f-47af-a2df-a50985051257-c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00003-7920983b-c6d6-4703-a93f-c1bf180e7008-c000.snappy.parquet.crc delete mode 100644 
kernel/tests/data/cdf-table-non-partitioned/.part-00004-5de927bd-552f-4462-94e2-e7987a580b6a-c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00005-9443cba0-84c4-4dc4-be61-76396629a546-c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00006-a200a57d-283c-445d-acd1-d2642d1a778c-c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00007-e6053215-0e6f-4891-8dbc-812b7274a4e5-c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00008-6fdf199c-d268-4ab2-9288-5a6503bac41b-c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/.part-00009-24d335c6-4da8-4a23-931d-168b2821adca-c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-4a1d2de0-dda2-4651-8239-f53ca8761e0e.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-a0f26ad2-e42f-4ee9-9a42-c551810ffef9.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-cb45a6ca-4b46-4293-b0ab-5b9937a8bd12.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00001-3083ab22-0fc6-4ccc-9053-48dc314be509.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00001-c07209de-cdb1-4dbf-8e90-e33319ebf581.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00002-67b4ede6-b16d-46d1-aefb-cd68c1672e02.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00002-d6ab6a1a-7a19-47f1-a938-58954786d23f.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00000-4a1d2de0-dda2-4651-8239-f53ca8761e0e.c000.snappy.parquet delete mode 100644 
kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00000-a0f26ad2-e42f-4ee9-9a42-c551810ffef9.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00000-cb45a6ca-4b46-4293-b0ab-5b9937a8bd12.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00001-3083ab22-0fc6-4ccc-9053-48dc314be509.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00001-c07209de-cdb1-4dbf-8e90-e33319ebf581.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00002-67b4ede6-b16d-46d1-aefb-cd68c1672e02.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00002-d6ab6a1a-7a19-47f1-a938-58954786d23f.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000000.json.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000001.json.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000002.json.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000003.json.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000004.json.crc delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/00000000000000000000.json delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/00000000000000000001.json delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/00000000000000000002.json delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/00000000000000000003.json delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/_delta_log/00000000000000000004.json delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00000-3cb5dee7-9ab2-4b6f-bf66-fc2a968c4feb.c000.snappy.parquet 
delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00000-47bab5c8-9d0c-41b6-9f56-d45f47d5dea5.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00000-94321f1e-f3e8-456d-ae43-5bf5b4c36a3d-c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00000-a9118234-f574-4613-b674-deb4d1b82aee-c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00001-69317b6f-2e84-4ae7-a847-cc5d1415f35d.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00001-75bdbc7a-6029-4166-bf76-1987f87901f1-c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00001-db3fa6b7-6267-43be-a1bc-7a81e4a5ddce-c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00001-df7ec87d-5e3a-40da-be02-282f3629694c.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00002-05c18098-92f8-41f0-89d4-0d73a5d5b971.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00002-a34b39da-a6ce-4459-8ea5-ddd162a84e94.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00002-fce8caa1-4b6f-47af-a2df-a50985051257-c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00003-7920983b-c6d6-4703-a93f-c1bf180e7008-c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00004-5de927bd-552f-4462-94e2-e7987a580b6a-c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00005-9443cba0-84c4-4dc4-be61-76396629a546-c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00006-a200a57d-283c-445d-acd1-d2642d1a778c-c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00007-e6053215-0e6f-4891-8dbc-812b7274a4e5-c000.snappy.parquet delete mode 100644 
kernel/tests/data/cdf-table-non-partitioned/part-00008-6fdf199c-d268-4ab2-9288-5a6503bac41b-c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table-non-partitioned/part-00009-24d335c6-4da8-4a23-931d-168b2821adca-c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-22/.cdc-00000-59fa51a4-edbb-4fc0-a497-6969cdf3966c.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-22/.cdc-00001-308c0cab-92b2-41e1-90bd-9416b10ba6a6.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-22/.cdc-00002-ea0bad63-f199-42c6-bf85-3b9f5027578c.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-22/cdc-00000-59fa51a4-edbb-4fc0-a497-6969cdf3966c.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-22/cdc-00001-308c0cab-92b2-41e1-90bd-9416b10ba6a6.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-22/cdc-00002-ea0bad63-f199-42c6-bf85-3b9f5027578c.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-23/.cdc-00000-fb59d34a-5bd7-4b10-8c41-71e38c07fdc2.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-23/.cdc-00001-985fd824-b34a-4f3e-b7e4-90bf8d04898e.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-23/.cdc-00002-831078a2-a13d-4713-aa88-7d5f5228d781.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-23/cdc-00000-fb59d34a-5bd7-4b10-8c41-71e38c07fdc2.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-23/cdc-00001-985fd824-b34a-4f3e-b7e4-90bf8d04898e.c000.snappy.parquet delete mode 100644 
kernel/tests/data/cdf-table/_change_data/birthday=2023-12-23/cdc-00002-831078a2-a13d-4713-aa88-7d5f5228d781.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00000-4beb5c26-e34a-470a-a62e-2ecc8dc24035.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00001-a5f1d5a2-e308-406f-af76-3b32bab79832.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00002-ddca9e04-03ef-4533-a9c8-05c1d4f79d6a.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/cdc-00000-4beb5c26-e34a-470a-a62e-2ecc8dc24035.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/cdc-00001-a5f1d5a2-e308-406f-af76-3b32bab79832.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/cdc-00002-ddca9e04-03ef-4533-a9c8-05c1d4f79d6a.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00000-e8760032-5a99-4d37-9739-fc9d4db24308.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00000-ed223ebe-3b27-44af-b2cf-91e882f4c500.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00001-1aa06a1f-c45f-4227-b0ac-e70b1e2115b1.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00002-97dc4c5b-3806-4198-99ed-062c0a337c29.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00000-e8760032-5a99-4d37-9739-fc9d4db24308.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00000-ed223ebe-3b27-44af-b2cf-91e882f4c500.c000.snappy.parquet delete mode 100644 
kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00001-1aa06a1f-c45f-4227-b0ac-e70b1e2115b1.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00002-97dc4c5b-3806-4198-99ed-062c0a337c29.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/_delta_log/.00000000000000000000.json.crc delete mode 100644 kernel/tests/data/cdf-table/_delta_log/.00000000000000000001.json.crc delete mode 100644 kernel/tests/data/cdf-table/_delta_log/.00000000000000000002.json.crc delete mode 100644 kernel/tests/data/cdf-table/_delta_log/.00000000000000000003.json.crc delete mode 100644 kernel/tests/data/cdf-table/_delta_log/00000000000000000000.json delete mode 100644 kernel/tests/data/cdf-table/_delta_log/00000000000000000001.json delete mode 100644 kernel/tests/data/cdf-table/_delta_log/00000000000000000002.json delete mode 100644 kernel/tests/data/cdf-table/_delta_log/00000000000000000003.json delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00000-592a7e14-f790-4236-9c61-120d006eb3b8.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00000-cd6a8496-3a3c-4ac9-8fba-035e60e71ab2.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00001-96c64ea1-3383-42c8-bc83-487a583eb01b.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00002-93942e85-bb5c-45ff-b334-3a50c28185bb.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-22/part-00000-592a7e14-f790-4236-9c61-120d006eb3b8.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-22/part-00000-cd6a8496-3a3c-4ac9-8fba-035e60e71ab2.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-22/part-00001-96c64ea1-3383-42c8-bc83-487a583eb01b.c000.snappy.parquet delete mode 100644 
kernel/tests/data/cdf-table/birthday=2023-12-22/part-00002-93942e85-bb5c-45ff-b334-3a50c28185bb.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00001-723d68a5-94eb-4acc-9db1-e985867a1a6c.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00002-7c6f102f-6ad1-4e3b-bee3-df831f4abf3c.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00003-98b8082f-db4e-43f8-ac4f-56538beeddae.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-23/part-00001-723d68a5-94eb-4acc-9db1-e985867a1a6c.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-23/part-00002-7c6f102f-6ad1-4e3b-bee3-df831f4abf3c.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-23/part-00003-98b8082f-db4e-43f8-ac4f-56538beeddae.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00004-218c1bff-cde9-44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00005-8aeab9bc-7a46-4083-9a85-e4f8d4501a67.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00006-53327328-4603-45ad-adb9-21feeeee2c31.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-24/part-00004-218c1bff-cde9-44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-24/part-00005-8aeab9bc-7a46-4083-9a85-e4f8d4501a67.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-24/part-00006-53327328-4603-45ad-adb9-21feeeee2c31.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-25/.part-00007-8cd4b5a3-b4dd-4bbc-8bb3-721fa82961c6.c000.snappy.parquet.crc delete mode 100644 
kernel/tests/data/cdf-table/birthday=2023-12-25/.part-00008-436dbf31-f213-4b3b-bcc3-5df022ec6b35.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-25/.part-00009-685aacbb-c7ac-4cb2-93f1-6dc27cd2e980.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-25/part-00007-8cd4b5a3-b4dd-4bbc-8bb3-721fa82961c6.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-25/part-00008-436dbf31-f213-4b3b-bcc3-5df022ec6b35.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-25/part-00009-685aacbb-c7ac-4cb2-93f1-6dc27cd2e980.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00000-1ca113cd-a94c-46a8-9c5b-b99e676ddd06.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00001-8334a9a7-7041-4d88-8377-aa36cfe5762f.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00002-7dd6bbed-a0c1-44f0-b729-42b7d7d7f5ca.c000.snappy.parquet.crc delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-29/part-00000-1ca113cd-a94c-46a8-9c5b-b99e676ddd06.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-29/part-00001-8334a9a7-7041-4d88-8377-aa36cfe5762f.c000.snappy.parquet delete mode 100644 kernel/tests/data/cdf-table/birthday=2023-12-29/part-00002-7dd6bbed-a0c1-44f0-b729-42b7d7d7f5ca.c000.snappy.parquet diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index 1b6c56758..b01e5b2b7 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -134,199 +134,3 @@ pub(crate) fn read_scan_data( }); Ok(result) } - -#[cfg(test)] -mod tests { - - use std::{collections::HashMap, error, sync::Arc}; - - use arrow::compute::filter_record_batch; - use arrow_array::RecordBatch; - use 
itertools::Itertools; - - use crate::engine::arrow_data::ArrowEngineData; - use crate::engine::default::executor::tokio::TokioBackgroundExecutor; - use crate::engine::default::DefaultEngine; - use crate::{DeltaResult, Error, Table, Version}; - - fn read_cdf_for_table( - path: impl AsRef, - start_version: Version, - end_version: impl Into>, - ) -> DeltaResult> { - let table = Table::try_from_uri(path)?; - let options = HashMap::from([("skip_signature", "true".to_string())]); - let engine = Arc::new(DefaultEngine::try_new( - table.location(), - options, - Arc::new(TokioBackgroundExecutor::new()), - )?); - let table_changes = table.table_changes(engine.as_ref(), start_version, end_version)?; - - let x = table_changes.into_scan_builder().build()?; - let batches: Vec = x - .execute(engine)? - .map(|scan_result| -> DeltaResult<_> { - let scan_result = scan_result?; - let mask = scan_result.full_mask(); - let data = scan_result.raw_data?; - let record_batch: RecordBatch = data - .into_any() - .downcast::() - .map_err(|_| Error::engine_data_type("ArrowEngineData".to_string()))? - .into(); - if let Some(mask) = mask { - Ok(filter_record_batch(&record_batch, &mask.into())?) 
- } else { - Ok(record_batch) - } - }) - .try_collect()?; - Ok(batches) - } - - fn assert_batches_sorted_eq(expected_lines: &[impl ToString], batches: &[RecordBatch]) { - let mut expected_lines: Vec = - expected_lines.iter().map(ToString::to_string).collect(); - - // sort except for header + footer - let num_lines = expected_lines.len(); - if num_lines > 3 { - expected_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() - } - - let formatted = arrow::util::pretty::pretty_format_batches(batches) - .unwrap() - .to_string(); - - let mut actual_lines: Vec<&str> = formatted.trim().lines().collect(); - - // sort except for header + footer - let num_lines = actual_lines.len(); - if num_lines > 3 { - actual_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() - } - - let expected_table_str = expected_lines.join("\n"); - let actual_table_str = actual_lines.join("\n"); - - assert_eq!( - actual_lines.len(), - expected_lines.len(), - "Incorrect number of lines. Expected:\n{}\nbut got:\n{} ", - expected_table_str, - actual_table_str - ); - for (expected, actual) in expected_lines.iter().zip(actual_lines) { - assert_eq!( - expected, actual, - "Expected:\n{}\nbut got:\n{}", - expected_table_str, actual_table_str - ); - } - } - - #[test] - fn cdf_with_deletion_vector() -> Result<(), Box> { - let cdf = read_cdf_for_table("tests/data/table-with-cdf-and-dv", 0, None)?; - assert_batches_sorted_eq( - &[ - "+-------+--------------+-----------------+--------------------------+", - "| value | _change_type | _commit_version | _commit_timestamp |", - "+-------+--------------+-----------------+--------------------------+", - "| 0 | insert | 0 | 1970-01-21T01:35:06.498Z |", - "| 1 | insert | 0 | 1970-01-21T01:35:06.498Z |", - "| 2 | insert | 0 | 1970-01-21T01:35:06.498Z |", - "| 3 | insert | 0 | 1970-01-21T01:35:06.498Z |", - "| 4 | insert | 0 | 1970-01-21T01:35:06.498Z |", - "| 5 | insert | 0 | 1970-01-21T01:35:06.498Z |", - "| 6 | insert | 0 | 1970-01-21T01:35:06.498Z |", - "| 8 | 
insert | 0 | 1970-01-21T01:35:06.498Z |", - "| 7 | insert | 0 | 1970-01-21T01:35:06.498Z |", - "| 9 | insert | 0 | 1970-01-21T01:35:06.498Z |", - "| 0 | delete | 1 | 1970-01-21T01:35:06.498Z |", - "| 9 | delete | 1 | 1970-01-21T01:35:06.498Z |", - "| 0 | insert | 2 | 1970-01-21T01:35:06.498Z |", - "| 9 | insert | 2 | 1970-01-21T01:35:06.498Z |", - "+-------+--------------+-----------------+--------------------------+", - ], - &cdf, - ); - Ok(()) - } - - #[test] - fn basic_cdf() -> Result<(), Box> { - let batches = read_cdf_for_table("tests/data/cdf-table", 0, None)?; - assert_batches_sorted_eq(&[ - "+----+--------+------------+------------------+-----------------+--------------------------+", - "| id | name | birthday | _change_type | _commit_version | _commit_timestamp |", - "+----+--------+------------+------------------+-----------------+--------------------------+", - "| 1 | Steve | 2023-12-22 | insert | 0 | 2023-12-22T17:10:18.828 |", - "| 2 | Bob | 2023-12-23 | insert | 0 | 2023-12-22T17:10:18.828 |", - "| 3 | Dave | 2023-12-23 | insert | 0 | 2023-12-22T17:10:18.828 |", - "| 4 | Kate | 2023-12-23 | insert | 0 | 2023-12-22T17:10:18.828 |", - "| 5 | Emily | 2023-12-24 | insert | 0 | 2023-12-22T17:10:18.828 |", - "| 6 | Carl | 2023-12-24 | insert | 0 | 2023-12-22T17:10:18.828 |", - "| 7 | Dennis | 2023-12-24 | insert | 0 | 2023-12-22T17:10:18.828 |", - "| 8 | Claire | 2023-12-25 | insert | 0 | 2023-12-22T17:10:18.828 |", - "| 9 | Ada | 2023-12-25 | insert | 0 | 2023-12-22T17:10:18.828 |", - "| 10 | Borb | 2023-12-25 | insert | 0 | 2023-12-22T17:10:18.828 |", - "| 3 | Dave | 2023-12-22 | update_postimage | 1 | 2023-12-22T17:10:21.675 |", - "| 3 | Dave | 2023-12-23 | update_preimage | 1 | 2023-12-22T17:10:21.675 |", - "| 4 | Kate | 2023-12-22 | update_postimage | 1 | 2023-12-22T17:10:21.675 |", - "| 4 | Kate | 2023-12-23 | update_preimage | 1 | 2023-12-22T17:10:21.675 |", - "| 2 | Bob | 2023-12-22 | update_postimage | 1 | 2023-12-22T17:10:21.675 |", - "| 2 | Bob | 
2023-12-23 | update_preimage | 1 | 2023-12-22T17:10:21.675 |", - "| 7 | Dennis | 2023-12-24 | update_preimage | 2 | 2023-12-29T21:41:33.785 |", - "| 7 | Dennis | 2023-12-29 | update_postimage | 2 | 2023-12-29T21:41:33.785 |", - "| 5 | Emily | 2023-12-24 | update_preimage | 2 | 2023-12-29T21:41:33.785 |", - "| 5 | Emily | 2023-12-29 | update_postimage | 2 | 2023-12-29T21:41:33.785 |", - "| 6 | Carl | 2023-12-24 | update_preimage | 2 | 2023-12-29T21:41:33.785 |", - "| 6 | Carl | 2023-12-29 | update_postimage | 2 | 2023-12-29T21:41:33.785 |", - "| 7 | Dennis | 2023-12-29 | delete | 3 | 2024-01-06T16:44:59.570 |", - "+----+--------+------------+------------------+-----------------+--------------------------+" - ], - &batches - ); - Ok(()) - } - - #[test] - fn cdf_non_partitioned() -> Result<(), Box> { - let batches = read_cdf_for_table("tests/data/cdf-table-non-partitioned", 0, None)?; - assert_batches_sorted_eq(&[ - "+----+--------+------------+-------------------+---------------+--------------+----------------+------------------+-----------------+--------------------------+", - "| id | name | birthday | long_field | boolean_field | double_field | smallint_field | _change_type | _commit_version | _commit_timestamp |", - "+----+--------+------------+-------------------+---------------+--------------+----------------+------------------+-----------------+--------------------------+", - "| 1 | Steve | 2024-04-14 | 1 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", - "| 2 | Bob | 2024-04-15 | 1 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", - "| 3 | Dave | 2024-04-15 | 2 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", - "| 4 | Kate | 2024-04-15 | 3 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", - "| 5 | Emily | 2024-04-16 | 4 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", - "| 6 | Carl | 2024-04-16 | 5 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", - "| 7 | Dennis | 2024-04-16 | 6 | true | 
3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", - "| 8 | Claire | 2024-04-17 | 7 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", - "| 9 | Ada | 2024-04-17 | 8 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", - "| 10 | Borb | 2024-04-17 | 99999999999999999 | true | 3.14 | 1 | insert | 0 | 2024-04-14T15:58:26.249 |", - "| 3 | Dave | 2024-04-15 | 2 | true | 3.14 | 1 | update_preimage | 1 | 2024-04-14T15:58:29.393 |", - "| 3 | Dave | 2024-04-14 | 2 | true | 3.14 | 1 | update_postimage | 1 | 2024-04-14T15:58:29.393 |", - "| 4 | Kate | 2024-04-15 | 3 | true | 3.14 | 1 | update_preimage | 1 | 2024-04-14T15:58:29.393 |", - "| 4 | Kate | 2024-04-14 | 3 | true | 3.14 | 1 | update_postimage | 1 | 2024-04-14T15:58:29.393 |", - "| 2 | Bob | 2024-04-15 | 1 | true | 3.14 | 1 | update_preimage | 1 | 2024-04-14T15:58:29.393 |", - "| 2 | Bob | 2024-04-14 | 1 | true | 3.14 | 1 | update_postimage | 1 | 2024-04-14T15:58:29.393 |", - "| 7 | Dennis | 2024-04-16 | 6 | true | 3.14 | 1 | update_preimage | 2 | 2024-04-14T15:58:31.257 |", - "| 7 | Dennis | 2024-04-14 | 6 | true | 3.14 | 1 | update_postimage | 2 | 2024-04-14T15:58:31.257 |", - "| 5 | Emily | 2024-04-16 | 4 | true | 3.14 | 1 | update_preimage | 2 | 2024-04-14T15:58:31.257 |", - "| 5 | Emily | 2024-04-14 | 4 | true | 3.14 | 1 | update_postimage | 2 | 2024-04-14T15:58:31.257 |", - "| 6 | Carl | 2024-04-16 | 5 | true | 3.14 | 1 | update_preimage | 2 | 2024-04-14T15:58:31.257 |", - "| 6 | Carl | 2024-04-14 | 5 | true | 3.14 | 1 | update_postimage | 2 | 2024-04-14T15:58:31.257 |", - "| 7 | Dennis | 2024-04-14 | 6 | true | 3.14 | 1 | delete | 3 | 2024-04-14T15:58:32.495 |", - "| 1 | Alex | 2024-04-14 | 1 | true | 3.14 | 1 | insert | 4 | 2024-04-14T15:58:33.444 |", - "| 2 | Alan | 2024-04-15 | 1 | true | 3.14 | 1 | insert | 4 | 2024-04-14T15:58:33.444 |", - 
"+----+--------+------------+-------------------+---------------+--------------+----------------+------------------+-----------------+--------------------------+" - ], - &batches); - Ok(()) - } -} diff --git a/kernel/src/table_changes/scan.rs b/kernel/src/table_changes/scan.rs index 0fe5c989e..9b4faea04 100644 --- a/kernel/src/table_changes/scan.rs +++ b/kernel/src/table_changes/scan.rs @@ -206,43 +206,6 @@ impl TableChangesScan { column_mapping_mode: end_snapshot.column_mapping_mode, } } - - /// Perform an "all in one" scan to get the change data feed. This will use the provided `engine` - /// to read and process all the data for the query. Each [`ScanResult`] in the resultant iterator - /// encapsulates the raw data and an optional boolean vector built from the deletion vector if it - /// was present. See the documentation for [`ScanResult`] for more details. - pub fn execute( - &self, - engine: Arc, - ) -> DeltaResult>> { - let scan_data = self.scan_data(engine.clone())?; - let scan_files = scan_data_to_scan_file(scan_data); - - let global_scan_state = self.global_scan_state(); - let table_root = self.table_changes.table_root().clone(); - let all_fields = self.all_fields.clone(); - let predicate = self.predicate.clone(); - let dv_engine_ref = engine.clone(); - - let result = scan_files - .map(move |scan_file| { - resolve_scan_file_dv(dv_engine_ref.as_ref(), &table_root, scan_file?) 
- }) // Iterator-Result-Iterator - .flatten_ok() // Iterator-Result - .map(move |resolved_scan_file| -> DeltaResult<_> { - read_scan_data( - engine.as_ref(), - resolved_scan_file?, - &global_scan_state, - &all_fields, - predicate.clone(), - ) - }) // Iterator-Result-Iterator-Result - .flatten_ok() // Iterator-Result-Result - .map(|x| x?); // Iterator-Result - - Ok(result) - } } #[cfg(test)] diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00000-3cb5dee7-9ab2-4b6f-bf66-fc2a968c4feb.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00000-3cb5dee7-9ab2-4b6f-bf66-fc2a968c4feb.c000.snappy.parquet.crc deleted file mode 100644 index a1dea1429683aa46ab2f44d32556edbfa031778a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28 kcmYc;N@ieSU}Es9FO%5DYd1&HN-!=*W0CnmKumAu6 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00000-47bab5c8-9d0c-41b6-9f56-d45f47d5dea5.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00000-47bab5c8-9d0c-41b6-9f56-d45f47d5dea5.c000.snappy.parquet.crc deleted file mode 100644 index e26d3add58bd1d8aa84bf9c20cbf9b398aef21bc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28 kcmYc;N@ieSU}BJ1Kbt|&)I2M2yYzv{H)geItYQ5C0B=zVy8r+H diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00000-94321f1e-f3e8-456d-ae43-5bf5b4c36a3d-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00000-94321f1e-f3e8-456d-ae43-5bf5b4c36a3d-c000.snappy.parquet.crc deleted file mode 100644 index 5ec7564c505cfc172a06e21a987a803fe047ba96..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24 fcmYc;N@ieSU}7i`l>hQi*T3@cquP6G+4>{_UknKx diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00000-a9118234-f574-4613-b674-deb4d1b82aee-c000.snappy.parquet.crc 
b/kernel/tests/data/cdf-table-non-partitioned/.part-00000-a9118234-f574-4613-b674-deb4d1b82aee-c000.snappy.parquet.crc deleted file mode 100644 index 77e39b81d4e27661219185bdbbe580dfbd31da24..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24 gcmYc;N@ieSU}EsPDtBR*yT*Z&+xcH)f6s0L0Asuf_W%F@ diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00001-69317b6f-2e84-4ae7-a847-cc5d1415f35d.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00001-69317b6f-2e84-4ae7-a847-cc5d1415f35d.c000.snappy.parquet.crc deleted file mode 100644 index 997e9db293a7e5e51a8ac707196e2cd6286ac674..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28 kcmYc;N@ieSU}AW@Q0C$Zw~mUQA_I5%IUj06qZUsG0FFBgKmY&$ diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00001-75bdbc7a-6029-4166-bf76-1987f87901f1-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00001-75bdbc7a-6029-4166-bf76-1987f87901f1-c000.snappy.parquet.crc deleted file mode 100644 index 9c7d70c58ac9ef5a4d2bcf7e0aae10e13a656e5d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24 gcmYc;N@ieSU}8vgdCu@{v3Kd=N459Xvh_&<0AvdZdH?_b diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00001-db3fa6b7-6267-43be-a1bc-7a81e4a5ddce-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00001-db3fa6b7-6267-43be-a1bc-7a81e4a5ddce-c000.snappy.parquet.crc deleted file mode 100644 index ef39d10bbd547a5ef5b591ff49217ab62bb84812..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24 gcmYc;N@ieSU}Dg{-Iuw`WJB6+hObEzw){H=0AmUYSO5S3 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00001-df7ec87d-5e3a-40da-be02-282f3629694c.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00001-df7ec87d-5e3a-40da-be02-282f3629694c.c000.snappy.parquet.crc deleted file mode 100644 index 
1303651224a1838491b966cedcd26cd67941019c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28 kcmYc;N@ieSU}CtnM@QsL&r_F$&$V|>zA>v!V-4#E0GEagQUCw| diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00002-05c18098-92f8-41f0-89d4-0d73a5d5b971.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00002-05c18098-92f8-41f0-89d4-0d73a5d5b971.c000.snappy.parquet.crc deleted file mode 100644 index 7526ee54eda1b51950905f7c825c2b4b93698468..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28 kcmYc;N@ieSU}ESuX3Nab2s!?CN<+@%8?)Lp*06p60DZ9v*Z=?k diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00002-a34b39da-a6ce-4459-8ea5-ddd162a84e94.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00002-a34b39da-a6ce-4459-8ea5-ddd162a84e94.c000.snappy.parquet.crc deleted file mode 100644 index b2547772ce81e13f55794138ebd4df957f98c916..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28 jcmYc;N@ieSU}6wuH{?yqSW%H`KXdN2uhO;a8nZG1Yz7Iw diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00002-fce8caa1-4b6f-47af-a2df-a50985051257-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00002-fce8caa1-4b6f-47af-a2df-a50985051257-c000.snappy.parquet.crc deleted file mode 100644 index b5de62a3de667a8490a9034859c38595b4194138..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24 fcmYc;N@ieSU}CW37IBtNDla_zsP^7kwmwM!N|pze diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00003-7920983b-c6d6-4703-a93f-c1bf180e7008-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00003-7920983b-c6d6-4703-a93f-c1bf180e7008-c000.snappy.parquet.crc deleted file mode 100644 index 14e9fdf09f3a05ec7abb97b0edac743377c75053..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24 
gcmYc;N@ieSU}C70u!>rIY%=5FN459Xvh_&<09tGaw*UYD diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00004-5de927bd-552f-4462-94e2-e7987a580b6a-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00004-5de927bd-552f-4462-94e2-e7987a580b6a-c000.snappy.parquet.crc deleted file mode 100644 index 40542b6be889719ecf700cfc1a7b4458bd8758d1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24 gcmYc;N@ieSU}89XyOwu7BOl+%?ffsYzh^fA09?fg{{R30 diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00005-9443cba0-84c4-4dc4-be61-76396629a546-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00005-9443cba0-84c4-4dc4-be61-76396629a546-c000.snappy.parquet.crc deleted file mode 100644 index 37d56abc452027394a4a87d85a4c6d6cfb98b72f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24 gcmYc;N@ieSU}Bh8683(|YA^A_k81C&W$Tj!0B1-Ei~s-t diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00006-a200a57d-283c-445d-acd1-d2642d1a778c-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00006-a200a57d-283c-445d-acd1-d2642d1a778c-c000.snappy.parquet.crc deleted file mode 100644 index 5c6926f37065b51786fff2bff0109b4068611f58..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24 gcmYc;N@ieSU}A9T_$+Ji=Z;K(@M`aA{13MQ09#@R&;S4c diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00007-e6053215-0e6f-4891-8dbc-812b7274a4e5-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00007-e6053215-0e6f-4891-8dbc-812b7274a4e5-c000.snappy.parquet.crc deleted file mode 100644 index 38b817e1ef42aec86dace3a147545cfbbb72b532..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24 gcmYc;N@ieSU}89|Hs|lM^Qxlf*e-VL`KWdS0B8CNL;wH) diff --git 
a/kernel/tests/data/cdf-table-non-partitioned/.part-00008-6fdf199c-d268-4ab2-9288-5a6503bac41b-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00008-6fdf199c-d268-4ab2-9288-5a6503bac41b-c000.snappy.parquet.crc deleted file mode 100644 index 2cf62caf4bef9e0e46340dd4083a6429a4344387..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24 gcmYc;N@ieSU}9K4$?>OCb=<4n3}2HbZ25N#0BBkY%>V!Z diff --git a/kernel/tests/data/cdf-table-non-partitioned/.part-00009-24d335c6-4da8-4a23-931d-168b2821adca-c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/.part-00009-24d335c6-4da8-4a23-931d-168b2821adca-c000.snappy.parquet.crc deleted file mode 100644 index 9b93f676dc8138675f0d4957294518aa4a14a3ee..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24 gcmYc;N@ieSU}Bi-K3#YI_D9BtAJyJl%ho3e0A@W2r~m)} diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-4a1d2de0-dda2-4651-8239-f53ca8761e0e.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-4a1d2de0-dda2-4651-8239-f53ca8761e0e.c000.snappy.parquet.crc deleted file mode 100644 index 1da6ec9e30c8d83c2e3d88d45b7297ff43e1bbb8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28 kcmYc;N@ieSU}E?%^V)jBnOSw~k8e}R4D8`xs7!4I0F>1V*Z=?k diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-a0f26ad2-e42f-4ee9-9a42-c551810ffef9.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-a0f26ad2-e42f-4ee9-9a42-c551810ffef9.c000.snappy.parquet.crc deleted file mode 100644 index ae94613f9fe4305b7ff4c038ffc444943bbca9de..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28 kcmYc;N@ieSU}6xvZ@a|VMB;TpV)D_Iid~b%70z%10D1cfcK`qY diff --git 
a/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-cb45a6ca-4b46-4293-b0ab-5b9937a8bd12.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00000-cb45a6ca-4b46-4293-b0ab-5b9937a8bd12.c000.snappy.parquet.crc deleted file mode 100644 index 796311786525ecf4b0dc5d1c9dc85fb850c80e64..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28 kcmYc;N@ieSU}6xC6uFhQ=GoLa)#kocX(9{PZ1D>K0DmP4Pyhe` diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00001-3083ab22-0fc6-4ccc-9053-48dc314be509.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00001-3083ab22-0fc6-4ccc-9053-48dc314be509.c000.snappy.parquet.crc deleted file mode 100644 index 20ffcc417dfe2d6861030d5346f012b9c5927ddc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28 kcmYc;N@ieSU}D%4bL;HZiEADCKHxyk@$XIz0I2H?xBvhE diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00001-c07209de-cdb1-4dbf-8e90-e33319ebf581.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00001-c07209de-cdb1-4dbf-8e90-e33319ebf581.c000.snappy.parquet.crc deleted file mode 100644 index 90d40e3abf3db72f5f71bde46eca768272da1d76..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28 kcmYc;N@ieSU}7-zn)K{-b*b&%b`^exE%Pp9C}`RP0E0;h&j0`b diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00002-67b4ede6-b16d-46d1-aefb-cd68c1672e02.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00002-67b4ede6-b16d-46d1-aefb-cd68c1672e02.c000.snappy.parquet.crc deleted file mode 100644 index 287fdeea8cd34f2abd0155c45c8c89ffb15158b2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28 kcmYc;N@ieSU}7+EHM_EAbK})k=7=e;G#h7pxfFa70EjLN@c;k- diff --git 
a/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00002-d6ab6a1a-7a19-47f1-a938-58954786d23f.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table-non-partitioned/_change_data/.cdc-00002-d6ab6a1a-7a19-47f1-a938-58954786d23f.c000.snappy.parquet.crc deleted file mode 100644 index 47f198ed0d28bc984aeea421c6aa03ce6fecf7fe..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28 kcmYc;N@ieSU}9ieV0&$|;*Z2r@3QwXY?*f^(8KsgtvgIUAEei$20#%_3EcwV9d#1L<_|us& zP$iL26uRny4PSKuU64)a%0i_pgQ_l2E3vBB(M4B)ga9cEng#dH-;Jd@ay;*y``)?d z-On3uz5HgGvXrA)`qi)B9OP;=L0LlB>>MG45}K!r^yYuxf1ZLLM2+DpY476?vxr@iopgP4iMc)|`0Z4BmuXo^BOjF}lm6c{E3a6x=veK3rfSquPpapMr_ z$_EDu4MB=OT6*vwNCH^)O4v!gn(MR`_A$m7SH+*!=hUh324v$iF2NwYO z_1CiYK(pSf+s(Go)o32TrG${^=!9lr!1lDRu|zf{cR2uDqhmR3UAwZB-82zo0nC{F zUaNkoW036tbf4Y`xjm zdM%^wTp0u!cvh6eYf4oD?W<{)6HzKn@GNigj#5=P$twcyn^%F;B zqKsLIW_;ljRS`-bv&vN#qzAnejZpeYbc*yZ(BC6h`Y08Z%M&H%dc@KhrIfuC6c4tj z_l-G-W>Y-eL=QLSF%?JP0gDHdJ5jbS|5XOEag?g9k>w}*TR3qC#A9{`J^V6H-y!^+ zG|ws=|9g57E`g#T^sSbv4K(Ol)ooz)eZ#Ul{hpAjsH@m42+m1DCI&x8Vo7tADZ*R+Z1%Roe1MDw@mNi<21we0Tni{mV zT!o!RQ4UDu>h|t#G2HBOgxkSOr`MW&cW~!K0`NsJ`~K*9iWm@^onhiWi%5KUcAoeh z7}0p>>@1VlX2fH+`xYi|#-zmW*BHr6$$58|2p9ZnI@`4he)$#3YPq783(JjR$J7gY zW2L-aDVNUctBrG&6{ER&zEaZH&y||GW|Y^;#!6{j*Bjay@fy5grhdq`@GjnhpJo38 DonIWd diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00000-a0f26ad2-e42f-4ee9-9a42-c551810ffef9.c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00000-a0f26ad2-e42f-4ee9-9a42-c551810ffef9.c000.snappy.parquet deleted file mode 100644 index 30adb9956b391702af138b3dfafcbe64cd46f4b2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2242 zcma)8O=ufO6rNqLq>ZY&PU0DML9}9tMFP$uAv*=%&4Nb6RIkZ{ru09FUyk3?2_ODI_7Z}pTR$V`sZf_Nv&k0 zY6|Q#nje1qbw}FHSYG?+?{~g^_0$@V=l&;mpS~oiW$`E?wMP#g-In8#b91RrV``pO zO+#bV;4iG21QrY~rs?N`w5746!6M0BSm>;1>|+|aC|N0rX();W=vhjjpNZ18#!>+< 
zNk+m;k%w+@fm>9Gj;})O$gGksUgy%Gl7kNw+~w$zir2V!qT&u0k7sh-0}XF56u7^- zR*ANq(T&ZhzeR@*o&#_}HB`?n48&X==i=D#3{H?>wI%|?lfa%zAKykEhiYz z&3Jb^i-D+S)x1^Lp*76qq7DUwP_ar>@+P@j*W2lFr%!^;Cx@S|pmNY#kRC$haS6|p z2O73fy<5c<&XnYuaCOLfz1z-uBzprN@0tveIiE7lg95&U>W7L5FRxb~slvffIc zEK0GG89&!CA>S2Pln;SX<6%n0PB^cVF6_5OQwj;V^Hvq8?Bfpe@mB%Aj_OA$6o~tn zJio+Eo2-FK8+shRK%Bf~s$)j=brD;regafwPBn4t3DY&w0SyQ^n+hSZfS*TYFTXP{ zKNVTP$Lzx5UY_Zsyx7MxP;540>5~;(tYqHDjQ8UbEB1jVdSapn6V@)-01ar4bWpQ5 zFh7IH>>r*2_Io904sur`YF#J4X!9`4i-r-72DY>9!1&q8B-%IWDDuLGbyEo)HWT zo1S6fViF`Ao}Nd(%mht}PR}xV{|TOGD;%-IoAOZPi&7$)>TurMN5*yDO&5ow8gIW^ z({8rxX6<}$?D=lZ?JYG|TFu54ce!`DwM6^NS6U5s<#MC%I<$GINtYTcuG@1in781k P;)P!n!Vl;={CoKy0~PDI diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00000-cb45a6ca-4b46-4293-b0ab-5b9937a8bd12.c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00000-cb45a6ca-4b46-4293-b0ab-5b9937a8bd12.c000.snappy.parquet deleted file mode 100644 index 447df150cbd749981590d7ceb93f0e1e41b3886e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2442 zcma)8O^6#+9DkWiva>O6w|3uS1_liSJ8U5x60+N-OORDONLkD3(GS8TGfkqCkIqb5 z+U*`I9*T$;1y>JVdI=)3NE9g^qz4yo_9iGETU_vi9=416|Ic^2fhHs~@BQBY{r>PDkim$jSt<}B$Q&%D(5aGZ#fJHKVTA z=@uP-@Xe)!tTSz2)(M~(4h-Ej>jT^DboG|GNTgG8-*LT(bdr-q9#a&&C?_$PcreBu zU@rv;7()p#Mm1qAgpriQXqZ?|r6e(xN&rF_8?X?&#LgGP!$b!VPlQl# zzJNq#Rr(?vg-9~nOGU$nDS9zLkGKfElcpO1I_4vEl!=B18F~W^fPDAmyfe`4cj`{F zZFY6K2;ed&avg)vJPi1r-ZhuWrtFymAT&C*+cxxT%lS^jsfYE^LlYEuKVH^qsFa+j_5M*4=A^Py^#7 zS-PcGWzfEsFh*X58;VFmxfiJ>#b99N_&r*%3 zd(a5^fYUp*$Lg{hu%ccETC5!ZiL~)-3hkFU%Gk$qNc#|IQK|s>k7DOhUtkwuwdU8J z$YUWQF;=2KgJgqHNC+(4MaH`x;}Hj~5eQ8L-s7~{b@hl-f8gY}ZsQTqN7WpoQ}q9W z-u{`R&lB}d8bk#8C$s7~tg#I+sX;fu7x=w*Qutp;^_4&sqKurR|WQab_sgFD2jc% zrRf75npN}qR=Z-_PN&}!GZk%3D-rSZF$?Fir9q2d9-6LZ_pfSQ)79Y))x|x2Tfx>+ z))mLKhfP;0Dqase%K97I3U)p$bqs(5Gw4*aW!eDrhL)v6H!JI~(<~|>sj|Mkvr~*V zdmPbr@Y3nEX5SrNHL(D^`)A)D-%2q9VzV<$Tw^gw0ME{oy!v7qADx|L^0th5oOa*F zZ`h&2^Zj;5$k7hTO 
zr)?!pduWCK3E@$4~NKEloaxLO__IG-LOGvUapPBi5 zKl7c>cartzU#$>K1d=EJT)Xivt`m-6gm9im2oXdIWS)F=^v%b!GR(pxLj;5bnS;S} z8l?ewics?F-Gd*vVC77at$eX-*&8P>M?@w?^3l$RpUg3SN+=)<=fPOD>fby5IuffG5)KqLP~#Yv^_iePbcNW-|~2 zn6vx6cH^>T+OP}(@MN>!w@tklVmZL_Mt{__%@AT?W}U8X+g8u@!G+@5tIP#}TWMDjgOmu|th z`XeXD-i=#8A2~WkC+Po;Mf)#HoN|lb zNIk$Z$pym43=v5`!0LM#q=#h`x0v+f=mhB>JOT0U3rzYRA3NnSw|NnnYxa2cT^@YH zs?+~wXAt)$MEH_L*qvcVaR?EZZ`k3EIo9KUKLw|8$k)%I#m7rqJXZe6oE@+TKh2Og z5dA(wu`1BtGxJdYB~k1T+nPSmp%+~USYS~yhUv3$?SvglRm&z-K(bUgK*Wt%y;v0RIL1-QR Gj{OgPbQ)&> diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00001-c07209de-cdb1-4dbf-8e90-e33319ebf581.c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00001-c07209de-cdb1-4dbf-8e90-e33319ebf581.c000.snappy.parquet deleted file mode 100644 index 809770b5fdbe4aa2f77781ad0daf851077098cc1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2444 zcma)8PiP}m7=KA#(n*ZntlKxtz@TBELl?3`LfYMK2vR-CT9@k4KZHqUHVIDt?94=2 z+CmW&4>@?!O9k13dnwXGDOGxqB0Vg4RJ^&T^&klLAjK8+`{sYPflWwe-uu1p{eIv3 z{=9bMmA5L4GLGfh>2Kq4zRr@25<>IvAdIjATVUV(efY(^1Xh$~7)J;vC1Ej@A}K)7 z;Om{!pR@i>PQsn9YmV{k-GoW3$bS0oyS*&Z6OVp=*`s&fAjF2BFY@X6!AT|*Nr4D{ zW)YcZFR({Xe%(vNMCJrfBsjv4|Li_UhwLwX{QH|nFJ{-C&F+Qn{(~of=EBe=X7uZA zw!w}*`}R)Et!o|6txG^L8tM?m#?UglJ*{mNhZf=G%&IE*bP5~7eu!~iac4cG_6*fPb#gNhr2NW2gnCOm*}DS-U* z`6LowW3R$dh$!)giEwzHV7GmACl#V6DYoXL$23Iu)8X(e&9=}0$ai1M+e6KIzhSpJ zMo(h}02h-&va1u8hXL(tJ!6rqORhNpT(fIA9bLP(m|r&$qyXm3LBHL&(=|-kMgVxe zIT)CR)(@~8V0nEoYMMp>q0GG9(@eA5cRX;hxW0F#a2Z)qJKtz^w0_%YIQNEu4?Gnm zajeuN@O>piIT2lDNuKg1?@U1SrYpLUxfcE#}@cMGE61P zAv8kX=X6i)vO4@0tSDPRit5{% z6bliFu@b!*BpV!sguuc>WPIo{?xdhKd`Dw}58d5KDW^#(s@rG;^kFqe=oJ4Sq1yQz zdzmOFphkTEC$s7b*7-FssY5rw7x>+G-0{DV>X2hHQI4s^a=zme#SprFOqGu)xE{1n zG(y)eqElQSKL>ihaOnDeI;xVVD$P~IirG&q|AM01ncfp~fPL!CrYPJ;h0n8?+%r(X zn!)5wRIAJXNdk)JEM31&mY!~F;X>=jV|IiJKV;dvg#SLnQ-$LnXBMFMOM)=4+Nw6x zpjlP7Z`BVB%kB>PLb|H1sAVEty>HlPNO6Tq;hR@d%F~Fb~(cB 
zpwjKPXWt!MHIV~&_s_mRx|JdZ#AauhxW*z9AD*2je)UB(9y&YAlI_UysGO>?W#D2KP*$f diff --git a/kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00002-67b4ede6-b16d-46d1-aefb-cd68c1672e02.c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/_change_data/cdc-00002-67b4ede6-b16d-46d1-aefb-cd68c1672e02.c000.snappy.parquet deleted file mode 100644 index 2059c5f06e0fafb23d204ef5ae5848d064369f34..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2437 zcma)8PiWg#7=M;!Tk$;CUGszlVl#q>bhyICNt+rovYtlCO19(vU}Q_R&1B1!Am0g-tYUp z@6Ss%-+Z@5S<2A@J^u37M_iLKlqG~MFoY0FXo)V;$-m!!QIud6OS6{kE4%ILQgV6FdR3|MYzKUU&jddhPSS-nswk!usj#e&oKn_w1j1 z6gkv(`c6XMqnq^pogc0zWL@a_vd#d-c%*Bt(HhxCf1q`Z5)scymg9O8@hl?>EUpb+ zlo5HDAe@HS`_BZ(VkBWL1~mb5VH`yv8YaxClqjT934jYD1M92ldM!YYnFxKDrt1NEz((jK6Ah0tbQ>1{`R-c< zXQbI5wVY1R7-+Nv;1VM+eVxz(4A`MIFqX)M!8GtK_c^}(g`#@@NoS!6}ug;uAh4ZB9m zy*Uas@T@3_x0Qwj+E=qIC!$o!@GNiguF_CA$xpxkKGFXN9BK+Mv zUF84HQk5wC&<2Ho(>t}tYV)6hMcJ0T^6?%(ilyX@XHZV^l;`i}=<7sz3X~|5gWN}P z^JuT~*TCBJ>rU{B5Vc|z`ZGv0h=gQ-$U|g2%%R9*2AU!enFzYa>GIoPR}L8|uGx45 z^id_p=nVaj(A$$7eS;_u(;ybec@7@EuJoUn1ME?M zHp9a%dbqQIsW<`;STC5|sY>1m`N15hDvixF83n$t>8ME)u!-EC-0pTBHc~;^0 zKeCI^_Z2~~?5?VfG-y=S>sj?&T7ww`A;kF!gnSgJsaAM%S-R-gHDzku+yl> zA*tNh+S#c@n>~(bJ9z02yYuf3ubEf?-umUe?U#hPdoz=_rs=juq+R-(mcCluxRM&L9t(_Nd!ylEIU-DI!kWKgp?0F1wBX(UaSZaYA^LrG++r)B1jbv1y5cC5ACTJK`;l2R>bd{|Jww*VKejI?|tw0 z``-8G?XJD}dWBKOvpoCi+wXqK)>(>CLTElq2w{X3*c|)vpS@3JWU!(v%XmU~IR%U9 zG)V)320z^V;=@zHPF{xXmPqQi}b{&pIP+jN3Rm%z<(g}_vc^# z#zi6-5W&AmL}u8t?BU}FcesSel<13uNBH^Qo!gm+{rUHQe{KJ{>6NFm`=Psa`|+Q- zD0JL}e!aog*#1Y~TuHcfuH(Bk3lyWFp}S^dXq(-h-Zl$FIwKDp*PBSEQ<6wAN5PAw zBmpKKjKK%k%RvIhSOSb;O<40`BqcE#CYCu)5;-mb@L_DgJ{%^NDIOkFycopdh43)Z z0mRu53eFdh$U=?11VJ!p$6; z6aL6Dji`Ij2>F21JGIB^3LCJZZU8M-j{ij3cyNSUhoFEpgUKDMR*(M!4vOb6Q$J4@o@{IJSnDTawvP%wPP4a&@O@UGDla_B&Oz^& zL~&rZHGQZFRKY>f&9x zu3&2^HN|o5QOi|Iir2%AQhRep*ViIn2dQt<8 diff --git 
a/kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000003.json.crc b/kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000003.json.crc deleted file mode 100644 index a4a1dee7a0ac284130dffb7957d2f95726ee6606..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16 XcmYc;N@ieSU}8vXTYh}~YF$?VCT|5{ diff --git a/kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000004.json.crc b/kernel/tests/data/cdf-table-non-partitioned/_delta_log/.00000000000000000004.json.crc deleted file mode 100644 index f559c66ea438096c1bb9bf7ffab9243b9fe8e159..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24 gcmYc;N@ieSU}DI#oyz@c>!QE^mu#KL_V5OJtP_i+4*v7t3dXk>k9UP4O^nc&` zec$`u_dRFx&C4}P=mOQ~-_QTJ_ezP%@S3A4A%)X2A!J6%!xOYJJx#xP@bjKvG2e_W za$xbcmNOR@2gWUZSZ&=eEj7*L=la7~Tr9V~cy0b9QqgyDtGjLa zJ!fmUx0A%cWWA){tf)`~`btiP3_++=W-@z=U8tzd_=w{tBWI!mkKZ73P(`pkLfAM@ z7ua_SHA$siqD4-M_8PHTY_rmCCUuc~1CLiqqKM3&koioZtE6%$W#OgyN`pL6=M%Dz?>pjnqRZ4mTKYsZ52GNVp2uR1~K4t#C4ZmyVBqMd!gBHZ;% zXv*D@Gc<$2HM8#wE!!Ge+I?l;z-BNSMmP*c-J!8$L|qpejraEr>{Xa@ZIHPc^p?|e z0ucO>=ULEyMgvwlOGW}U8v6$aOX=c>kuC=>uHTz}b#ggm0YKc&^y{;GBqN|UJz?VF z$sk;wo+DmX875Yzr%c{&87JHx1bBE;9*TGo3XrJ|=gI-mF7ax5qaT!c`IT#Ct!~!J zXF4O-v&(j8xwclXRnOZiopbeNr@L~#UbWZGRlBz3)K+WGa&^tNJJwnKCj1oq?}xk# KKZ=|1f5^X_k=gVB diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00000-47bab5c8-9d0c-41b6-9f56-d45f47d5dea5.c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00000-47bab5c8-9d0c-41b6-9f56-d45f47d5dea5.c000.snappy.parquet deleted file mode 100644 index dc29aa1f310c4d0b4a6ea58ad5b240886d09b6f7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2175 zcma);L1-IC6ozM4Yk8xp#z{QGE{H@dwMf8OBxEO!Fvcw<2|*QUucerkb|Z_fBzsp% z3C0&gX(&1L6nYRXE)+@$Ap{?C@S&xYme50cDwH0I3ndT;A%PxC-kY7>9W_2kh}F!m zH{W}2=Dm@wz4=~+5;{i>diRen_ml!vVVb2nA-OkHLP$o*z!S6rEz;e8_P&-)W)>q8 
z4NTs4Zg_(9;H&3%GDncYcM{T1-~7HeC7Hw|1`$-Vzxlg#4Gl*mz@&eDHh{PK5BGRe+yN5nj?nudm3AU>#S zGMI1~QwEu5F3FfC$UNIr*5lMnBN<9CMKKLUkpw+X0_@|V^tFb;gBPBI@B#>s2wJJc zEHIs3f>?3Xpbaq^DT#QfB3Z{|Uf|{+=*>EN|$G+33O-hXd%?lFG zm9wt0MyUZTEQaWbtaK)`en!?)&KhAm30SgENpK?th*(F_6N%|eCjL2185>PubcaCH z1X0@LB*;TxCIy&|fPI82`*@B%9!=3zQo66g#t0wd=f>Tvur)AQfqI5FfcdA4x}^i^ zR+d_%^bAlrff_q*PP)SCU;zQg9gvPN;8#`I%NOY7+bjlrfY*`m^1lj7Hv1q0n~m)h zJ#n+snatfs?&B|8%0EFJq_Z{dvs|G8*1uKSHwe6jq zg=8^jB+J1|r`Mi*b$ltL0YKc|BPF0VIbrO=Ng+a>oFiXSDJD`Ur;OigDJR_M z2Y7fB9*TScN|1>T=Yt($Tou)Ht{W6Z`4ua6rD|7-XPU!~?-t$WVr98nDPMG#nir~z qUTf)Mwd^imD7Re4tDLWRi{)k4Z8~So+wc=`!FkqL-i_BHrcTaZjyF4QA=A8 zK?Lc+Q%^m45u`}*ARc;A5yVryh#(Z{)gr~CLh;S+>_=@W31Kty-#5SapP4t4>G7L6 zjBq!W@y92x4o;`A2;DfA5bC@nB7`D>08h|TIE}yVeE%B{g(M4ZNEAZGP5OY7UO)f6 z772xUB>TEgI{38xCY;P8v1IH6q$ECvKRo^MEEXbl$lMb*^zO^{Lg=+ex%l|!jkgz0 zjd4FG%}nVM6*D}@6JQQ0DN@2E-dY!YaaZuwO%adrfPE1cd2=h`#zLOe$%H+xTX$#d${elf zIEk>>Bg$gcKsW)7XkD+-9#l-R-C%E|a@A_i8Tw*RqG&QP3Sy_(sL#&asZtY$ArQ`# z8x51{bx-RAEnzfTWs`amgC*^nZkpA4(-nKt#f|f4yMRhWC8aE9B-mADATGf=gCi^w zkz6Aa8EMqrEB8rIud_;S4^{>C8@vMKTO4GZ6TSd;e#58@hWZ@CD$48ws*|G*?W#kX zBt?h-!(KTDEoS(r8qV^bW!ZBKZ?VgRXZf#C{Kejiv|9>{`!WutKz1TT%t+)3xs;hG<7&2>IjUa{i+95 zN0wGr`u)KU!ygV=s`c6SM|-C?FaW-3`{RRy9uSz;KH<>K5J0%xJ|}#e1Q=IspK|yD z3OM#$!(y9v;-=hQLGl!@gUdB5%?l@;({gz&m+mdMs-}@P%7xreK9?Of2Fio^0<8=T l=d;GpV76lDGR7GTgF9zvx8_p=tPs{s;M?qtE~V diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00000-a9118234-f574-4613-b674-deb4d1b82aee-c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00000-a9118234-f574-4613-b674-deb4d1b82aee-c000.snappy.parquet deleted file mode 100644 index e4afd94337d5b1553ec8202a7a5dcfa733619db7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1965 zcma)-&ubGw6vt;bNxRk42)zjP9}o|{2_AA2r1)lcHoLW@B!u0W@4WfEH}mGrE?&Q# z!w3&x8Gn8H{@`>Pi_q=C5<-2KM1)XG5a0>gAXf3WKRXAhh{@SW-y{JhcU-zgIjZ%h z{VEo*3sBBY50%Cle6YLsG3qTf7J363^O(Zt@t0>mUnC-=UYVQ0IKzDYzPB0~4k#BN 
z|GM>Q@YF1i=cJ`6{h}hr*&_K1iwc1blM~*ef_G#f&Q=2U-hx7ViDB4Jkvq-EEJ09Y zL5K)a5GU9N&B*XEQKgB5=O8?!q)8Dk^47M{5%&dLWl2cx$H3O#*7KcPxteOZXp%JeeH9CX}Y4GQbNTsUnE}O>sP_ke#F%DLJR-;~7 zx?iOhOhX`?DK#1vHR^%Z2U^l>G)oo@B<3dN)C|k2)?Hs5$`syRIM)wUA}VQRQ!h!hImd zd5Dqy^q32zsO0q?LYgnRk#uyi(iC)Vw8 zVB8Nqa2&|b7!fsz{EZDj8I8(v!>;JYiUBpCd*z_7QQN6D>T)8lkLy_^j~uaZRxKSW zXSGRP-EKV8Yt%JN!!_jJu?^K-U!m%>>bQ2Z?5d-xR~n8weRo4;1qDM@6J%}%1wbp* z2BF@xECY%}oraloR1Ki&^v2fKXlJs==u8JM)q17-*+B(|0YKce`}tw9hXkf|PdIct zgb*%w&xzhBA;wp`ryRb7LXNZCuvzj>Ov=42BrouDaJgn@_`%8K^ju!gWrj=5s%2)( z(pYXXpUX~}}Z~srfu$!z+^@=riE14b)W8TDQtTh``zgzo!dUx80qeNzx(`t zclZ09z5LnNC5-R{*6{6nPyR{eungTaRuRgak`Y29QVO1+O=1H-es%qa5y9lxTw?Nh z+x0d$YH#=GC($lK1z$2$4(IW&Zw?>H0%~j~L@D5net1 z{=%|OphM*pTXXal1{AtbuxIN{?A2&-x)wKuS(2n{k|YRfv<$G1n!>O`!v`;H72FFT zKtj|?MtY8@cnPAVtvX)jtsN;AUrM;n)jb(6^5&6@YrJ_d65DPoc!8?GzH=!XtXclG zm7uxmv@JXi;*_k(u8nXO8cN4%J5#8ZW9A^J4cG6j+SbNYw&qb$0WsrsJFS&(T*rfH z2!u}>-LB_Yok+`ombJV6hUY{Qbu$*UEzfg1y-=LW*Y2MjKTfICcXp+@YIR!9N^fH= zih;;_PQOu5;e7SkvQ(G*g41{SRO)Te-uxU-xX}4 zLOqAGoW$13ST(X-s8@Mb@E6d+Ea82T_muKpEBG8L>_|yy@m|FN?nn?J&86^3BTRZC z@x>@6WH&`4dW^WO7Qw7w`G&WG-2v8Z08aQ%Bx?;#csMDI(^;@R3W>v+eXO+v-``LuJ#TgSpy&6SJ!9Hn6&DzlE1L$bQy6k>khvMumD6&35ITL& zv!M2j3e0q-jRPX{k&ZfpB?v zj(AZem{1*_GI+NooM5%<)59C`P{d16fDCmwSGSOMnP=14wx8$emoJ&6vRTTXZuDKx z&fATd(pF|9LL|x{&BQz%kn*DV5w!uMZtz$R`U-M+CfCN#kxkw?9QZ1vpYIFTZjsg zL=Zxqg1~za5*{Qx2vQM5Cp!e5M7(rJ!gB}d_vY_2C03R=(VD_MEaO-}sO=h$5Q=bg1TBTrczOTl%X5rLvez|^+#w*%7g*PRb%kvJgHKOiOXCH(ot{;L>6Y7t$BoEOl??>lqMYd@Z=&wkz6ywW=2 z#q;I)`(xIb9}jgVweyk~rBv^Md5QQkJrl^z(HNPY17znop)B~ZOJXo(U>qlj9LETP zA0^oPQ|5KliB5DLo}3~A9)sYJlp;l3^k%DEARcgmy3XSf&)|s11#h|)acu#(GM=c< zs@B7)dU-~#syK---^q)7MMF3NlW0w?>Yb>NqS>Htgi^(_XEb%bGf^<87zMG-Fl*CO z4=TC=+YkunOQvb)YR%W$Kuc(5qh#p5L}AH#RW*!C&347kbYboC#de?)UP?)e83D2? 
z^~D9aXNU;%geSMiSVkCjbLEZ%bvc=HS8ywkZ-^BjyK&q>o<^~PGQ|{5cqHdY4r_uG zGeuz0%(^3Gg)5eINLlL+Yn2Pb3a%pvZj}QfEedy}#BjwDpR`~?_8j5?5Tjmr%v}aS z9so0_rf>%AGgQ{cYwBYqhHs+GIu94)`8b&#$IUPq0h3`UPWS=Xd9tWAiuxAAGRhnQ zs?9^4$f`pcCk3#8fcXM0ycG$r`JX}wE_*8imrYy59qDC4gQL^_~_dPLE*rWJKoh5A>VR!|=4R=r}@#8_VGQ?f|xK4np#GZg5W zxrS~lmibtz>b9z>wkrOPEXel!tS%49b=zu`ZMjEw+M+HG-CK}pGr_Bh2C`=c#jj86 z76`S5VW`j`@(}FQdt@If4=pS$^#q$8Mz9^cRBF@B@AkVl3;^QRo8KQcdPtyJ^Mn&$ zh7jV(&2!kVNr-XP<|!vXKq03-V_GzM|4qvMB_vLGb#T3ErM<#Q=agJt$)&qWjf$bA zwNh_xAfL+)YJH{te6L>a8_Z|5f&Oe+Q}x`9oZg!q(6o};EiJ-V_Vf?EjUhA%|Dpc^ DUD2YI diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00001-db3fa6b7-6267-43be-a1bc-7a81e4a5ddce-c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00001-db3fa6b7-6267-43be-a1bc-7a81e4a5ddce-c000.snappy.parquet deleted file mode 100644 index e78a4c29def15715c0bbf5fa7ff6ffceb1d535dc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1951 zcma)-O=#3W6vrpY?l#tK{pt*fEV=|6TB%`|ZMBvn)q_Y`7xgMqvQ2Ckvzs=XRiqXK zQ9(R-wKq{bNlzkD1QiuLsvvmq(2HJ#o<&4_Gnph)719D zz#6{$^xDTQ1uVm34yy?D?voKhibO}ycH=$x#l4?jWWl7b6qzVsa?G!D(DUbyA8!+^ zVo-RJfeN^Yf4zP8WUFA(KN^7~0MQ?i`teTu>el+5tYFroaWhz_n3wP0EegvqW$*QG zr=RZGGR5P0b942V=q!$hI@5Q^x|X3-e#5d(xU9e{=^7mcF+<-1Y}w1CwrU)?t|e1p zmn7+$Bng5V#|ZZER9Mci*fe(Ia)GG$IE04u0;%CDpRGuVxGv!uS65{`#itu`V!Nh9 zwg9|+BHv!Hg9|h5*?G5V<9>wYL0OX<4#IhuWLkF99Yoav%>`Xi>y03sckIi9`Km|7 z42Zp+-?E${f9y646ch4r_aw&yikVI&R~tB-bV-vLx2 z>jnLasX|Wmv78F$3=v_O$mAF~ZmN?}rlOIcA(pr32~GvF4Y2~`LkMS^6Xrf8;PB zpIBt;K+N#agmYxfX;L%SftgfNIs^7URME$s9`JD^i;p046?zqMADdId+)R=wFqwqH zgbTpzE}|Y%)axuZkhuY!{1)i#+muJ+=R~GP^k{irNGh-y%!Aqkx*ZuCebJGAI zZnyjWX_Kb}s&!A;^i@b9T<)GDeo0bHq;^l){P9US?Rh_-$@_0o&MhNtn%BXBW>DmX zQ!HEMid8NS)jJK(DLVDh@_40OI_!+q4^>9p*|Ecwk~4m&H0#)I`C!={EsZ-)-5%Dj Rz!&!K3*FBmbPE2Oe*^i7pa}o~ diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00001-df7ec87d-5e3a-40da-be02-282f3629694c.c000.snappy.parquet 
b/kernel/tests/data/cdf-table-non-partitioned/part-00001-df7ec87d-5e3a-40da-be02-282f3629694c.c000.snappy.parquet deleted file mode 100644 index 6fc4437eea121e659f5095870be83edd31aace51..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2175 zcma);L5LGq7{}jCX0y{6msR&YW+2!wbc4b=BxG0Duq;~-R%{H~YmsG=d1gZ=$?nX= zBFjRtN<}>MP{czJ^eAnSvIh@)sFczoJ=lW>p$Aznf*@j{^i=%bn>RBr)kBsrdG9~p z`+eX0-uJ!i*^fV~V1#F|f$#kG^y|F^tim*lb%b&ssR*HrqQMii0xsfvKV1JRFPTiu zrzW2|Ar+*b?!LL1k<4OJa3&#LdGhafvPn$JfHU?7q$#`~|8o7=w-XX+(uf=?BA|zV zKHiXC$CMARzC8Ef;0G%ro?pND@xMKiSsafNIm+W{{LkNiJkLoc*%|H#nfp}J&{zt@ z7gkLI6E3qYgUoj>$!wb-^WCPh9&gPwlA#1s6w^=?Nzmg>fPFlaUTZ9P@WS^XyZ{0u z#H>_g7Kn~dLac1mzzs1PDv7wOB-XBL7dq5~We9}5 z&3@mbPA}GSpyl2Epy|zjFUtbnVRw!2^1lj7lD(6G zWV3dPo-EnvOy+Jg?u#r-_GK-tg^}9h)EZd^i>0Hi8IwYK#8%&�L?%r4hBHlLy)i zH0hjS^!>K&tUJ(FHt#U|BK3n#zh_KT?FGAxjKl9(1RI_W&A2h3q3!p-w7WEPTqksl zZEe$HO<>imAoK^V(3-P&w*^-1^G%EOD7@;pAPY0-DcYt!2)%*lInZ}j4OY@QD~4LN z&8@AuWHDzX%fU;h*B*a$d?};>K-_fv_38bQ5|}nVVdTO|AwnLXBVSS}CQ`?zjNWT0 zCs^zI?C{1s6!`*_AY&cQ`7LCe7S(j5>la1&6)Se7YFCPfnuCt#7TxB2WwBZ*A9oj; t$Ex$RwQ#&zb{CJ8TdqSZM=Nx`yy&`3=dgJLegfY6L7!w1It%}Lyalgf(K7%5 diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00002-05c18098-92f8-41f0-89d4-0d73a5d5b971.c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00002-05c18098-92f8-41f0-89d4-0d73a5d5b971.c000.snappy.parquet deleted file mode 100644 index 805de42f30f413008e3ba11337ada86bd7d44594..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2175 zcma);L1-IC6ozNlD`}&s#z{THE{IetwMf8OBxEO!Fve{m4M7!YuOXO~b|Z_fBzsp% z3C5Q|X($f#6iN{}^jJy>#fKbxXeosd3T^K`6gP)JAcU0m*!I2I+1*i+gN#_s{QKs6 zZ{EB&@|AZ!s!~GdsX-rp_2e*Dq$*6aR3{|=wn_+@QL^v^tw>As*E^37XC;%_rN~4B zllPp!7o_jMesOn3GK)#!2MOu#z5niLl1WU;015wqG)qs?pFaQNi;QHJGa`qI2WaqgfVxCY$+%6B`EFBLkGEzT$xwnRifJf{Bu|c}a#n{7L8w$^Dtnh*EbFV$@kUQZPV>W#W{^3kAlM!uY(Gcm*$*1ENx501 
z1wrC_<*duBS#AOgiy?XuTf)vOp_gy681MmJN5acX6_h0VUue1yR?lHGCWZ6}SKrOiQ-pn~F}19-2iiO| z>7ro_f{yKMInY)%?=bs@7lhqG-^kSLWxGO*Gp|^L+rACWxIOZQb};zF?s-GUb%u_y zukBda1XkS&hry^lv=%MiZJ|}ax?^FF!mF+evM__5@;Y7sLVx7@4)mQ>hn3!<6+^B1 z&hGAFvY0cH<=~~;?@Yfsz7)~`AZ~B^_38bQ5>T6-Fmd6e5Ft;`kuRwf6RFcvChxVB z6K)OyJiI9nMZN$f$W(`OZI>9AMKzu61tn2_rK(-6+11kN)~M^dCAYOyU8z+o7u@C6 t`P!1#UcOMPxGU!?ZP)Rt=c?XPWyN({&KYwLegaJ|9c<9|zp(tH2_ diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00002-a34b39da-a6ce-4459-8ea5-ddd162a84e94.c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00002-a34b39da-a6ce-4459-8ea5-ddd162a84e94.c000.snappy.parquet deleted file mode 100644 index cbff95901324050e4cfe4a48f5c363ed1e3e2287..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2168 zcma);O^72!6vwMOJ(=E^9cFc^=$0^s#tFMJHVK*8oeg0aS6G%MhSjUc&`GzMU?)A( zohZvNu&jt6d)U*UgBQi4xuurB@BtOIK!T|J-WG@ z#K2UuWZtRhaJS}qUWXHcP^rpPc7yB&nWszaD~;Nu z(k{`WAo0C&)-~3uw3|F5cmpKN4&IkJ-y-L+MqeP6BP9zdJ)<j67W$xM}fJVqZ6=IAD=OrZLS@G*U7+)a~h zfk_i`8NLAKzcT8!4yfCCYLUt*pb7#tb=B#Ko&qV&{;Y=qCpD3&vX>9h%V&8E_!u7| z;pJc1lOFbQ4h|bjDQ@wwvysf5AorI%9`-FQ%Y&)Bm*BM5u8v+F=f*}S^!x4kgx2Sa17Zm-)_VqEx-MYQMJP>6eDZ)AtT zJNCdEIj%Euj4!o)3k$$%SkWjPcSqK$#Vakc8n5qLSf4P}cR?0rP)}aZ3qcr;ecyqa zvl_6{TeT9X)!08cSWOpmM!Fol^oPCKS0@ib768QU&AvW+J2C=lvlFHsn+zi4**Wq9 zm0@CacFOddmT{t;AjHd?@lxc+PlC*JId2>gqF)=5^OE*Q@TvrE1r8yxPT@w^rS7-Hvm?yaUhw|9;8Kc|vZ$e~y0u D6@kuz diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00002-fce8caa1-4b6f-47af-a2df-a50985051257-c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00002-fce8caa1-4b6f-47af-a2df-a50985051257-c000.snappy.parquet deleted file mode 100644 index 5bad38065767d8aacaa058f0c3d49407e0950088..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1958 zcma)-Pe>F|9LL|x?&@TlW%(X6u+%c-qF}==yPJQIum};^7V8=zv(8JGW_NXV)({m! 
zh#)+4>Li4RC_=)6hYo^NM9`^DNe2lJU6SzJA^N?UH-AitmF3NQ-}$}I@AuyC_Z!Er z-pXKvJF$${zdc(M5?F*~3ziUSyC@=rB0>a4&=NR_mw&8151X{-T$3o6T-WC)Cv9(U zZk|4dWZ&>fA76f5i-k-)663KyAhqMO`2FMWPr_!cvg=Ub0loe5dG;8}g@-?HzBzwl zn8)+s(d$1aLuTH|1aPEuh>9F#XXFnoD#Vl7nLu`q7LwUHKz5E3$dVVkBKy-2Ob`@V z5JH0FMG5xaH1s;^L?=5BE+>eDM<6()BuEh#`D|72#XZ4SH$*(l4Gu(H;M1*$YYV`Y z(Rh7Ex9&~UC#Pve$L$DDhXvV zB|*BBe2WD43=v_Gh~yd>Nl8O)uH2EJZYOi@3T_4R4Y2}brv-PC?&ENJ95+K`7)*wsIN=9i=NU$=Gt{RT zR#EBzP;DG_D60->loY@M0`|%|n6ZG5s$nnhn3sJg;H^k-&HogNf7x3RxNO!U?#M1X zScSPejQgquN`dS~h?tVd??@-KP_Hc4tSL>O(V_k|rxmn0YSqiNsvOH|c`c3Po+B3Z zSyO|qnQc&8vugLX3bl1Zw{`hfWI?s(XQ(=;)@`dXX{)`e(-w7g@b-ernh9Q&4UoAR z6hED!76{dbY3k4*>LBc-y{ZRQ2N#x>di~7~!`}{G%GIgncYEC%1ORc<=JyAU9uSz; zJmJunA%JkXc~1Ci5@1}ldCK7rP{65A*DRL2|0d70siHIh diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00003-7920983b-c6d6-4703-a93f-c1bf180e7008-c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00003-7920983b-c6d6-4703-a93f-c1bf180e7008-c000.snappy.parquet deleted file mode 100644 index 06067bbfef819108d5fc05dff5330ba513e798b8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1958 zcma)-O=uHA6vt;bNxQ}vzdFM%1PzM~Ez~6;sn$@WdJxeV)T>C@WXCkPN!rauky;Qx z5K0do#8WSxgrd@e2M+~N5y6ul1igs#sz{IGL4312yF0Za3Crxvf8YGxn|W_0XOEpP zV1$RTiXXoHd}~Jr%P<_k3PQ;PGD0XO#i0c)gR^+$$>NQaV3ICHCZ|o8a?*O5;8F_2Soi3Bhbo^&Lt)pqC%s&k3Ic%Kkgw&OP6^ zZHmY9=I+zq+Xb^A9tj+2yJby{vo-P;mNgQ{Y)v40OA}AHl9bz9CkCd>X(*r)xHCF`|%dForRx}b>-x+JwV=YTjSmAbr!7WQbWJTe(j2MkX;*CK}$QO^e4#YSQ zPuR;a$aP?b)fDxB{SQ_2@qqbQO5md?w<^QNa37n~#=)2qAsnC=~mu3o+{ud~AhpPClP z+zg7JHmCzayJy=b91y(>GwGNfK=tzc!opZM*<*y$!ArB<=zn%_dZPd!ZrcC+=%7ag zru9$QbTdQ{F89w7Z<7e)tNl|pUqBJ3JJWGk^8TBYiwj6S#p~c;%gOS>$rg-4(I{j` zYQ3gyWvyDNFi|Yzk67ci!^IM~5RgRZ82N zwop8Xc$D4*K@sUeJQbxBJk~$J(nD`T!Ct(G_+~QMOl^@}mdwm|-hAH6ym^!Ht9Nr4 z;cl$qm#;T}CDT}jVH;Kv>NqDOgc4E`o}l$$18?jtFC_$%lS8q|^|I&Ban#E9H=o-D zy9m{|%}{Ba!5@EaeUb%}&i)XkfH(RBQYSu*H&%9@i$S$(p_v5p6!YcV*6b0Kvrm8A 
z`P6f4jK{ON{Ql2z!7PeL0!Mn6tZNE=N#0;tCtOalm(sPc6($t=CScEAD7IIl=yk2x z6;?@-u1S(0s8O6?A9aOkg~g|_hRbQ9;u{bi($l1f$9Zp6YKprOE^>8E#$&v@FXIC5 zt|vlU06x2!s?FHJ!-?ADv|F}uC&F^CtjV5(a0)s~#V)(OsF0@lpfhU83+mI3J=dEm z_*7It?C`6Vsfh=k>%%ex!iiF~>brI&(mFs(In_qVcO!|yI%{Ry_q|Fz6nis;cV|v@ z0hP#lT3;|#NSHp*roui$L|7&=xkhf7>S&m)@JY~lmb-8Sy8;=9SOK!#hP%lV1zX50 zrg4gs*efwsfsC8QQJz}f04+>4AB((w%3D+LC1kEjacIpgX$H6|L4@>Rgijh_JQ9gZ z?U;~V7U3=s6&|9npNx5n6wO^=HcwRC1NITB=;Iaj@gj+@Aaf1M4RId_Q^eeik})tD zg#!wI0JBy^tx?og5*x_e2UG_~9mp&r-6RFDfPkYT9rtL!hgH$bSL$V-1-wqL8t&zv z!_wUB^#p7-EtBv`H#;7Q+-=JJ&<4kW>?DY6s^oW~8_K9p)2hLgWzX190~RX>>yaDO zylO>D=B)uMi?j=eENZj91(h?~aO+l3eQcH8y6xC?TiZ*_8}+#v*BCKs^`J3XH~I`# z8Z~3&-n>Bz3Z^^L?9PvhpF`?Qz<=`b0b86GofF|$1Nx8p>v|Icf zTr39}esD55E0?!&ne(NF=Q|mv)Sny5=d#1jKxr`F?@kU3=d;exV0O~6-Q4Az+n*hB WoRWP(Ux1tT@E^T}uhTeusQ&^3tftNY diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00005-9443cba0-84c4-4dc4-be61-76396629a546-c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00005-9443cba0-84c4-4dc4-be61-76396629a546-c000.snappy.parquet deleted file mode 100644 index f41d4ef4baf8a12f267daab98fc7f6f80554a6c1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1958 zcma)-O=uHA6vt;bA6;V%Ry)Hk1PzM`EyN`uiETrXs!&Rap?Vc5o9vn{ZjyGBD5Wik zAVO)ulLzTtks>{M=)n&J@uJ>(s3OIKV3A(Dcu?Q$cWjY_Fgx>~H@|r^@4emW3s%uih$GW8fovYlC)0C)Y#zsxImdT_ce_3q!w5XX z_yo@J66~F>Z#rmt$D0pZjuQ?~fOANQlQJ&by>-SFcNtgRWbwFdu+QR>-Q5aUwg6n6 zjJD=vwOTKqagY7nV{S zgi9!dIk;!=2(yGG7s*758?$0%wFC{Ck+Vi{D-dt+6(HMTJV5RSv4m3PIF8yR^GGIZ zij-4jVA050EoFHlKWmS&Hci$#R!fQA$WOe}hY9&?5_f?ZwB7UF zrR(G_Fx_nOdcZzJ_3!bJ?y(xe=TT~tg^RKG=njv$%@`R6lQBq6_ygE{`cWGc^)7@( zl-dVW#71>R)g(=l5?Fx4PC9!%>hM9;zsoDS%bw}*R=~UFe-g#L?5zM?HZ2jWrI+oE z{M>EIeHw;TAfE$-O>yK`U;s)e&GSv8CdqR$E|fJJLw zm!N9qJ8D}pnzyC8+Ljf$E%QGDi(-3WP8Ew{t8H|uZ80sHWziCg*A_)uOfaP>Alqh; z{Ax`#KxlMyU4{Y?i?C8niw;yQE-o*p-Nh!uT@Jfwjau)ko$B=*0Ke7V*L#KT5vbNX zq3g}yL2S8qj{j}qVJx+GO7{!MhPnpBYhiF=4y7xrP$7*2EUoYHevZ 
z#W=pT6ig32_0WUKF^AHC38hdl4IwulQph#I7#|8LIhbO6XiMLWMyqig!Y)D@{pZc^ zy?Jk*cH#Q1Dn@t|>-gW}zaM^B!YaJxv4&9LqKXj8C_``ut&A_?-=4i%lPu2FV~adk zd`g4Biv()(<-^~HB*O&N`;4PX0NefR_{V}|GCYy80LMHI4bMS<$S9E}UJ#=# zr7s>RxGB_a70-&{p^6(~xRZ%(0r=f~FCRn``mhXvaJJRy_|yp!tpHll?etncO(f=K zIBGk-?*-jh94j||I)82is6;hN#(hhJk{OeE4U!2FVU?)l2AQ+8dR(vgPS7P@z4#5X z0yT$N0kWURqvY!>wvp8=;i4e%Epb+ZELhEYQrqMSJW=gLk(oz~xt+yR$l6lU@cL=# z2f3v{lAH_4z9GrvN$;9s$H$w)$*lQ&-2}~+nT6O*Q%9k?qus~ZGtXNUaM8y j>8q7R*P+#^DxIiIyKc+5Y}|*h?dc0W$|1A>f7ky5OlPXN diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00007-e6053215-0e6f-4891-8dbc-812b7274a4e5-c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00007-e6053215-0e6f-4891-8dbc-812b7274a4e5-c000.snappy.parquet deleted file mode 100644 index 071189c3bc74eabad3f6d9104d95391e2dfafae6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1972 zcma)-O=uHA6vt;bn{=(Q_|X}5A!u0Ir9xd2(r_1uj=S+x01DefqlgJt;Uw zti|gAD+9DgPd>j)3oboFA*%~m><>si_%wd^@XKS7)TM=P5{xs<+ue@~|Dc>(`+oEF z*<+JDpZDuqzmE%UQ9cq(q@R*?O<}9#FD&bX%c*cxy2cG*NMUP%h+s)^gc{|qYi(1A zk|bS|BtcN4Ou;`gg<&PgPzzpM&JYz}g#?kFA!R(xtxc&d?n}7L)omG1a`QmOC2sB{ zLSF!GTuV3RZU5e##>^b8+PDW{xnI^~*FiW9hElVuv>%l+tRiegt+;-3&as#J(jgW$mBA)YO2MsUg49VfuMTf7wihu9C8K7UJCb;2MV^3S-XiKXLw!nO1DWhD=;o+H)eT73NC`Y3;pkY$7EAc3D$eqO&GIWqcqb8W^Ir$1 zz1cen*lgA+;gfB4{3UYt8242Qjs*FdAhM~F--%vmr9n-r`?HokZ$lqgK}%SR)Ni=; znwBhB!&VMy=Z|%ainrRw?M$jG& zW8&7b!I}z&TnA+C22DU`sSiS}<#{$ViZKC^bkK;P#>DdK>R>xKV6?+w7PmIrd34mp zaRQJx?L0ni_?W=7&IyNZh#11<&NOL#pe~WT^6=_ra99*dS zS$=S`c`IMA^4Wn(%k`YBQyI#S74o@pXSgz27@{-7d*)1lm$<5cYP S`U>2(M}O#L5}_OL|N0N|aH<3V diff --git a/kernel/tests/data/cdf-table-non-partitioned/part-00008-6fdf199c-d268-4ab2-9288-5a6503bac41b-c000.snappy.parquet b/kernel/tests/data/cdf-table-non-partitioned/part-00008-6fdf199c-d268-4ab2-9288-5a6503bac41b-c000.snappy.parquet deleted file mode 100644 index f285aedb14ccaad38f75eda3b3abeb273d1c641e..0000000000000000000000000000000000000000 GIT binary patch 
literal 0 HcmV?d00001 literal 1951 zcma)-O=uHA6vtu--ag(&0L@8~- zOBKXJ&%OGA2gzBA7r*Y>gNS-4B3^{xQ9X!nX1}H?lCaFq{`bxAy?O7=W^VRk9wR(} z75w?bxA*A`7U404WrTW*o<4pQ;zKH}Tvz8fv-hSHnn&hlP z(Dm~Sl)+j2=>DVUGH231=7Yd3`vIvRAHpy0>~8X;eTr`;zOoOe5_F5f4T~xXWC^yUDs<$;1lM|!Y)gp5T>+N^wJqYAVA>HQ+il6W z1>oJY>Bf@jUAxklU$mUUzM?7HYkqrLB6a;@Y7^-SjuArt3PjrY{axc#g$KyZ1U8Uf&fs)F zVq0RYv!tY#i>$D001_=SoAaD6l+#Y&<4E5U;*g?LM1gGy5E*RI5Lhl>w7#P=Wui?~dO0r;# z8#$zm{AJNtb`3Z&%PqTUc=a1b)oz-W*))}ow5l~%mTYZWYc##qd{Z0M*pX;x)0b8? zdXV6iV}Tr)LF3yCwg*D3<+>&u4s9A%+M`+s)uvb1*GHqpj1esdFHUWt`_gWR<#nW-3;N(h)yoEr zlg%6Xf|1V-S6Ys1Wv$9sezK6yOD>w5P z;VvxWjc;FmCR131VF#8Fik}w|LJ=VjEodp6#?Mw?uSB_!WUm_%g^*F(DSM>PJ1e^} zF3cy*Uk^z82j4e3xe%Z9`}d>Cdsiz0kdpW`e)st6Do^T^-GuZ#HT34w$GLw{&aLd; z*f@J^#Pjq1;j4q=T$t}i0NctbQIVr`jot%`3h`ySCXm_EIGL^kWVW18mVEDu9CSID zASkjRaDwE!gm}Npy^b<>GJAM(ib(hxm_tg6lyJ%GtqFm+BLwQYh)2ADA0jS#-OY#_ z3&1C1iN>t%+?i@rX3Uz7lL(95qAXSogcHz-*7cg%jfyGA$KFWgs?(e?^o8z3(V}7$ zqT-fapPss1H7%HiKsZshZOhc_z7_{9Vc4y*W%?3LlWf#<%c|C!uGpO}zC3gC6i|t% zq?DzM1jnWHbx5$!;0TLEB%|b7MjCd{%54eiVJGK~U{~P0!7D(vI&c?x5XBnGlu|h1 zk=T+LYn+raC1BB$bz8~`M?7nfvep@EO$coT+Yy*s6M#sI!fh!r9Pz~aotThqM%)2n z)U(I!Wngj#m_ap#Jz)Ps<#RlxIiAGuWt3SLVPm`;hmVJ)873nTG7QBDKLE2Qk9tW_ zA7WTVnIC|Pd#FRF%1C3R2oYe|FXymF4IfqcEN^L+J!W__5^nQfg%WJ`W&}2yUJ9ac2zs9bDHg7tOs$G|31+7oZA^F0Q zh{l|y!L6BVnN7{H?`k!(sT+Dzm-i!!YI9-MREN|?(`i+jYF1^}qM;7mT2$#}f>%`o zWG@Vg-<&ob5b7<<(&2)rLon0Ksyvl^oiY&4$W$`LnCY@m6{3+3-`-s+ z2Eni_5f!1^_c21KpcG&P%^+2B@$2&are&*{T8dzK!p;TQnxsv>{C)fJ&IaioPi@;^ zn@u^3MSnVpVsFewLcfc3ttj`%UIgnD7KtpGASdXxyq#lcIVqzUR3(RiL%KnGq?gT> z3K_~Nn2w*8qbUnt^`pV57&B4=aR+PIp~;l(3f2uodjTE$OQnC53iP6;X_Ihdvnhk3+e!WHa}h?~B+za(?3P_e+Fs5Po%uGj>TD)r zJDj|>$0BBMmzeflnVYg3X2*=;a5jj|nmP1@&m*(*d~V7P!Y7Y|oY}k}79$aY5X^j^ zxjq;mQq;^9)a=X`i`sQ?!nh8HUA$m)^XSbhvrgh>lot!69c9OUFb=C(ovM!QG;F83 z>&`r%SGn7C+6||Ez+3KqqbUZhgGQaV_v-`Bg!9l5&3c=2m+k3YxXI~*{=uy~f&c#s DSH1eV diff --git 
a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-22/cdc-00001-308c0cab-92b2-41e1-90bd-9416b10ba6a6.c000.snappy.parquet b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-22/cdc-00001-308c0cab-92b2-41e1-90bd-9416b10ba6a6.c000.snappy.parquet deleted file mode 100644 index 9420cd532b0856f38fdc2c35142f7443bd5c4c01..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1028 zcma)*L5tHs6vrpiT7p(tp)(|sLkVu!g&G>#y4})?9u`@bvUn9Olg>07Ow(?XDpCrf zH@|?FUc3qmegN+tymTQe=Dk)_cK{H65d^mgis=8sh(Oz2~vP6KbO}gaEpLb{1Hf*F?oWxa#ciZ0^`rkLKR2bS8;_Lq$m?WDWSR!|r8400kwKb1(0zQ)8x*1#auoRk)j zvJUCM8URanBO4`az1n1bm8_2`Yo)AV{}rD?T{^Z_O262^q5|0pZ7#YttN2+Wl>Wfv zDWbm=iY=A?R%&n-4NaRyp3PQCen5bj`%dN;iyjMBC(_C z3wt6G26u^R-<5?aPs8k)aT3i(iPO1b- z4|tusZKvCEn)|%t?zP%t)Y)$}d3Ucl;!HSq9no%fId|Ev-iMov1!P~9!ilAidWGvo0+7Gn@zeOiiCjR zj}UwF2k22ede*bxMQ`5x0YW|a?nm2|y%fTmcb<9PXWp+pdhp05h}4Nf-hO)h?Yc%V z9IHe|=*C@)5UNOt1#E-dA)nv<{e=sZTUJJ?0A+vf0IN;9A)EF)0U(Ll#=$Oeo^O3qVoZhd&oGIE`|O}JQZ~41Q`8t^U4UkRcdjWRm|dNg;4q( zlc$LOmMAu4`bVn69GZ$U52hwtFle}$_TGFhg3z7&%C(l+F)gHQml)xRXTq3HViB3a z{DnCa5rZva$`5I&CzH^7dKd-qIMSQ?#1$S7_1@#9o=gv}Tn=zfbNxt6MF4;wdmeK< zAV8#O>gS?*Z@F4EFM?Bzi*R_w^`}>lF0RXE5?51svO>zkyzHG>(8&ALuuZ#V+KnwI zc0JzUPTTIbY-^8qoZVJij5~WRi+6XeF=xWQZ;Q6o<=kQ0Y9F>Z`=K+~xFh)R{{hanzyiNsR(iU0q-Z-iYr2BGpkS~kLiAIP6zMBW7;@ScDP*K1 zfrNEP+fEx;sut;}Sc}y%>#JhDPgpbP)*SX<@(J{W3TLLZ*6Oz=z+S60Uh^zx@H0m! 
z{f^0FM1N}(J39TN6`>Dh!x)Br#z!0q&XU@*a~Ve7FfguFS)Emov9sog&U^-KIvdNF zg~JzYAY%@fh#Nn&iLE+eH|;16$GzAt+s{1di^y(1nb@j(@W~S(XEra0WnYFM1Y_Ul zT^|e(Da-a8YBwj-Y56iZVO)m8E?&^Tdi3&jStM~a%F`(_4zpw58HA;*P9>MQRpyqq zyJOE6CDE<9jjCJO7xnI5wI+M@{c1%t_9{KWrMv6OTBRXGm+zP@xW(y%{=$trh9CbQ Dzc=>Y diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-23/cdc-00001-985fd824-b34a-4f3e-b7e4-90bf8d04898e.c000.snappy.parquet b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-23/cdc-00001-985fd824-b34a-4f3e-b7e4-90bf8d04898e.c000.snappy.parquet deleted file mode 100644 index fe6fc81be2e26c6bb3119d40edd220d7c6c2ba60..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1021 zcma)*L5tHs6vrpiT7py;p)(|qLkVu!g*r60-R+iM^svafl*OYW-Dzj+hE3CMk}6UP z3*P(;y?7NA@!-vaC&6zZsGqh?Iyy-hBQ3 z>V`!yEDJrN~|cn-ms_tdJll;nv$7VVHuHk)fi zs418Z9~7f84^O*M|3nTsDS)_%4eSX-iZICsd?+{3p{49fD0cfNJQ&Ds{BnHpfnw7# z+l~%xF&juPm@1xNis^m2@96Dxcj-vcR{#V|(UfblVhhO?yi?70jhZxk5UYOKMk$k<+UL}xyOV>+A2n1!Pk zY$#(6mxvobwW+O6!fx4798UVNU9q2e(if54dNQ@u>A@#YfSlR9AeI9ef)Gr6pZ9z) zK%}hL7f`!3oy{tj!3pCs9I|-9;Of!K*JV+})hJJA$T-NheP?*v}wr29!ilAibqA#*~~Ou+-%bQP$cB! 
zudp}o9*QR~f`WfR@#4{oKR_=czWdR3O)iD-=ACDr_nG%=4<01?_@25u#@$Ql)<-Vj`ojkxH(VCXj0F z)1J}qrVU9AC?(BF{i4q2MCUCgcad=_T?+k|cq({EIWW3Qy;}m{rN#a(af%syD-lY+ zV)7W#pAyA}On*sLm_c1pCc(&LQwGg8)4rR}MG(3ZU%A#WTc(AS%_T;7;+Zg_lUPJ% zFnM8)MZ{oznDSkk>B$uIjvhuqJdE_Ze&h;|hkED9Oiv~US1t!Qr@4M4Mj`;fk3Ek$ z9uOc>)b(>wy)&E7>leYP#zi>1;`*bjM;BM+GKs6HJe?!uVP5vVaZt4U4z8ts!T^-m*p0YIE+eO|=Ufoc+*m*tP@s;{O4P C)%2GD diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00000-4beb5c26-e34a-470a-a62e-2ecc8dc24035.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00000-4beb5c26-e34a-470a-a62e-2ecc8dc24035.c000.snappy.parquet.crc deleted file mode 100644 index 918300ab2b1e7ea3b4e42473bcb3a7cc133188b7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20 ccmYc;N@ieSU}8|<{+N7bgUep~*NZj+06bv_v;Y7A diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00001-a5f1d5a2-e308-406f-af76-3b32bab79832.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00001-a5f1d5a2-e308-406f-af76-3b32bab79832.c000.snappy.parquet.crc deleted file mode 100644 index e8e5b3d53e230762c4b02bf23f74db4f35f928ad..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20 bcmYc;N@ieSU}8x0_R28K=4Nwp;#dj*Edc~~ diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00002-ddca9e04-03ef-4533-a9c8-05c1d4f79d6a.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/.cdc-00002-ddca9e04-03ef-4533-a9c8-05c1d4f79d6a.c000.snappy.parquet.crc deleted file mode 100644 index 7a2a49872da09d39de424c2608896318f644853c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16 XcmYc;N@ieSU}Dhzuy-~a$8JXeBd`Rx diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/cdc-00000-4beb5c26-e34a-470a-a62e-2ecc8dc24035.c000.snappy.parquet 
b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/cdc-00000-4beb5c26-e34a-470a-a62e-2ecc8dc24035.c000.snappy.parquet deleted file mode 100644 index 4298341a2afb6d6fe625be454d762f3f5d47d467..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1034 zcma)6O^ee&7@kbmZjee9Izs|El&~9ip@znOY`5g1$Rbio@u*05+L`SJ)3lqUD@*Cc zAF^kUi&qi!;6L!HNAcvzg9w6eKH81#Wg*PG^UU);@7Ivt!)F#jq(l_*_xG>2MU7xM z=825Z?H!B|%1PU>0#+tFs5@f*p=X0gNq=jm0kNXc0XE3jN_J^G2c{UQCx46swwQ z8#2_aHsn0i0{#S3OdruBL$;F&la->qxZdOhc|p@4CPMUMj+E#dOiW~SHBv~G;sj!? z4(%CTd)b!d4ti;$mPl6fOQd;)NfQ~D(zWP+Ma1IAlmo-w=;H&UGe@YY^GW{c!U>+4knfN1ZHq0tg_BR;8nQOwh&SoMs z{mE-{EJ6kkiYZ^ExgJeKZ|gzm&xWC1(N7%V@<4CDnCsCL;mY9vr!>b4#Yp%7cr({! z0~ZM3DJuF>RBz80i^|G5)>yei6vrFg+&lYhu9LW#%F6{(o~C8rAN%FBPi4!rYNl1* z8_XP+m-(P>wQ5%NkT(VgwYnHK4r^83I;ajg6IRm_^=gar0ozw?c+JU&ayf*0@E7<8 D^o;$F diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/cdc-00001-a5f1d5a2-e308-406f-af76-3b32bab79832.c000.snappy.parquet b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-24/cdc-00001-a5f1d5a2-e308-406f-af76-3b32bab79832.c000.snappy.parquet deleted file mode 100644 index 3f6d0df77d0b4ad352512302f91f68e13f7ed889..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1028 zcma)5&x_MQ6rN1CB}kp{f3|9}_qyjQ`CzWnGmwwHx4Gw+-4eecJ-^dCI62qI;okoO$#KKb(b^5g9cPd!d}C6Jm; zISa*L>I>UtW1-%`sC23PuFyW6kBBxhSoRDC5c^dUVmtgaw%+FrPBw#IIYQ}A zOr9ZnB~fh1^p8}Ax2P)0#2=e%%Ao0H+<)_x@B@3|DfznDG;2uN-CzV~jtNh7HWQ)g zPhOj@2pNnLQ+`NuJ$enjtp}k$8-;pRe`yPc2YUPYT#sH5u51o)O0&ICjD-(?H**{| zbbtV!qN*=N_4a(Rs9rnA8rSX+#rDQG_s)*YDv6t^yjURRNm}-O*RP~~s#vB~H?7Lf zaArHa!iNp3Rkvz~yg59mH^iuUSg-NcL2bmDu=Xv{sI@pBvR$E9sY!J7wf;>CYp_2j{We}Le{mmj;W?PVcMX1@8p&%F0$a(wTR0|ZK-!I!tc z&aWFlaajPBkQ=utAta|1a74|3GJJdU^L%5;vQbYhd9*y>frqxcunV6*z5j4?$)=yA zwoSB6#)3z(I}W5fVb%u*Xj*%!Q%8u=$u{!|kN^_6r$=?#S1!)$E2{CECuKBRunLJf^dW zjO<|a(jLl)<0azScV%kECtY(1Hp@#*0wSD>8PTtAWn8KB@# 
zJdgK0G+?Bxm={p9HJ!~Wm%$0+G8}ht{lV3vm#@nriK|hb&WLuHm3?Ozl(RmS9ownd zPIG$FQ diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00000-e8760032-5a99-4d37-9739-fc9d4db24308.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00000-e8760032-5a99-4d37-9739-fc9d4db24308.c000.snappy.parquet.crc deleted file mode 100644 index ea2c2b42234071eea2f00c6bda9ef050c478d8e0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20 bcmYc;N@ieSU}E_C<<1ok;a-uMj*G(pNR|hq diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00000-ed223ebe-3b27-44af-b2cf-91e882f4c500.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00000-ed223ebe-3b27-44af-b2cf-91e882f4c500.c000.snappy.parquet.crc deleted file mode 100644 index 60e5f4b8ba53c3e6ee5a057068461b51adbc033c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16 XcmYc;N@ieSU}Bio*&5lI!0iD5BUl7< diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00001-1aa06a1f-c45f-4227-b0ac-e70b1e2115b1.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00001-1aa06a1f-c45f-4227-b0ac-e70b1e2115b1.c000.snappy.parquet.crc deleted file mode 100644 index 75deae3c0d39cb59dd46a8ddd5e0faed4ed7f7c2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20 bcmYc;N@ieSU}D(TyU3TM diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00002-97dc4c5b-3806-4198-99ed-062c0a337c29.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/.cdc-00002-97dc4c5b-3806-4198-99ed-062c0a337c29.c000.snappy.parquet.crc deleted file mode 100644 index 9acb3a1d1b2864693dc208c112bd1bb19379bea7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20 bcmYc;N@ieSU}A_~a$v50fze$jCyu26JT(S( diff --git 
a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00000-e8760032-5a99-4d37-9739-fc9d4db24308.c000.snappy.parquet b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00000-e8760032-5a99-4d37-9739-fc9d4db24308.c000.snappy.parquet deleted file mode 100644 index 0169eb7b25f12b0dc95de5372959343a4a62c560..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1041 zcma)6O^ee&7@kbmZqR}iIzu8kl&~9ip@xRGyWJ%hMHZ1#idWGx=}fy})3lqU%2Ims zKX}`-7cX8t>&1T{=-HEa@F1w*n~!!Qxh#a4cb<9P=lvQoxc|f=h}4Ke{{H&;=C(#K z9Lq#T=;jVa2$iI5SOKe(9rFIe*NYsZYNw1cFdhoucSGQ{Ntb;3{rc^#9PutsiB%vj z=A1=hIG=@)J7p80-odJ}6$i;Sf>jEK7%r!(8zXmBE$Ao(D5GmsCMO_?RE_q@AhRwd zGLB6k^&VBiISbB);pkLM87Tv}ixup0L@Hox`D`k7QBRBMF;E$fPMJRu!{}nZzMxpu zRNIiDY_%hop&IZfm}2^Xo*1&7)|jpo?IjhbC&&x>1~Cz$A4{Z0UtwY*qpy)lrj#U* zX!Yp8=-cbYtas4MTD3;n+a diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00000-ed223ebe-3b27-44af-b2cf-91e882f4c500.c000.snappy.parquet b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00000-ed223ebe-3b27-44af-b2cf-91e882f4c500.c000.snappy.parquet deleted file mode 100644 index ff7097554e42a7a8c015442c56cc6df254c343b7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 971 zcma)5O>5gQ7?z#RGbOMMB4L9M!L+7@OvE^8(~!%?C}V^WavdWmk(Qb|PHHOL*Ua~7QsxT81A8Jw-jr~K{8TVSPS3*STq!+3s?{ZZr`IV=C}z;hZ7Gm# z&9&5AtM~{xYvo$>MiZ%cj|_kT>Y07TEY)u!+v@L2n2=9WoqSP<>geQ$QUR}PhB1pK zl+78KN>hL7D-p&1EHv(Tv`Op8IQ)kZUj!7c;$k5Z8qHqQsYn=S2{XPbOIx~ax9vEI z7URUO*)M$&@YrrYU)u79Amwv_bDAF}Vj>~{!bK3UQ2+#}6g9gLwcE?psgDIE|{gPgR$?9(Ah+qlF*vDjzl6mgm+_cyn~@HN?1i j;?;TUxIX4gxJRyN)LWd7*rC~hmzsU(JG`ns{KkI)8^hyO diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00001-1aa06a1f-c45f-4227-b0ac-e70b1e2115b1.c000.snappy.parquet b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00001-1aa06a1f-c45f-4227-b0ac-e70b1e2115b1.c000.snappy.parquet deleted file mode 100644 index 
9d860c43c2d88334ffbfc4f9f85757b6bfa57471..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1035 zcma)6O^ee&7@kbFOVENVbcRHNl;DP4sG+f~+bz8)xQLWeyo#1-XW9*$q}?P{meQN( z9}v9l&7&88g5t@$H*cOih&K;<@XbfJkz5wS%)HM$@AG~Q8Qgnp6GSRRC2!t+{$0`u z26LMz2wk~_5kj(5f)%iv#2}x3zJ4X=DA(E>$^*}F&w$q?ZSv*M+xJ&9kJ<+38#& zllTav-otV@XTixZ93Kmhk!=8Xv5FmzNEr@WKJ&yb>gh2(2Fkk20yymzY?{?CYeQDJ2Oc zT0J^2``xT9*#>H9n=(7E_chl0fXN+Xo=Y3S{|ZQiM|1*=Zn23)1mYE1U1a66_#qQY zzhm+Y(O(k9rb7Qp6_`g&RcFD}Vsi$Kw~`rH&qWYAGhZz=td>VAO{p1Kx{>vSO^ zE112oJP|Q?QB3_NEsc00M#l)FU@?x2nlW*N%R{5{WNE}xge!*woY5RV5>pWX;4fU4 zja(prr>GfgQKPe5t!fv}iN=LHL~;D-<-POIW|PF_RGzMo`Y0>=wii^hK2>eYZdi78 zZ?te+Uge{v-EP?RL*5!4G@4@EI&9Q=`=CDNOxSmA(X6*QAF+L{3$Ho-kSrrKfZxDB DVWa)g diff --git a/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00002-97dc4c5b-3806-4198-99ed-062c0a337c29.c000.snappy.parquet b/kernel/tests/data/cdf-table/_change_data/birthday=2023-12-29/cdc-00002-97dc4c5b-3806-4198-99ed-062c0a337c29.c000.snappy.parquet deleted file mode 100644 index f126fc421b7c282ce0041194faa6c464480935f0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1028 zcma)*&x_MQ6vrpiT7#Blh0c&j4kfr@7wXW^*6o&FR9IwL%HmbDOghtU*fi}XsUoEp z@16zezaaPr_y-6c^bc4)?ZMOj0qa3ue(biA%R-pUeDi&udGF2S=<#!#AW|V3`TFtq z*=>VhSeA&2(47YuAyiOGFoI@~D*68Y{A^>*ve8T}MX)^Ofd{rGX_Ie%K76{nX1gy^ z+a}m%Q^6zIpH9Na9rKaY?_pgl#(QKhf^`auM2?vtC+L;DnPX@0EQy!f3!@;o}b5a6v3v1XFh?HSc^!ZqBp+h6K$3nS3IOhIH_M^Ac zs}B_GhTgSQC|Pf*C8z}A38t7nq5GEFP3ucXinfz#(-q_eJ%c?FqF)N6Lf>J+kkvCt zIU^+rB&-A4vwFaayOEADYqeTu{fJqg6V_5$!Tw7=fjV?(EtPJ*fq4aX%e6W0TF>IV zKq&o%$ty%J6pAgCUMdwhi<+iQf)V3W4ozoC|JiF9gzm)GZZ%ko)seQ7bA+cJgQGf~ z$%qA$H*72;4tI%bKb5%|pN82n!zh>yBC}==UFnI??7Wzp@d?5wSAd+^TtAW{8Gzu= zJdZmb7$8#C%qytbnJ*T#>)?cO9S*y={^;h>>sMx##LXx#7f3tE%6>Qws#%|^HnSVd zu5LRs*ArFYH0^f7uJ4JKv)gFOL2Iv37wz5pKyYb4vSqX07Q*2>dKYeT`k=pX>yF_6 F{|6~W`@H}F diff --git a/kernel/tests/data/cdf-table/_delta_log/.00000000000000000000.json.crc 
b/kernel/tests/data/cdf-table/_delta_log/.00000000000000000000.json.crc deleted file mode 100644 index 966584a04c4328886ab83c78df27572a9006e49d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 44 zcmV+{0Mq|ta$^7h00IDUk%Z=vu(=k31Mq)Kxwuo5lbU3kNk9|X9#G3K!g{%EbSK1$ CdlMG` diff --git a/kernel/tests/data/cdf-table/_delta_log/.00000000000000000001.json.crc b/kernel/tests/data/cdf-table/_delta_log/.00000000000000000001.json.crc deleted file mode 100644 index 557f7ea9da9b9895baf1fbd4db3b402667c4e415..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 40 wcmYc;N@ieSU}8wSJ}ctDoL>TlPIdBetu57WuC8cdeqS?(DXgfz=iTe~06g;&RsaA1 diff --git a/kernel/tests/data/cdf-table/_delta_log/.00000000000000000002.json.crc b/kernel/tests/data/cdf-table/_delta_log/.00000000000000000002.json.crc deleted file mode 100644 index 0ae38ad3321724a2861ae841d233f3ae0d674ccc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 40 ycmV+@0N4Lxa$^7h00ICkOw`gMt}PU))ua)m%SKJdhWehtli`3YjIJ3LZb0}q)e&?6 diff --git a/kernel/tests/data/cdf-table/_delta_log/.00000000000000000003.json.crc b/kernel/tests/data/cdf-table/_delta_log/.00000000000000000003.json.crc deleted file mode 100644 index 26b5bbb5695c3a6763b6ba56d3a645ea52eafae9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20 ccmYc;N@ieSU}E@u^2L;7mh27xW}65A07(i5(f|Me diff --git a/kernel/tests/data/cdf-table/_delta_log/00000000000000000000.json b/kernel/tests/data/cdf-table/_delta_log/00000000000000000000.json deleted file mode 100644 index 4c635f5dc..000000000 --- a/kernel/tests/data/cdf-table/_delta_log/00000000000000000000.json +++ /dev/null @@ -1,13 +0,0 @@ 
-{"commitInfo":{"timestamp":1703265018828,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[\"birthday\"]"},"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"10","numOutputRows":"10","numOutputBytes":"6897"},"engineInfo":"Apache-Spark/3.5.0 Delta-Lake/3.0.0","txnId":"d05345ec-8304-433e-88ff-6498dc37ca19"}} -{"metaData":{"id":"d38a7090-96be-4b1b-b20f-b85ad8ae1a38","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"birthday\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["birthday"],"configuration":{"delta.enableChangeDataFeed":"true"},"createdTime":1703265016759}} -{"protocol":{"minReaderVersion":1,"minWriterVersion":4}} -{"add":{"path":"birthday=2023-12-22/part-00000-592a7e14-f790-4236-9c61-120d006eb3b8.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-22"},"size":694,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":1,\"name\":\"Steve\"},\"maxValues\":{\"id\":1,\"name\":\"Steve\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} -{"add":{"path":"birthday=2023-12-23/part-00001-723d68a5-94eb-4acc-9db1-e985867a1a6c.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-23"},"size":680,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"name\":\"Bob\"},\"maxValues\":{\"id\":2,\"name\":\"Bob\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} 
-{"add":{"path":"birthday=2023-12-23/part-00002-7c6f102f-6ad1-4e3b-bee3-df831f4abf3c.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-23"},"size":687,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"name\":\"Dave\"},\"maxValues\":{\"id\":3,\"name\":\"Dave\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} -{"add":{"path":"birthday=2023-12-23/part-00003-98b8082f-db4e-43f8-ac4f-56538beeddae.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-23"},"size":687,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"name\":\"Kate\"},\"maxValues\":{\"id\":4,\"name\":\"Kate\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} -{"add":{"path":"birthday=2023-12-24/part-00004-218c1bff-cde9-44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-24"},"size":694,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":5,\"name\":\"Emily\"},\"maxValues\":{\"id\":5,\"name\":\"Emily\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} -{"add":{"path":"birthday=2023-12-24/part-00005-8aeab9bc-7a46-4083-9a85-e4f8d4501a67.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-24"},"size":687,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":6,\"name\":\"Carl\"},\"maxValues\":{\"id\":6,\"name\":\"Carl\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} -{"add":{"path":"birthday=2023-12-24/part-00006-53327328-4603-45ad-adb9-21feeeee2c31.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-24"},"size":700,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":7,\"name\":\"Dennis\"},\"maxValues\":{\"id\":7,\"name\":\"Dennis\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} 
-{"add":{"path":"birthday=2023-12-25/part-00007-8cd4b5a3-b4dd-4bbc-8bb3-721fa82961c6.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-25"},"size":701,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":8,\"name\":\"Claire\"},\"maxValues\":{\"id\":8,\"name\":\"Claire\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} -{"add":{"path":"birthday=2023-12-25/part-00008-436dbf31-f213-4b3b-bcc3-5df022ec6b35.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-25"},"size":680,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":9,\"name\":\"Ada\"},\"maxValues\":{\"id\":9,\"name\":\"Ada\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} -{"add":{"path":"birthday=2023-12-25/part-00009-685aacbb-c7ac-4cb2-93f1-6dc27cd2e980.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-25"},"size":687,"modificationTime":1703265018088,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":10,\"name\":\"Borb\"},\"maxValues\":{\"id\":10,\"name\":\"Borb\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} diff --git a/kernel/tests/data/cdf-table/_delta_log/00000000000000000001.json b/kernel/tests/data/cdf-table/_delta_log/00000000000000000001.json deleted file mode 100644 index d6d9b7833..000000000 --- a/kernel/tests/data/cdf-table/_delta_log/00000000000000000001.json +++ /dev/null @@ -1,13 +0,0 @@ -{"commitInfo":{"timestamp":1703265021675,"operation":"UPDATE","operationParameters":{"predicate":"[\"id#1065 IN (2,3,4)\"]"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numRemovedFiles":"3","numRemovedBytes":"8187","numCopiedRows":"0","numDeletionVectorsAdded":"0","executionTimeMs":"808","numDeletionVectorsUpdated":"0","scanTimeMs":"639","numAddedFiles":"3","numUpdatedRows":"3","numDeletionVectorsRemoved":"0","numAddedChangeFiles":"6","numAddedBytes":"2705","rewriteTimeMs":"167"},"engineInfo":"Apache-Spark/3.5.0 
Delta-Lake/3.0.0","txnId":"afc86e2b-99e4-4dc3-bf82-2bfd5c6762bb"}} -{"add":{"path":"birthday=2023-12-22/part-00000-cd6a8496-3a3c-4ac9-8fba-035e60e71ab2.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-22"},"size":904,"modificationTime":1703265021654,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":3,\"name\":\"Dave\"},\"maxValues\":{\"id\":3,\"name\":\"Dave\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} -{"add":{"path":"birthday=2023-12-22/part-00001-96c64ea1-3383-42c8-bc83-487a583eb01b.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-22"},"size":904,"modificationTime":1703265021655,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":4,\"name\":\"Kate\"},\"maxValues\":{\"id\":4,\"name\":\"Kate\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} -{"add":{"path":"birthday=2023-12-22/part-00002-93942e85-bb5c-45ff-b334-3a50c28185bb.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-22"},"size":897,"modificationTime":1703265021655,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":2,\"name\":\"Bob\"},\"maxValues\":{\"id\":2,\"name\":\"Bob\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} -{"cdc":{"path":"_change_data/birthday=2023-12-22/cdc-00000-59fa51a4-edbb-4fc0-a497-6969cdf3966c.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-22"},"size":1028,"dataChange":false}} -{"cdc":{"path":"_change_data/birthday=2023-12-23/cdc-00000-fb59d34a-5bd7-4b10-8c41-71e38c07fdc2.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-23"},"size":1021,"dataChange":false}} -{"cdc":{"path":"_change_data/birthday=2023-12-22/cdc-00001-308c0cab-92b2-41e1-90bd-9416b10ba6a6.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-22"},"size":1028,"dataChange":false}} -{"cdc":{"path":"_change_data/birthday=2023-12-23/cdc-00001-985fd824-b34a-4f3e-b7e4-90bf8d04898e.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-23"},"size":1021,"dataChange":false}} 
-{"cdc":{"path":"_change_data/birthday=2023-12-22/cdc-00002-ea0bad63-f199-42c6-bf85-3b9f5027578c.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-22"},"size":1021,"dataChange":false}} -{"cdc":{"path":"_change_data/birthday=2023-12-23/cdc-00002-831078a2-a13d-4713-aa88-7d5f5228d781.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-23"},"size":1014,"dataChange":false}} -{"remove":{"path":"birthday=2023-12-23/part-00002-7c6f102f-6ad1-4e3b-bee3-df831f4abf3c.c000.snappy.parquet","deletionTimestamp":1703265021672,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"birthday":"2023-12-23"},"size":687}} -{"remove":{"path":"birthday=2023-12-23/part-00003-98b8082f-db4e-43f8-ac4f-56538beeddae.c000.snappy.parquet","deletionTimestamp":1703265021672,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"birthday":"2023-12-23"},"size":687}} -{"remove":{"path":"birthday=2023-12-23/part-00001-723d68a5-94eb-4acc-9db1-e985867a1a6c.c000.snappy.parquet","deletionTimestamp":1703265021672,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"birthday":"2023-12-23"},"size":680}} diff --git a/kernel/tests/data/cdf-table/_delta_log/00000000000000000002.json b/kernel/tests/data/cdf-table/_delta_log/00000000000000000002.json deleted file mode 100644 index abbe569bd..000000000 --- a/kernel/tests/data/cdf-table/_delta_log/00000000000000000002.json +++ /dev/null @@ -1,13 +0,0 @@ -{"commitInfo":{"timestamp":1703886093785,"operation":"UPDATE","operationParameters":{"predicate":"[\"id#39 IN (5,6,7)\"]"},"readVersion":1,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numRemovedFiles":"3","numRemovedBytes":"8268","numCopiedRows":"0","numDeletionVectorsAdded":"0","executionTimeMs":"3666","numDeletionVectorsUpdated":"0","scanTimeMs":"3237","numAddedFiles":"3","numUpdatedRows":"3","numDeletionVectorsRemoved":"0","numAddedChangeFiles":"6","numAddedBytes":"2732","rewriteTimeMs":"427"},"engineInfo":"Apache-Spark/3.5.0 
Delta-Lake/3.0.0","txnId":"179df3d2-696a-460b-bebe-eb911c63e0b8"}} -{"add":{"path":"birthday=2023-12-29/part-00000-1ca113cd-a94c-46a8-9c5b-b99e676ddd06.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-29"},"size":917,"modificationTime":1703886093724,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":7,\"name\":\"Dennis\"},\"maxValues\":{\"id\":7,\"name\":\"Dennis\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} -{"add":{"path":"birthday=2023-12-29/part-00001-8334a9a7-7041-4d88-8377-aa36cfe5762f.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-29"},"size":911,"modificationTime":1703886093724,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":5,\"name\":\"Emily\"},\"maxValues\":{\"id\":5,\"name\":\"Emily\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} -{"add":{"path":"birthday=2023-12-29/part-00002-7dd6bbed-a0c1-44f0-b729-42b7d7d7f5ca.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-29"},"size":904,"modificationTime":1703886093724,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"id\":6,\"name\":\"Carl\"},\"maxValues\":{\"id\":6,\"name\":\"Carl\"},\"nullCount\":{\"id\":0,\"name\":0}}"}} -{"cdc":{"path":"_change_data/birthday=2023-12-24/cdc-00000-4beb5c26-e34a-470a-a62e-2ecc8dc24035.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-24"},"size":1034,"dataChange":false}} -{"cdc":{"path":"_change_data/birthday=2023-12-29/cdc-00000-e8760032-5a99-4d37-9739-fc9d4db24308.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-29"},"size":1041,"dataChange":false}} -{"cdc":{"path":"_change_data/birthday=2023-12-24/cdc-00001-a5f1d5a2-e308-406f-af76-3b32bab79832.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-24"},"size":1028,"dataChange":false}} -{"cdc":{"path":"_change_data/birthday=2023-12-29/cdc-00001-1aa06a1f-c45f-4227-b0ac-e70b1e2115b1.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-29"},"size":1035,"dataChange":false}} 
-{"cdc":{"path":"_change_data/birthday=2023-12-24/cdc-00002-ddca9e04-03ef-4533-a9c8-05c1d4f79d6a.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-24"},"size":1021,"dataChange":false}} -{"cdc":{"path":"_change_data/birthday=2023-12-29/cdc-00002-97dc4c5b-3806-4198-99ed-062c0a337c29.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-29"},"size":1028,"dataChange":false}} -{"remove":{"path":"birthday=2023-12-24/part-00006-53327328-4603-45ad-adb9-21feeeee2c31.c000.snappy.parquet","deletionTimestamp":1703886093764,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"birthday":"2023-12-24"},"size":700}} -{"remove":{"path":"birthday=2023-12-24/part-00004-218c1bff-cde9-44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet","deletionTimestamp":1703886093764,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"birthday":"2023-12-24"},"size":694}} -{"remove":{"path":"birthday=2023-12-24/part-00005-8aeab9bc-7a46-4083-9a85-e4f8d4501a67.c000.snappy.parquet","deletionTimestamp":1703886093764,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"birthday":"2023-12-24"},"size":687}} diff --git a/kernel/tests/data/cdf-table/_delta_log/00000000000000000003.json b/kernel/tests/data/cdf-table/_delta_log/00000000000000000003.json deleted file mode 100644 index 26b9aa78d..000000000 --- a/kernel/tests/data/cdf-table/_delta_log/00000000000000000003.json +++ /dev/null @@ -1,3 +0,0 @@ -{"commitInfo":{"timestamp":1704559499570,"operation":"DELETE","operationParameters":{"predicate":"[\"(name#40 = Dennis)\"]"},"readVersion":2,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numRemovedFiles":"1","numRemovedBytes":"917","numCopiedRows":"0","numDeletionVectorsAdded":"0","executionTimeMs":"3479","numDeletionVectorsUpdated":"0","numAddedFiles":"0","numDeletionVectorsRemoved":"0","numAddedChangeFiles":"1","numDeletedRows":"1","scanTimeMs":"3157","numAddedBytes":"0","rewriteTimeMs":"322"},"engineInfo":"Apache-Spark/3.5.0 
Delta-Lake/3.0.0","txnId":"ef48960f-ceb5-4bc2-9b59-8c947083ae58"}} -{"remove":{"path":"birthday=2023-12-29/part-00000-1ca113cd-a94c-46a8-9c5b-b99e676ddd06.c000.snappy.parquet","deletionTimestamp":1704559499540,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{"birthday":"2023-12-29"},"size":917}} -{"cdc":{"path":"_change_data/birthday=2023-12-29/cdc-00000-ed223ebe-3b27-44af-b2cf-91e882f4c500.c000.snappy.parquet","partitionValues":{"birthday":"2023-12-29"},"size":971,"dataChange":false}} diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00000-592a7e14-f790-4236-9c61-120d006eb3b8.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00000-592a7e14-f790-4236-9c61-120d006eb3b8.c000.snappy.parquet.crc deleted file mode 100644 index 7ccb48758195a95ffceb5d0040f4b2200480acd2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16 XcmYc;N@ieSU}D%UTPz}^ctZ^U9;O4! diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00000-cd6a8496-3a3c-4ac9-8fba-035e60e71ab2.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00000-cd6a8496-3a3c-4ac9-8fba-035e60e71ab2.c000.snappy.parquet.crc deleted file mode 100644 index 124dae08bffa1ba80380fc227ff8c61cbb209822..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16 XcmYc;N@ieSU}9+fvp@9FhuO9OD`p0I diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00001-96c64ea1-3383-42c8-bc83-487a583eb01b.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00001-96c64ea1-3383-42c8-bc83-487a583eb01b.c000.snappy.parquet.crc deleted file mode 100644 index 031ec7d55e658fc2c15070e5253061455ff037e7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16 XcmYc;N@ieSU}9M7HaYashuO9OCZq+x diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00002-93942e85-bb5c-45ff-b334-3a50c28185bb.c000.snappy.parquet.crc 
b/kernel/tests/data/cdf-table/birthday=2023-12-22/.part-00002-93942e85-bb5c-45ff-b334-3a50c28185bb.c000.snappy.parquet.crc deleted file mode 100644 index 41c3d9679b50f7a556562a8b25282ea5478092e2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16 XcmYc;N@ieSU}C6#wJqqTfanJRCpHC^ diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-22/part-00000-592a7e14-f790-4236-9c61-120d006eb3b8.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-22/part-00000-592a7e14-f790-4236-9c61-120d006eb3b8.c000.snappy.parquet deleted file mode 100644 index 7a24bef8d2e6e3b0fd0b1594754f560924ee9bc8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 694 zcmah{L5tHs6rPOj64s&!oneAO%HW1wh(lu9-EQec4~xiBiXKH|(iv+oO}k0DBBf_< z9z5vHpWsjM=1qT)m*Pt@E(;#K%$xV#_rCYOc{6?VEF_3@2_ru~eB0f02?lqM*a+?4 z#|WVtmIWuo?h#IY{r=%2;@y%(lJbRpPK8OPQA(4%x(;@ zHJXfno$nI{zMmF(i0DU)V$Y_ZtuCna8C&LaAy*QViCUz%)OnFCGuDp8Km>>#{evhj zQUULAvDT%?m+!siTt{yNdAD){KdhF8x7sYF58z>=)g zbDcvVTc@dvQvg7v_W55@KH6-z{#9|!aa9gol5GCJ>X%!@o|=IlEb^XF*b7A%iLiGV zuai{uRNN1TQ5c-6L3|SR^=xn&1!{N_%#_sOaj5&jP^nlRIX*19+Vx#n)f9g0AM;q8 A?f?J) diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-22/part-00000-cd6a8496-3a3c-4ac9-8fba-035e60e71ab2.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-22/part-00000-cd6a8496-3a3c-4ac9-8fba-035e60e71ab2.c000.snappy.parquet deleted file mode 100644 index f5e6762c81d7862c68259450a176dc6ead959261..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 904 zcma)5L2DC16rP>Nu#_MQdBZL&5*8a;h(nenZ5l#Bks_sp;#H(fGGn@Qvq^V16$!a` z@<$Y;2XCJBA9(U4o(i4>@h*7s;G5k|8u3zCX5ahfd*8hG?d;LRrydX}g9Y!e-d-9G zP)v(p60&}u5<=FDHCz#MpaQRdT)tbS+-_!+0#Y7}bA_}L+=1V}K7U_dQ7V>G8nN$E z+bU?i`~}ozn53|-M>v(qVZ*kNNi>TR+hHc`BV9*5=;hIc0ed<`(&3X*JQ309aXdIt zBLPJOH>gE@iEUv}2*pTkkV8k)HBdSpoQQC!j+0lD#Rj9cV|S}2wr{tHS+sRMP)6A! 
zwqG^7S^wF}$YwfCmSJBwKb}bl`%0nAt_~@~bT`PdIeQ6=}O;&HFRy~<{P>IbqS`%SOi@ans=)!%6})u6T8sLS?F eeISMMwmsFXx25chExU_1Fnh>Hye&ug5B>$xH_9Ub diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-22/part-00001-96c64ea1-3383-42c8-bc83-487a583eb01b.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-22/part-00001-96c64ea1-3383-42c8-bc83-487a583eb01b.c000.snappy.parquet deleted file mode 100644 index a6c9b9265cbdd71e2a9a8e31d1abb4a0aa0dd073..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 904 zcma)5L5tHs6rPN=gr$l?-;h9%65O&2bx3Ht+bv~54~wizS-gsrX=m&P)3lqUij;!j zMf?qW@hW)Lf1npn;zhxe7w>{6557sWZNeIE#v!GhNxzrMKQ z0L8QjCLtR)DIw&NQNR^32P*L9?W^adHOke-igHh+3TY*{2ETuO`mwP_E0!}Fv9D9x zDrmj@1=MDkWU#JBIFrd?!?ut~G>a13VI~|PT}NH$=Fyn}`#MC@(cMxq5%J?*GB{Bq z0YwBisYL^cwlF9}Vx%_7k)!DvDD?&>A{we*`fPH(!Km%novMlU?Itmcwyp=tD7(WB zs%B^5f3Y&Mm36bo&=>ZPXA;7`P$;u+25?gCI#9}$G6R{`A?sGVsHMAItXgZ8t!w?% zS}&>AIx|*c|CrB2+iV{hogF=^rISAt7<)(I24NozMyn?KY?N_^HOm^uLoOx)hsCqG z@h2)yf^lSBuJb1Mh_(G6BRLH@j`(z{QXY?=@{vk~6sfSj8#7l=(QUg)8czqQTXP=; zDwK)aelT$5Vo6c=77> zPrWj6F_njNVjbpf-x|f0d`=ag`*rSDw))c`loi=;_^rC{?a5|;x86{L=3d>CtzB;* ch4OcN)$m$U_Qkf{!5g@E$a}mk$M_Ha1%VUF6#xJL diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-22/part-00002-93942e85-bb5c-45ff-b334-3a50c28185bb.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-22/part-00002-93942e85-bb5c-45ff-b334-3a50c28185bb.c000.snappy.parquet deleted file mode 100644 index 51ff3c55b03994ff69af0002e461618b84e833ef..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 897 zcma)5L5tHs6rN1il3gk&bcRIAQi2X{HqzpI2tlO z;b0U?J!8*h6#L`QxKU?K<{)F|KSX>MFtGP*DiaosU$Bu(xZnvlzH2i}IcT-5IEki% z#Hv|OeHn<@YCoP?%5~WC1(b7`A0~1rBPfK^AmCmA0O*u8Ye8zYXY+Y&*_?7L+u;;H z9A3YB^)p|IxE{-sIWivRW8WV|)!e75%iKD1tJ~hx4@6aX4YyTyoqf^t_Ua8eXzte? 
g(b{tcf=hSTl?|sQgvWQx4&1)%LGR$EoWRfb7mvipT>t<8 diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00001-723d68a5-94eb-4acc-9db1-e985867a1a6c.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00001-723d68a5-94eb-4acc-9db1-e985867a1a6c.c000.snappy.parquet.crc deleted file mode 100644 index 7e5ff66214161ea46c5957ad547770fe96c6085a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16 XcmYc;N@ieSU}8xBJVVH^ROdYaBb@~R diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00002-7c6f102f-6ad1-4e3b-bee3-df831f4abf3c.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00002-7c6f102f-6ad1-4e3b-bee3-df831f4abf3c.c000.snappy.parquet.crc deleted file mode 100644 index 8a645acc6366c86d29f5327a8093871a19ee8bc0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16 XcmYc;N@ieSU}ET7n7>oP?f!EBB~b;o diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00003-98b8082f-db4e-43f8-ac4f-56538beeddae.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-23/.part-00003-98b8082f-db4e-43f8-ac4f-56538beeddae.c000.snappy.parquet.crc deleted file mode 100644 index 2e80f1525d9978d3d40e42dfe8080216ed9cd30b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16 XcmYc;N@ieSU}9+YNIxgxcK|7MYbTCv6(#LJw7iqNw7E5L#!K)WUI6+W}Qk zPL()tKfovO1DrW9-k7Qq2QK58H*em|?D+KAi;y7FCJwp&dii79B^bgj zVk30-F-8bAtjYrRkUS#azWn})Z&B_A8%hHxNAsB7$INL~Q2im8Q3xuyQRR)TL6_R( z7_x}DbWBE-2btkXt60eV?dxK4re~5g0o=h3P81?7aGIH%=^Zq3fle`VneW*!5bGg( zi&Zp&DW*sC*t3V5Q8t#MT{GAwf}XGuNQ1~Pi?r!yiwNY6UDB$ROdq3lLPy?sXeKuX zl&W2fe_Q7R2A%H~d4lLii(=2FpR6`)=R3|kp9;BjNib0Jp}rV{Od;zCz-zAV|TM2tm**ug)H`n(YE z7Uyf-h-OaB4Y$W?+ZFuv3rTLWCO|_miJmjx)U8tH*Uu?K}A_(OQ;GUM3jh^cY zfO1_FGAn=pnL6ZuMEP{H-GMp9CrKO~sd09gjPz`LltgNB7|oQ_@xxe;qKQ(OJa9u;vf1-JSkoDN)!$f~m;3+# diff --git 
a/kernel/tests/data/cdf-table/birthday=2023-12-23/part-00003-98b8082f-db4e-43f8-ac4f-56538beeddae.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-23/part-00003-98b8082f-db4e-43f8-ac4f-56538beeddae.c000.snappy.parquet deleted file mode 100644 index 357f6f12ca7274767da28ed79ec6a9695395ed84..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 687 zcmah{L2J}N6rPN+1X%@PXPiI|VX&cvI3#4Z-7dZ8p-7cbyo$(VXIz8X>^8|(WLYTm zCOryXym%G-9fJSBf8xWq7-G3PMX(8Y( zPM5k7)%?Af=|(EqNcP!U@#e!q^D(cRYB_0m$lv6;P&FUFTygV65Xu$69W5^#J=GNe z<+3PbRsaDqb;$pU^6_fD4zH41jjMF1l9$u}WxrY@cHA}mXjb{%gnlgIM8y71w#*CV wt85UDk~lh0!)!kp=*jRPiPUI6nkcE`y;u*Tky4r5bwgOP+4BQf(+PalKOA|N?f?J) diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00004-218c1bff-cde9-44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00004-218c1bff-cde9-44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet.crc deleted file mode 100644 index 1b77d1c5e1dcb7711745c8dd891b5406489a4251..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16 XcmYc;N@ieSU}8`bzThFGctZ^U9LfW^ diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00005-8aeab9bc-7a46-4083-9a85-e4f8d4501a67.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00005-8aeab9bc-7a46-4083-9a85-e4f8d4501a67.c000.snappy.parquet.crc deleted file mode 100644 index ddf9a2ea89c22c0fe5d374043a1bb54505cc63a7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16 XcmYc;N@ieSU}88bmv%$K?f!EBBiRLj diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00006-53327328-4603-45ad-adb9-21feeeee2c31.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-24/.part-00006-53327328-4603-45ad-adb9-21feeeee2c31.c000.snappy.parquet.crc deleted file mode 100644 index 80c0c5edc43fc9276fcb9281fee3b6731b580a87..0000000000000000000000000000000000000000 GIT binary patch 
literal 0 HcmV?d00001 literal 16 XcmYc;N@ieSU}E_7X4`Xpj)x@xEP)0} diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-24/part-00004-218c1bff-cde9-44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-24/part-00004-218c1bff-cde9-44b2-b7bf-93f2f37c0cb9.c000.snappy.parquet deleted file mode 100644 index 5a85db4f6b9f12b50b2b4f0c437f69c974aa3cfb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 694 zcmah{L2J}N6rN0XODGE>>Fz1&c^o7Ox^Q*%{Z+B)d(rEoIrO z-aM3|H*fwD@#b&w>P>t}##->;W!}8^zW2TN&70}t7Xd+}ODuBz<=fA;Lom48#6)Q4 zK1K*_8ErTrc8@Uf^~d{nTel#0hJPVX(>S@b?_%3(RTbJ0Y*TQmaJxZhs}l}2$uS_1 z?a&FCRyW9upLL3*$WLDvv$rZ0qy@n}Y~ffU(t%4W6RFxmCl0_VwjJAZO&H4_AamQU zU<6Z4pU`9195>@MCq?`9aE%8&K^IU1$uEO+>1Ts*|Iv7*dZvyyr2O{@}Gu+hsktDXr`v4pUO#aXH5 zDu+O}N)izz0Dwy6u|K42wBBsIo8p?|rX0G&+5CUiuQteft_OaQ<~^;j7w{nDL2o}= z#fj|6s2>c&z(10M=rHW7+2APjP-;D40BY5=XkG0^z%bR(>_j|wh-n=<`@-ifd3#eP| z{(5({t}$qRtF`F0L9u7ikH!Ed^chP(gHBlX)Zsy>hrw8>MDBARmR#@oA*|^PzUpt={h0s& diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-24/part-00006-53327328-4603-45ad-adb9-21feeeee2c31.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-24/part-00006-53327328-4603-45ad-adb9-21feeeee2c31.c000.snappy.parquet deleted file mode 100644 index f716e5ddb434fb0ba69dff366c0a0f1497c9f897..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 700 zcmah{!D`z;5M6CtR}@2A@Ge>qR4j=|KwTtcCywzYrG!#~F};>jR^F{ED$9-}w*=!$ zKA^|+mUI3g=id4a{fZuXDRd-DA<#oFyEAXzyqVeA$;q35Akrlk`SbhN&10KjaNER0 z=+Q1l2<;dT;RMVfyX5x!&x?DEC!;N+4UCsMP2-$B#ms7z8TCalqu`WqyQVZo9kQuO zPJx4%O~+(XS`Zn%>f}qAeR`iyKkB(8Edckhg=2+C2aHxK=XwvF*+8e5+01iI=#2G| z*=8k;V2bGrdg_{^ZA06WqW!A9tp_z>GLQz5OH8`-+8_eCW1DnprK(P)bw(%dc+@oC z+@M#_)gbpZuS(4y2029Zt3k1A(kr72lX{l5$Yw$=B}^x(LB(gC{yNgW&py5|p+isS+FVAXG=g8xH2`2&1OlV!n!m&|+L``-87H-`E;%4yet-OfZ&2<|_LMGA9;YhF4Znvu>r^exK`^I~RB*SUv_VsHs7=m+ zgSbPdWL8B`nLOzgYn8uwS7}uMx+NpCsRvvh@Lq>r8M__Hp$rf^{)bUq 
zrV{Sta$`!FuinX}DV0{GVxO(8DBrJ*7>lCJH}g{X;#Fc&U5N4Xt*EXFN{I%zp(R;q z7A6NE+oY+AQy@U6@kJ{t#@pS_zizHIuG?Wsk}dw%{bGmM(`MlZ%iL`ox}gjs8M;UD zCP}rc<3Tuz!r)8~nARk~5tjjfXowaF1w z5qIdAj4BT*!{b)5kooDGV)9PUBxyo&2Q!=~L|Wi9Gda^cXygE#V(xI?vtc3LL-rQ0 zU<6Z4pVK4H9&T3ISc-PdVw(tNf+C;>l52~!=@*L#HwKt0 zEylmi^Bx1wH;X($^n*pQXVXts8`SxX&GV^{3kiyZ5hu=dUL^C3-HAj`1c>eZgDB2Y z0q=0O)TPMh7h zROgV$mT4;E6adhvefKY^J6NsP{!O#txM_zeNjCjo_vPhb)3RHhTm?){kXQA!}eWhZ#$9*_(b>jzcq7(Q>{{Yj@ BmF55d diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-25/part-00009-685aacbb-c7ac-4cb2-93f1-6dc27cd2e980.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-25/part-00009-685aacbb-c7ac-4cb2-93f1-6dc27cd2e980.c000.snappy.parquet deleted file mode 100644 index 5cc08018718322083c38a42b097cd848df07d763..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 687 zcmah{L2J}N6rPN+gt8QbopAy=gu#Xu;*gMCyIp#*hayEn@hT#dopB9jv)d%A$g)Ro zJ$Mkj2>uNH4gLpj{sC_ueUrpm@ZjamyzhPQd+*Jgi-%8Qg2<3C^7Y5}pL-s`;O-Cy zq1}5JA#}^y1q+x*eDeAImv{CJ%HF7>*g!d5R+)PTyUaET{Xwux!D-=^z-)8$utyzo z265ziG$pCILFDAopkB%9)$@AxQZFR20o=z7=L(SlT5V zc3eXvm|}WL&jRP9Yo@al9ki`=9@K=PKpI4TS!75*Swx^9^~j)CYU{LG=QIsc&@!!e z)@c3pZfad&(E4Vy=#@or;Ls1&5GD*6TUK)+R}!WY?SSI7uIhYQvfGImiwLp9e;D;; zA>b`8*SZna@{L&NMk?7z_R-q#=Iu)JDX*JqJ!^Q#U*x(_HJ?7+aPveE$`!ypEiW5A z*A)Qex+r8;00A;}$p47)>1MkPuajGi>vX7+m-GK+KieX9-Zy-IQThFZek|fd#Qs6H z&I{$MY!pwDI67A2>?j%O+4wk#)Z{3dDXHTJu^vSer80TwhOlI_=ew|`3;3$P0WPta A!2kdN diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00000-1ca113cd-a94c-46a8-9c5b-b99e676ddd06.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00000-1ca113cd-a94c-46a8-9c5b-b99e676ddd06.c000.snappy.parquet.crc deleted file mode 100644 index ac22997311cf876d645c4cdfab55a05fbccb1e84..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16 XcmYc;N@ieSU}7lJPBJh|ng0_29*hK+ diff --git 
a/kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00001-8334a9a7-7041-4d88-8377-aa36cfe5762f.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00001-8334a9a7-7041-4d88-8377-aa36cfe5762f.c000.snappy.parquet.crc deleted file mode 100644 index 463fbfdfc61e31c5db4f2e1a1ca2b3d86f2cc82e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16 XcmYc;N@ieSU}9+O4?n;%*V_O9BGd$a diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00002-7dd6bbed-a0c1-44f0-b729-42b7d7d7f5ca.c000.snappy.parquet.crc b/kernel/tests/data/cdf-table/birthday=2023-12-29/.part-00002-7dd6bbed-a0c1-44f0-b729-42b7d7d7f5ca.c000.snappy.parquet.crc deleted file mode 100644 index c0fafe9ca53b8165a8ea951882c78cf053f76340..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16 XcmYc;N@ieSU}Bj6IcN2w53_9nE3*cU diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-29/part-00000-1ca113cd-a94c-46a8-9c5b-b99e676ddd06.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-29/part-00000-1ca113cd-a94c-46a8-9c5b-b99e676ddd06.c000.snappy.parquet deleted file mode 100644 index b24ab63fdc30f3b52052f1f0eb0afedeed3c0424..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 917 zcma)5&ubGw6rP>dbtwTWWQJV`5|(ypAr4vcqiI+QiWHF$f>)6;$&BgZW|Qu2loE3F z;2+Vm|APm;2p&9o)L#4#RK$zmoBfeSycCw1_uhQ(d+&WSJGl4QC5V)XN&fu)`Q}!I zU~qL}AhflE5kec&K^WTWdhQoC1+` z8(Ze4BFr~}Eeehb>lwq!D2_DC1T<1EMplIyPS%Jhpy7;^d*Qp%aq22!pb9XNfj7`)hlFPEZ- ztaE->oUbuyBj-Z9BK>EGl)Oui0O7TjW~+`~Z4gS|V{!-4j~c~}K|gC{(6VNl(`d~2 zjDu(_)sMZ9QS47c^G1WUSRI-B{~_YDfPvI!bD6Mc`jSm#!Ua#b`BhukN=>_K$4N9F zC3ekz=F33DcK6A`RyP1gzJPEJ^TR}rWdwn69t3QKK9*7RLym&y3B1bx4Ji+`+=y6Vbkq2-1?zt4G$VkIcgm? 
h>Y{T{9|Z+Y+%vk$$4S2KVg@-O(Y(Nq8c diff --git a/kernel/tests/data/cdf-table/birthday=2023-12-29/part-00001-8334a9a7-7041-4d88-8377-aa36cfe5762f.c000.snappy.parquet b/kernel/tests/data/cdf-table/birthday=2023-12-29/part-00001-8334a9a7-7041-4d88-8377-aa36cfe5762f.c000.snappy.parquet deleted file mode 100644 index c7717e10ca83d7fa60e1ae0f1f744ad5a1838792..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 911 zcma)5&x_MQ6rN1CB`gIM`i2C`Qi2?YrfD}xm8JCJ zLBX@rq-nf$I3)d`(Zws@bIh`4Nm2V zLk@$x)S#Zg#?aUb_(<-Oc2U7qAm1CD@?a=?@x^3$z^GX?Tegk^m~}me^)VkPqwGFA zvGvwsg2l?nUNXKD4cQhlr2Zz~ zmEwCt;V!W+wKd^C10=*vc8rMDfzoT$&C3ih_JP68-qEKldec$AKPXB3uFol36bRvovr*PnX6D2aZ}ZB*ULk*N0%t2H^OA5|-& faab7$F5LrH)+!Al`h4GP;VoQzj?DIdn8#1Eu4^i3o@4IC(X>+F;an>~7UW`*zDL;`vw)lu`DO z?N`li=095**-X31GW3Q0ShK8gG~{9;a9BK@ z8-Jmq*dK@1%?5ArIo;+^(65*WYhlv`h2!U`K1fm}x0V`F_y%Kdhv-!NXC{8sN<#>u84zFLm z`l**DuBY;BPOO8x?favslFzB)aj(I>%4UD+2eKmjO|RYX>btVl-)S_}ptakm%l1xv dAcgX_J=LtYrR<9>yNfq4d&qmdEl2nd{sk(h%O?N; From 4bc58197abf97a9c03c585f8dcd4c72ed8482c9a Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Mon, 9 Dec 2024 10:40:09 -0800 Subject: [PATCH 46/56] Remove dv test file --- .../_delta_log/00000000000000000000.json | 3 --- .../_delta_log/00000000000000000001.json | 2 -- .../_delta_log/00000000000000000002.json | 2 -- ...ector_61d16c75-6994-46b7-a15b-8b538852e50e.bin | Bin 45 -> 0 bytes ...37d-4e51-827b-c3d5516560ca-c000.snappy.parquet | Bin 635 -> 0 bytes 5 files changed, 7 deletions(-) delete mode 100644 kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000000.json delete mode 100644 kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000001.json delete mode 100644 kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000002.json delete mode 100644 kernel/tests/data/table-with-cdf-and-dv/deletion_vector_61d16c75-6994-46b7-a15b-8b538852e50e.bin delete mode 100644 
kernel/tests/data/table-with-cdf-and-dv/part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet diff --git a/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000000.json b/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000000.json deleted file mode 100644 index 4ab50e731..000000000 --- a/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000000.json +++ /dev/null @@ -1,3 +0,0 @@ -{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}} -{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableChangeDataFeed":"true","delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}} -{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000001.json b/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000001.json deleted file mode 100644 index 6ddbf539f..000000000 --- a/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000001.json +++ /dev/null @@ -1,2 +0,0 @@ 
-{"remove":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","deletionTimestamp":1677811194426,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":635,"tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}} -{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":false}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}} diff --git a/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000002.json b/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000002.json deleted file mode 100644 index 8e39274e6..000000000 --- a/kernel/tests/data/table-with-cdf-and-dv/_delta_log/00000000000000000002.json +++ /dev/null @@ -1,2 +0,0 @@ -{"remove":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","deletionTimestamp":1677811194426,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":635,"tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}} 
-{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":false}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/kernel/tests/data/table-with-cdf-and-dv/deletion_vector_61d16c75-6994-46b7-a15b-8b538852e50e.bin b/kernel/tests/data/table-with-cdf-and-dv/deletion_vector_61d16c75-6994-46b7-a15b-8b538852e50e.bin deleted file mode 100644 index f1a01e661cdcca08ff5d67e7d2de53381980735a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 45 lcmZQ%U|>+Wc-b+>@u6oHnRZaa}xk;gAOoS z#<6}t%0rSCNt@&m3E|*~#Cc*mq6q?Bjf0sEC&T3yyEjN{9KCms%M}*aS7r27rN~Tj zzA#1W7Im$F+m7qFUCSlpIkYWjA7Cc8`45BN8(r(ouK2DKhm&oqokhKfIaLJUzYWIu zPlHKl678-<$u_Z>3nwp@5?4qBej=(UKN%Q>#iA`8S!W3Kv+Rn6JI+Zl%15 zS5`$GRbJ1F6QviWH~GBwGEAG$c3l+NBa^IBOI45~tF^{Z6NZvi&--82o2)mRFB=fg z4w&qQd5e|0q&+nA%=X0kY0=qF(izCXnGE_3#gJBXUG{z7KkJ-?b)pv?9F9Xjj^mLU lg%~U_#xTV3XgCPMU~!JGNsB^M@u}mw^bwBeg)ZpZ{R3QVoUi}@ From 02599e656348c625688665533852842bf13e02f6 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Mon, 9 Dec 2024 10:58:13 -0800 Subject: [PATCH 47/56] appease clippy --- kernel/src/table_changes/physical_to_logical.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index b01e5b2b7..f1be6496a 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -15,6 +15,7 @@ use super::resolve_dvs::ResolvedCdfScanFile; use super::scan_file::{CdfScanFile, CdfScanFileType}; /// Returns a map from change data feed column name to an expression that generates the row data. 
+#[allow(unused)] fn get_generated_columns(scan_file: &CdfScanFile) -> DeltaResult> { let timestamp = Scalar::timestamp_from_millis(scan_file.commit_timestamp)?; let version = scan_file.commit_version; @@ -34,6 +35,7 @@ fn get_generated_columns(scan_file: &CdfScanFile) -> DeltaResult SchemaRef { if scan_file.scan_type == CdfScanFileType::Cdc { let change_type = StructField::new("_change_type", DataType::STRING, false); @@ -82,6 +85,7 @@ fn get_read_schema(scan_file: &CdfScanFile, global_scan_state: &GlobalScanState) /// Reads the data at the `resolved_scan_file` and transforms the data from physical to logical. /// The result is a fallible iterator of [`ScanResult`] containing the logical data. +#[allow(unused)] pub(crate) fn read_scan_data( engine: &dyn Engine, resolved_scan_file: ResolvedCdfScanFile, From bd43bbaf2bb5112c304bc31f9aa88b932b61b970 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Mon, 9 Dec 2024 14:43:16 -0800 Subject: [PATCH 48/56] Add expression test --- .../src/table_changes/physical_to_logical.rs | 73 ++++++++++++++++--- kernel/src/table_changes/scan.rs | 5 +- 2 files changed, 63 insertions(+), 15 deletions(-) diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index f1be6496a..de5c9b134 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -38,7 +38,7 @@ fn get_generated_columns(scan_file: &CdfScanFile) -> DeltaResult DeltaResult { let mut generated_columns = get_generated_columns(scan_file)?; @@ -46,7 +46,7 @@ fn get_expression( .iter() .map(|field| match field { ColumnType::Partition(field_idx) => { - let field = global_state.logical_schema.fields.get_index(*field_idx); + let field = logical_schema.fields.get_index(*field_idx); let Some((_, field)) = field else { return Err(Error::generic( "logical schema did not contain expected field, can't transform data", @@ -69,17 +69,13 @@ fn get_expression( /// Gets the physical 
schema that will be used to read data in the `scan_file` path. #[allow(unused)] -fn get_read_schema(scan_file: &CdfScanFile, global_scan_state: &GlobalScanState) -> SchemaRef { +fn get_read_schema(scan_file: &CdfScanFile, read_schema: &StructType) -> SchemaRef { if scan_file.scan_type == CdfScanFileType::Cdc { let change_type = StructField::new("_change_type", DataType::STRING, false); - let fields = global_scan_state - .read_schema - .fields() - .cloned() - .chain(iter::once(change_type)); + let fields = read_schema.fields().cloned().chain(iter::once(change_type)); StructType::new(fields).into() } else { - global_scan_state.read_schema.clone() + read_schema.clone().into() } } @@ -98,8 +94,8 @@ pub(crate) fn read_scan_data( mut selection_vector, } = resolved_scan_file; - let expression = get_expression(&scan_file, global_state, all_fields)?; - let schema = get_read_schema(&scan_file, global_state); + let expression = get_expression(&scan_file, global_state.logical_schema.as_ref(), all_fields)?; + let schema = get_read_schema(&scan_file, global_state.read_schema.as_ref()); let evaluator = engine.get_expression_handler().get_evaluator( schema.clone(), expression, @@ -138,3 +134,58 @@ pub(crate) fn read_scan_data( }); Ok(result) } + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use crate::expressions::{column_expr, Expression, Scalar}; + use crate::scan::ColumnType; + use crate::schema::{DataType, StructField, StructType}; + use crate::table_changes::scan_file::{CdfScanFile, CdfScanFileType}; + + use super::get_expression; + + #[test] + fn add_get_expression() { + let test = |scan_type, expected_expr| { + let scan_file = CdfScanFile { + scan_type, + path: "fake_path".to_string(), + dv_info: Default::default(), + remove_dv: None, + partition_values: HashMap::from([("age".to_string(), "20".to_string())]), + commit_version: 42, + commit_timestamp: 1234, + }; + let logical_schema = StructType::new([ + StructField::new("id", DataType::STRING, true), + 
StructField::new("age", DataType::LONG, false), + StructField::new("_change_type", DataType::STRING, false), + StructField::new("_commit_version", DataType::LONG, false), + StructField::new("_commit_timestamp", DataType::TIMESTAMP, false), + ]); + let all_fields = vec![ + ColumnType::Selected("id".to_string()), + ColumnType::Partition(1), + ColumnType::Selected("_change_type".to_string()), + ColumnType::Selected("_commit_version".to_string()), + ColumnType::Selected("_commit_timestamp".to_string()), + ]; + let expression = get_expression(&scan_file, &logical_schema, &all_fields).unwrap(); + let expected = Expression::struct_from([ + column_expr!("id"), + Scalar::Long(20).into(), + expected_expr, + Expression::literal(42i64), + Scalar::Timestamp(1234000).into(), // Microsecond is 1000x millisecond + ]); + + assert_eq!(expression, expected) + }; + + test(CdfScanFileType::Add, "insert".into()); + test(CdfScanFileType::Remove, "delete".into()); + test(CdfScanFileType::Cdc, column_expr!("_change_type")); + } +} diff --git a/kernel/src/table_changes/scan.rs b/kernel/src/table_changes/scan.rs index 9b4faea04..d137d8e7c 100644 --- a/kernel/src/table_changes/scan.rs +++ b/kernel/src/table_changes/scan.rs @@ -4,14 +4,11 @@ use itertools::Itertools; use tracing::debug; use crate::scan::state::GlobalScanState; -use crate::scan::{ColumnType, ScanResult}; +use crate::scan::ColumnType; use crate::schema::{SchemaRef, StructType}; use crate::{DeltaResult, Engine, ExpressionRef}; use super::log_replay::{table_changes_action_iter, TableChangesScanData}; -use super::physical_to_logical::read_scan_data; -use super::resolve_dvs::resolve_scan_file_dv; -use super::scan_file::scan_data_to_scan_file; use super::{TableChanges, CDF_FIELDS}; /// The result of building a [`TableChanges`] scan over a table. 
This can be used to get a change From 61da4c34914270264dfab09f16f6bbccfca82025 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Mon, 9 Dec 2024 17:36:20 -0800 Subject: [PATCH 49/56] Address PR comments --- kernel/src/expressions/scalars.rs | 4 +- kernel/src/table_changes/mod.rs | 11 ++- .../src/table_changes/physical_to_logical.rs | 83 ++++++++++--------- kernel/src/table_changes/scan.rs | 15 ++-- 4 files changed, 66 insertions(+), 47 deletions(-) diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs index fe9ac3fd3..ac1779f95 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -153,7 +153,7 @@ impl Scalar { } /// Constructs a Scalar timestamp from an `i64` millisecond since unix epoch - pub fn timestamp_from_millis(millis: i64) -> DeltaResult { + pub(crate) fn timestamp_from_millis(millis: i64) -> DeltaResult { let Some(timestamp) = DateTime::from_timestamp_millis(millis) else { return Err(Error::generic(format!( "Failed to create millisecond timestamp from {millis}" @@ -472,7 +472,9 @@ impl PrimitiveType { #[cfg(test)] mod tests { + use core::panic; use std::f32::consts::PI; + use std::time::Duration; use crate::expressions::{column_expr, BinaryOperator}; use crate::Expression; diff --git a/kernel/src/table_changes/mod.rs b/kernel/src/table_changes/mod.rs index b5cf6f695..a5938082b 100644 --- a/kernel/src/table_changes/mod.rs +++ b/kernel/src/table_changes/mod.rs @@ -21,11 +21,16 @@ mod resolve_dvs; pub mod scan; mod scan_file; +static CHANGE_TYPE_COL_NAME: &str = "_change_type"; +static COMMIT_VERSION_COL_NAME: &str = "_commit_version"; +static COMMIT_TIMESTAMP_COL_NAME: &str = "_commit_timestamp"; +static ADD_CHANGE_TYPE: &str = "insert"; +static REMOVE_CHANGE_TYPE: &str = "delete"; static CDF_FIELDS: LazyLock<[StructField; 3]> = LazyLock::new(|| { [ - StructField::new("_change_type", DataType::STRING, false), - StructField::new("_commit_version", DataType::LONG, false), - 
StructField::new("_commit_timestamp", DataType::TIMESTAMP, false), + StructField::new(CHANGE_TYPE_COL_NAME, DataType::STRING, false), + StructField::new(COMMIT_VERSION_COL_NAME, DataType::LONG, false), + StructField::new(COMMIT_TIMESTAMP_COL_NAME, DataType::TIMESTAMP, false), ] }); diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index de5c9b134..e478698cd 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -5,7 +5,7 @@ use itertools::Itertools; use url::Url; use crate::actions::deletion_vector::split_vector; -use crate::expressions::{column_expr, Scalar}; +use crate::expressions::Scalar; use crate::scan::state::GlobalScanState; use crate::scan::{parse_partition_value, ColumnType, ScanResult}; use crate::schema::{ColumnName, DataType, SchemaRef, StructField, StructType}; @@ -13,35 +13,39 @@ use crate::{DeltaResult, Engine, Error, Expression, ExpressionRef, FileMeta}; use super::resolve_dvs::ResolvedCdfScanFile; use super::scan_file::{CdfScanFile, CdfScanFileType}; +use super::{ + ADD_CHANGE_TYPE, CHANGE_TYPE_COL_NAME, COMMIT_TIMESTAMP_COL_NAME, COMMIT_VERSION_COL_NAME, + REMOVE_CHANGE_TYPE, +}; /// Returns a map from change data feed column name to an expression that generates the row data. 
#[allow(unused)] -fn get_generated_columns(scan_file: &CdfScanFile) -> DeltaResult> { +fn get_cdf_columns(scan_file: &CdfScanFile) -> DeltaResult> { let timestamp = Scalar::timestamp_from_millis(scan_file.commit_timestamp)?; let version = scan_file.commit_version; let change_type: Expression = match scan_file.scan_type { - CdfScanFileType::Cdc => column_expr!("_change_type"), - CdfScanFileType::Add => "insert".into(), + CdfScanFileType::Cdc => Expression::from(CHANGE_TYPE_COL_NAME), + CdfScanFileType::Add => ADD_CHANGE_TYPE.into(), - CdfScanFileType::Remove => "delete".into(), + CdfScanFileType::Remove => REMOVE_CHANGE_TYPE.into(), }; let expressions = [ - ("_change_type", change_type), - ("_commit_version", Expression::literal(version)), - ("_commit_timestamp", timestamp.into()), + (CHANGE_TYPE_COL_NAME, change_type), + (COMMIT_VERSION_COL_NAME, Expression::literal(version)), + (COMMIT_TIMESTAMP_COL_NAME, timestamp.into()), ]; Ok(expressions.into_iter().collect()) } /// Generates the expression used to convert physical data from the `scan_file` path into logical -/// data matching the `global_state.logical_schema` +/// data matching the `logical_schema` #[allow(unused)] -fn get_expression( +fn physical_to_logical_expr( scan_file: &CdfScanFile, logical_schema: &StructType, all_fields: &[ColumnType], ) -> DeltaResult { - let mut generated_columns = get_generated_columns(scan_file)?; + let mut cdf_columns = get_cdf_columns(scan_file)?; let all_fields = all_fields .iter() .map(|field| match field { @@ -59,7 +63,7 @@ fn get_expression( } ColumnType::Selected(field_name) => { // Remove to take ownership - let generated_column = generated_columns.remove(field_name.as_str()); + let generated_column = cdf_columns.remove(field_name.as_str()); Ok(generated_column.unwrap_or_else(|| ColumnName::new([field_name]).into())) } }) @@ -69,9 +73,9 @@ fn get_expression( /// Gets the physical schema that will be used to read data in the `scan_file` path. 
#[allow(unused)] -fn get_read_schema(scan_file: &CdfScanFile, read_schema: &StructType) -> SchemaRef { +fn scan_file_read_schema(scan_file: &CdfScanFile, read_schema: &StructType) -> SchemaRef { if scan_file.scan_type == CdfScanFileType::Cdc { - let change_type = StructField::new("_change_type", DataType::STRING, false); + let change_type = StructField::new(CHANGE_TYPE_COL_NAME, DataType::STRING, false); let fields = read_schema.fields().cloned().chain(iter::once(change_type)); StructType::new(fields).into() } else { @@ -82,7 +86,7 @@ fn get_read_schema(scan_file: &CdfScanFile, read_schema: &StructType) -> SchemaR /// Reads the data at the `resolved_scan_file` and transforms the data from physical to logical. /// The result is a fallible iterator of [`ScanResult`] containing the logical data. #[allow(unused)] -pub(crate) fn read_scan_data( +pub(crate) fn read_scan_file( engine: &dyn Engine, resolved_scan_file: ResolvedCdfScanFile, global_state: &GlobalScanState, @@ -94,11 +98,12 @@ pub(crate) fn read_scan_data( mut selection_vector, } = resolved_scan_file; - let expression = get_expression(&scan_file, global_state.logical_schema.as_ref(), all_fields)?; - let schema = get_read_schema(&scan_file, global_state.read_schema.as_ref()); - let evaluator = engine.get_expression_handler().get_evaluator( - schema.clone(), - expression, + let phys_to_logical_expr = + physical_to_logical_expr(&scan_file, global_state.logical_schema.as_ref(), all_fields)?; + let read_schema = scan_file_read_schema(&scan_file, global_state.read_schema.as_ref()); + let phys_to_logical_eval = engine.get_expression_handler().get_evaluator( + read_schema.clone(), + phys_to_logical_expr, global_state.logical_schema.clone().into(), ); @@ -112,12 +117,12 @@ pub(crate) fn read_scan_data( let read_result_iter = engine .get_parquet_handler() - .read_parquet_files(&[file], schema, predicate)?; + .read_parquet_files(&[file], read_schema, predicate)?; let result = read_result_iter.map(move |batch| -> 
DeltaResult<_> { let batch = batch?; // to transform the physical data into the correct logical form - let logical = evaluator.evaluate(batch.as_ref()); + let logical = phys_to_logical_eval.evaluate(batch.as_ref()); let len = logical.as_ref().map_or(0, |res| res.len()); // need to split the dv_mask. what's left in dv_mask covers this result, and rest // will cover the following results. we `take()` out of `selection_vector` to avoid @@ -142,12 +147,15 @@ mod tests { use crate::expressions::{column_expr, Expression, Scalar}; use crate::scan::ColumnType; use crate::schema::{DataType, StructField, StructType}; + use crate::table_changes::physical_to_logical::physical_to_logical_expr; use crate::table_changes::scan_file::{CdfScanFile, CdfScanFileType}; - - use super::get_expression; + use crate::table_changes::{ + ADD_CHANGE_TYPE, CHANGE_TYPE_COL_NAME, COMMIT_TIMESTAMP_COL_NAME, COMMIT_VERSION_COL_NAME, + REMOVE_CHANGE_TYPE, + }; #[test] - fn add_get_expression() { + fn verify_physical_to_logical_expression() { let test = |scan_type, expected_expr| { let scan_file = CdfScanFile { scan_type, @@ -161,19 +169,20 @@ mod tests { let logical_schema = StructType::new([ StructField::new("id", DataType::STRING, true), StructField::new("age", DataType::LONG, false), - StructField::new("_change_type", DataType::STRING, false), - StructField::new("_commit_version", DataType::LONG, false), - StructField::new("_commit_timestamp", DataType::TIMESTAMP, false), + StructField::new(CHANGE_TYPE_COL_NAME, DataType::STRING, false), + StructField::new(COMMIT_VERSION_COL_NAME, DataType::LONG, false), + StructField::new(COMMIT_TIMESTAMP_COL_NAME, DataType::TIMESTAMP, false), ]); let all_fields = vec![ ColumnType::Selected("id".to_string()), ColumnType::Partition(1), - ColumnType::Selected("_change_type".to_string()), - ColumnType::Selected("_commit_version".to_string()), - ColumnType::Selected("_commit_timestamp".to_string()), + ColumnType::Selected(CHANGE_TYPE_COL_NAME.to_string()), + 
ColumnType::Selected(COMMIT_VERSION_COL_NAME.to_string()), + ColumnType::Selected(COMMIT_TIMESTAMP_COL_NAME.to_string()), ]; - let expression = get_expression(&scan_file, &logical_schema, &all_fields).unwrap(); - let expected = Expression::struct_from([ + let phys_to_logical_expr = + physical_to_logical_expr(&scan_file, &logical_schema, &all_fields).unwrap(); + let expected_expr = Expression::struct_from([ column_expr!("id"), Scalar::Long(20).into(), expected_expr, @@ -181,11 +190,11 @@ mod tests { Scalar::Timestamp(1234000).into(), // Microsecond is 1000x millisecond ]); - assert_eq!(expression, expected) + assert_eq!(phys_to_logical_expr, expected_expr) }; - test(CdfScanFileType::Add, "insert".into()); - test(CdfScanFileType::Remove, "delete".into()); - test(CdfScanFileType::Cdc, column_expr!("_change_type")); + test(CdfScanFileType::Add, ADD_CHANGE_TYPE.into()); + test(CdfScanFileType::Remove, REMOVE_CHANGE_TYPE.into()); + test(CdfScanFileType::Cdc, Expression::from(CHANGE_TYPE_COL_NAME)); } } diff --git a/kernel/src/table_changes/scan.rs b/kernel/src/table_changes/scan.rs index d137d8e7c..37ba7a487 100644 --- a/kernel/src/table_changes/scan.rs +++ b/kernel/src/table_changes/scan.rs @@ -213,6 +213,9 @@ mod tests { use crate::expressions::{column_expr, Scalar}; use crate::scan::ColumnType; use crate::schema::{DataType, StructField, StructType}; + use crate::table_changes::{ + CHANGE_TYPE_COL_NAME, COMMIT_TIMESTAMP_COL_NAME, COMMIT_VERSION_COL_NAME, + }; use crate::{Expression, Table}; #[test] @@ -231,9 +234,9 @@ mod tests { vec![ ColumnType::Selected("part".to_string()), ColumnType::Selected("id".to_string()), - ColumnType::Selected("_change_type".to_string()), - ColumnType::Selected("_commit_version".to_string()), - ColumnType::Selected("_commit_timestamp".to_string()), + ColumnType::Selected(CHANGE_TYPE_COL_NAME.to_string()), + ColumnType::Selected(COMMIT_VERSION_COL_NAME.to_string()), + ColumnType::Selected(COMMIT_TIMESTAMP_COL_NAME.to_string()), ] ); 
assert_eq!(scan.predicate, None); @@ -251,7 +254,7 @@ mod tests { let schema = table_changes .schema() - .project(&["id", "_commit_version"]) + .project(&["id", COMMIT_VERSION_COL_NAME]) .unwrap(); let predicate = Arc::new(Expression::gt(column_expr!("id"), Scalar::from(10))); let scan = table_changes @@ -264,14 +267,14 @@ mod tests { scan.all_fields, vec![ ColumnType::Selected("id".to_string()), - ColumnType::Selected("_commit_version".to_string()), + ColumnType::Selected(COMMIT_VERSION_COL_NAME.to_string()), ] ); assert_eq!( scan.logical_schema, StructType::new([ StructField::new("id", DataType::INTEGER, true), - StructField::new("_commit_version", DataType::LONG, false), + StructField::new(COMMIT_VERSION_COL_NAME, DataType::LONG, false), ]) .into() ); From 65bce5fb7706733291c794f9a1d794cc5e68fe25 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Mon, 9 Dec 2024 17:39:23 -0800 Subject: [PATCH 50/56] Remove read_scan_data --- .../src/table_changes/physical_to_logical.rs | 57 ------------------- 1 file changed, 57 deletions(-) diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index e478698cd..6918b6a36 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -83,63 +83,6 @@ fn scan_file_read_schema(scan_file: &CdfScanFile, read_schema: &StructType) -> S } } -/// Reads the data at the `resolved_scan_file` and transforms the data from physical to logical. -/// The result is a fallible iterator of [`ScanResult`] containing the logical data. 
-#[allow(unused)] -pub(crate) fn read_scan_file( - engine: &dyn Engine, - resolved_scan_file: ResolvedCdfScanFile, - global_state: &GlobalScanState, - all_fields: &[ColumnType], - predicate: Option, -) -> DeltaResult>> { - let ResolvedCdfScanFile { - scan_file, - mut selection_vector, - } = resolved_scan_file; - - let phys_to_logical_expr = - physical_to_logical_expr(&scan_file, global_state.logical_schema.as_ref(), all_fields)?; - let read_schema = scan_file_read_schema(&scan_file, global_state.read_schema.as_ref()); - let phys_to_logical_eval = engine.get_expression_handler().get_evaluator( - read_schema.clone(), - phys_to_logical_expr, - global_state.logical_schema.clone().into(), - ); - - let table_root = Url::parse(&global_state.table_root)?; - let location = table_root.join(&scan_file.path)?; - let file = FileMeta { - last_modified: 0, - size: 0, - location, - }; - let read_result_iter = - engine - .get_parquet_handler() - .read_parquet_files(&[file], read_schema, predicate)?; - - let result = read_result_iter.map(move |batch| -> DeltaResult<_> { - let batch = batch?; - // to transform the physical data into the correct logical form - let logical = phys_to_logical_eval.evaluate(batch.as_ref()); - let len = logical.as_ref().map_or(0, |res| res.len()); - // need to split the dv_mask. what's left in dv_mask covers this result, and rest - // will cover the following results. we `take()` out of `selection_vector` to avoid - // trying to return a captured variable. 
We're going to reassign `selection_vector` - // to `rest` in a moment anyway - let mut sv = selection_vector.take(); - let rest = split_vector(sv.as_mut(), len, None); - let result = ScanResult { - raw_data: logical, - raw_mask: sv, - }; - selection_vector = rest; - Ok(result) - }); - Ok(result) -} - #[cfg(test)] mod tests { use std::collections::HashMap; From bea39ba13afe0b8566043ff1d6757a3e015be38f Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Mon, 9 Dec 2024 17:48:43 -0800 Subject: [PATCH 51/56] fix compiler warnings --- kernel/src/expressions/scalars.rs | 1 - kernel/src/table_changes/physical_to_logical.rs | 8 ++------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs index ac1779f95..2836b10da 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -474,7 +474,6 @@ impl PrimitiveType { mod tests { use core::panic; use std::f32::consts::PI; - use std::time::Duration; use crate::expressions::{column_expr, BinaryOperator}; use crate::Expression; diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index 6918b6a36..415d061fb 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -2,16 +2,12 @@ use std::collections::HashMap; use std::iter; use itertools::Itertools; -use url::Url; -use crate::actions::deletion_vector::split_vector; use crate::expressions::Scalar; -use crate::scan::state::GlobalScanState; -use crate::scan::{parse_partition_value, ColumnType, ScanResult}; +use crate::scan::{parse_partition_value, ColumnType}; use crate::schema::{ColumnName, DataType, SchemaRef, StructField, StructType}; -use crate::{DeltaResult, Engine, Error, Expression, ExpressionRef, FileMeta}; +use crate::{DeltaResult, Error, Expression}; -use super::resolve_dvs::ResolvedCdfScanFile; use super::scan_file::{CdfScanFile, CdfScanFileType}; use 
super::{ ADD_CHANGE_TYPE, CHANGE_TYPE_COL_NAME, COMMIT_TIMESTAMP_COL_NAME, COMMIT_VERSION_COL_NAME, From 5622874179c42b0ac25e07de256f5c093cfe7c43 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Mon, 9 Dec 2024 17:56:48 -0800 Subject: [PATCH 52/56] fix test --- kernel/src/table_changes/physical_to_logical.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index 415d061fb..757b7a182 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -20,9 +20,8 @@ fn get_cdf_columns(scan_file: &CdfScanFile) -> DeltaResult Expression::from(CHANGE_TYPE_COL_NAME), + CdfScanFileType::Cdc => Expression::column([CHANGE_TYPE_COL_NAME]), CdfScanFileType::Add => ADD_CHANGE_TYPE.into(), - CdfScanFileType::Remove => REMOVE_CHANGE_TYPE.into(), }; let expressions = [ @@ -132,8 +131,9 @@ mod tests { assert_eq!(phys_to_logical_expr, expected_expr) }; + let cdc_change_type = Expression::column([CHANGE_TYPE_COL_NAME]); test(CdfScanFileType::Add, ADD_CHANGE_TYPE.into()); test(CdfScanFileType::Remove, REMOVE_CHANGE_TYPE.into()); - test(CdfScanFileType::Cdc, Expression::from(CHANGE_TYPE_COL_NAME)); + test(CdfScanFileType::Cdc, cdc_change_type); } } From 221b96f51ffc4c4fd00a47053ce182108a98155f Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Mon, 9 Dec 2024 19:22:05 -0800 Subject: [PATCH 53/56] Switch to no timezone --- kernel/src/expressions/scalars.rs | 6 +++--- kernel/src/table_changes/physical_to_logical.rs | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/src/expressions/scalars.rs b/kernel/src/expressions/scalars.rs index 2836b10da..26ef98a2c 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -152,14 +152,14 @@ impl Scalar { matches!(self, Self::Null(_)) } - /// Constructs a Scalar timestamp from an `i64` millisecond since unix epoch - 
pub(crate) fn timestamp_from_millis(millis: i64) -> DeltaResult { + /// Constructs a Scalar timestamp with no timezone from an `i64` millisecond since unix epoch + pub(crate) fn timestamp_ntz_from_millis(millis: i64) -> DeltaResult { let Some(timestamp) = DateTime::from_timestamp_millis(millis) else { return Err(Error::generic(format!( "Failed to create millisecond timestamp from {millis}" ))); }; - Ok(Self::Timestamp(timestamp.timestamp_micros())) + Ok(Self::TimestampNtz(timestamp.timestamp_micros())) } } diff --git a/kernel/src/table_changes/physical_to_logical.rs b/kernel/src/table_changes/physical_to_logical.rs index 757b7a182..7232e2cf8 100644 --- a/kernel/src/table_changes/physical_to_logical.rs +++ b/kernel/src/table_changes/physical_to_logical.rs @@ -17,7 +17,7 @@ use super::{ /// Returns a map from change data feed column name to an expression that generates the row data. #[allow(unused)] fn get_cdf_columns(scan_file: &CdfScanFile) -> DeltaResult> { - let timestamp = Scalar::timestamp_from_millis(scan_file.commit_timestamp)?; + let timestamp = Scalar::timestamp_ntz_from_millis(scan_file.commit_timestamp)?; let version = scan_file.commit_version; let change_type: Expression = match scan_file.scan_type { CdfScanFileType::Cdc => Expression::column([CHANGE_TYPE_COL_NAME]), @@ -125,7 +125,7 @@ mod tests { Scalar::Long(20).into(), expected_expr, Expression::literal(42i64), - Scalar::Timestamp(1234000).into(), // Microsecond is 1000x millisecond + Scalar::TimestampNtz(1234000).into(), // Microsecond is 1000x millisecond ]); assert_eq!(phys_to_logical_expr, expected_expr) From 857d6446e2626876b3e1afb8d4155c7c08a112db Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Mon, 9 Dec 2024 21:32:23 -0800 Subject: [PATCH 54/56] Address pr comments --- kernel/src/expressions/scalars.rs | 1 - kernel/src/table_changes/scan.rs | 14 ++++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/kernel/src/expressions/scalars.rs 
b/kernel/src/expressions/scalars.rs index 26ef98a2c..a2476a990 100644 --- a/kernel/src/expressions/scalars.rs +++ b/kernel/src/expressions/scalars.rs @@ -472,7 +472,6 @@ impl PrimitiveType { #[cfg(test)] mod tests { - use core::panic; use std::f32::consts::PI; use crate::expressions::{column_expr, BinaryOperator}; diff --git a/kernel/src/table_changes/scan.rs b/kernel/src/table_changes/scan.rs index 37ba7a487..980374177 100644 --- a/kernel/src/table_changes/scan.rs +++ b/kernel/src/table_changes/scan.rs @@ -213,9 +213,7 @@ mod tests { use crate::expressions::{column_expr, Scalar}; use crate::scan::ColumnType; use crate::schema::{DataType, StructField, StructType}; - use crate::table_changes::{ - CHANGE_TYPE_COL_NAME, COMMIT_TIMESTAMP_COL_NAME, COMMIT_VERSION_COL_NAME, - }; + use crate::table_changes::COMMIT_VERSION_COL_NAME; use crate::{Expression, Table}; #[test] @@ -234,9 +232,9 @@ mod tests { vec![ ColumnType::Selected("part".to_string()), ColumnType::Selected("id".to_string()), - ColumnType::Selected(CHANGE_TYPE_COL_NAME.to_string()), - ColumnType::Selected(COMMIT_VERSION_COL_NAME.to_string()), - ColumnType::Selected(COMMIT_TIMESTAMP_COL_NAME.to_string()), + ColumnType::Selected("_change_type".to_string()), + ColumnType::Selected("_commit_version".to_string()), + ColumnType::Selected("_commit_timestamp".to_string()), ] ); assert_eq!(scan.predicate, None); @@ -267,14 +265,14 @@ mod tests { scan.all_fields, vec![ ColumnType::Selected("id".to_string()), - ColumnType::Selected(COMMIT_VERSION_COL_NAME.to_string()), + ColumnType::Selected("_commit_version".to_string()), ] ); assert_eq!( scan.logical_schema, StructType::new([ StructField::new("id", DataType::INTEGER, true), - StructField::new(COMMIT_VERSION_COL_NAME, DataType::LONG, false), + StructField::new("_commit_version", DataType::LONG, false), ]) .into() ); From e3031c3146ad03cf0b4dd661ade1561d1b5a16db Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Mon, 9 Dec 2024 21:46:11 -0800 Subject: [PATCH 55/56] 
Remove unneeded changes --- kernel/src/scan/mod.rs | 2 +- kernel/src/table_changes/resolve_dvs.rs | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 41f65e619..b39760669 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -160,7 +160,7 @@ impl ScanResult { /// store the name of the column, as that's all that's needed during the actual query. For /// `Partition` we store an index into the logical schema for this query since later we need the /// data type as well to materialize the partition column. -#[derive(Clone, PartialEq, Debug)] +#[derive(PartialEq, Debug)] pub enum ColumnType { // A column, selected from the data, as is Selected(String), diff --git a/kernel/src/table_changes/resolve_dvs.rs b/kernel/src/table_changes/resolve_dvs.rs index 1f6a301fd..caa2cf310 100644 --- a/kernel/src/table_changes/resolve_dvs.rs +++ b/kernel/src/table_changes/resolve_dvs.rs @@ -8,15 +8,15 @@ use crate::{DeltaResult, Engine, Error}; /// A [`CdfScanFile`] with its associated `selection_vector`. The `scan_type` is resolved to /// match the `_change_type` that its rows will have in the change data feed. #[allow(unused)] -pub(crate) struct ResolvedCdfScanFile { +struct ResolvedCdfScanFile { /// The scan file that holds the path the data file to be read. The `scan_type` field is /// resolved to the `_change_type` of the rows for this data file. - pub(crate) scan_file: CdfScanFile, + scan_file: CdfScanFile, /// Optional vector of bools. If `selection_vector[i] = true`, then that row must be included /// in the CDF output. Otherwise the row must be filtered out. The vector may be shorter than /// the data file. In this case, all the remaining rows are *not* selected. If `selection_vector` /// is `None`, then all rows are selected. - pub(crate) selection_vector: Option>, + selection_vector: Option>, } /// Resolves the deletion vectors for a [`CdfScanFile`]. 
This function handles two @@ -34,7 +34,7 @@ pub(crate) struct ResolvedCdfScanFile { /// read the deletion vector (if present), and each is converted into a [`ResolvedCdfScanFile`]. /// No changes are made to the `scan_type`. #[allow(unused)] -pub(crate) fn resolve_scan_file_dv( +fn resolve_scan_file_dv( engine: &dyn Engine, table_root: &Url, scan_file: CdfScanFile, From 88df7fad1cac218f7a745a4b7030f42366e520f1 Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Mon, 9 Dec 2024 21:47:26 -0800 Subject: [PATCH 56/56] make raw mask private --- kernel/src/scan/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index b39760669..f03d62cc9 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -125,7 +125,7 @@ pub struct ScanResult { pub raw_data: DeltaResult>, /// Raw row mask. // TODO(nick) this should be allocated by the engine - pub(crate) raw_mask: Option>, + raw_mask: Option>, } impl ScanResult {