From 1b7e1883510d978b2d39c7003a51ac978db84735 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 5 Feb 2025 16:23:57 -0800 Subject: [PATCH 01/37] mvp, todo: tests --- kernel/src/actions/mod.rs | 8 +- kernel/src/log_segment.rs | 153 ++++++++++++++++++++++++++++-- kernel/src/log_segment/tests.rs | 6 +- kernel/src/path.rs | 19 ++-- kernel/src/scan/mod.rs | 4 +- kernel/src/schema/mod.rs | 4 + kernel/src/table_configuration.rs | 4 +- kernel/src/table_features/mod.rs | 7 ++ kernel/tests/golden_tables.rs | 5 +- 9 files changed, 177 insertions(+), 33 deletions(-) diff --git a/kernel/src/actions/mod.rs b/kernel/src/actions/mod.rs index 8bcb5df50..3e7d36438 100644 --- a/kernel/src/actions/mod.rs +++ b/kernel/src/actions/mod.rs @@ -783,7 +783,7 @@ mod tests { } #[test] - fn test_v2_checkpoint_unsupported() { + fn test_v2_checkpoint_supported() { let protocol = Protocol::try_new( 3, 7, @@ -791,7 +791,7 @@ mod tests { Some([ReaderFeatures::V2Checkpoint]), ) .unwrap(); - assert!(protocol.ensure_read_supported().is_err()); + assert!(protocol.ensure_read_supported().is_ok()); let protocol = Protocol::try_new( 4, @@ -821,7 +821,7 @@ mod tests { Some(&empty_features), ) .unwrap(); - assert!(protocol.ensure_read_supported().is_err()); + assert!(protocol.ensure_read_supported().is_ok()); let protocol = Protocol::try_new( 3, @@ -839,7 +839,7 @@ mod tests { Some([WriterFeatures::V2Checkpoint]), ) .unwrap(); - assert!(protocol.ensure_read_supported().is_err()); + assert!(protocol.ensure_read_supported().is_ok()); let protocol = Protocol { min_reader_version: 1, diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs index b4f255c57..848aa49b7 100644 --- a/kernel/src/log_segment.rs +++ b/kernel/src/log_segment.rs @@ -1,14 +1,20 @@ //! Represents a segment of a delta log. [`LogSegment`] wraps a set of checkpoint and commit //! files. -use crate::actions::{get_log_schema, Metadata, Protocol, METADATA_NAME, PROTOCOL_NAME}; +use crate::actions::visitors::SidecarVisitor; +use crate::actions::{ + get_log_add_schema, get_log_schema, Metadata, Protocol, Sidecar, ADD_NAME, METADATA_NAME, + PROTOCOL_NAME, SIDECAR_NAME, +}; use crate::path::{LogPathFileType, ParsedLogPath}; use crate::schema::SchemaRef; use crate::snapshot::CheckpointMetadata; use crate::utils::require; use crate::{ - DeltaResult, Engine, EngineData, Error, Expression, ExpressionRef, FileSystemClient, Version, + DeltaResult, Engine, EngineData, Error, Expression, ExpressionRef, FileDataReadResultIterator, + FileMeta, FileSystemClient, ParquetHandler, RowVisitor, Version, }; +use itertools::Either::{Left, Right}; use itertools::Itertools; use std::collections::HashMap; use std::convert::identity; @@ -218,14 +224,149 @@ impl LogSegment { .iter() .map(|f| f.location.clone()) .collect(); - let checkpoint_stream = engine - .get_parquet_handler() - .read_parquet_files(&checkpoint_parts, checkpoint_read_schema, meta_predicate)? - .map_ok(|batch| (batch, false)); + let checkpoint_stream = if checkpoint_parts.is_empty() { + Left(None.into_iter()) + } else { + Right(Self::create_checkpoint_stream( + self, + engine, + checkpoint_read_schema, + meta_predicate, + checkpoint_parts, + )?) 
+ };

        Ok(commit_stream.chain(checkpoint_stream))
    }

+    fn create_checkpoint_stream(
+        &self,
+        engine: &dyn Engine,
+        checkpoint_read_schema: SchemaRef,
+        meta_predicate: Option<ExpressionRef>,
+        checkpoint_parts: Vec<FileMeta>,
+    ) -> DeltaResult<impl Iterator<Item = DeltaResult<(Box<dyn EngineData>, bool)>> + Send> {
+        // We checked that checkpoint_parts is not empty before calling this function
+        require!(
+            !checkpoint_parts.is_empty(),
+            Error::generic("Checkpoint parts should not be empty")
+        );
+        let is_json_checkpoint = checkpoint_parts[0].location.path().ends_with(".json");
+
+        let actions = Self::read_checkpoint_files(
+            engine,
+            checkpoint_parts,
+            checkpoint_read_schema.clone(),
+            meta_predicate,
+            is_json_checkpoint,
+        )?;
+
+        let need_file_actions = checkpoint_read_schema.contains(ADD_NAME)
+            && checkpoint_read_schema.contains(SIDECAR_NAME);
+
+        require!(
+            !(need_file_actions && !checkpoint_read_schema.contains(SIDECAR_NAME)),
+            Error::generic(
+                "If the checkpoint read schema contains file actions, it must contain the sidecar column"
+            )
+        );
+
+        let is_multi_part_checkpoint = self.checkpoint_parts.len() > 1; //maybe not self?
+        let log_root = self.log_root.clone();
+        let parquet_handler = engine.get_parquet_handler().clone();
+
+        // If the schema does not contain add/remove actions (e.g., metadata-only replay),
+        // then we return the checkpoint batch directly.
+        // - Multi-part checkpoints never have sidecar actions, so they are returned as-is.
+        // - Otherwise, we check for sidecar references and replace checkpoint batches accordingly.
+        let checkpoint_stream = if need_file_actions && !is_multi_part_checkpoint {
+            Left(
+                actions
+                    // Flatten the new batches returned. The new batches could be:
+                    // - the checkpoint batch itself if no sidecar actions are present in the batch
+                    // - 1 or more sidecar batches referenced in the checkpoint batch by sidecar actions
+                    .flat_map(move |batch_result| match batch_result {
+                        Ok(checkpoint_batch) => Right(
+                            Self::process_checkpoint_batch(
+                                parquet_handler.clone(),
+                                log_root.clone(),
+                                checkpoint_batch,
+                            )
+                            .map_or_else(|e| Left(std::iter::once(Err(e))), Right)
+                            .map_ok(|batch| (batch, false)),
+                        ),
+                        Err(e) => Left(std::iter::once(Err(e))),
+                    }),
+            )
+        } else {
+            Right(actions.map_ok(|batch| (batch, false)))
+        };
+
+        Ok(checkpoint_stream)
+    }
+
+    fn process_checkpoint_batch(
+        parquet_handler: Arc<dyn ParquetHandler>,
+        log_root: Url,
+        batch: Box<dyn EngineData>,
+    ) -> DeltaResult<impl Iterator<Item = DeltaResult<Box<dyn EngineData>>> + Send> {
+        let mut visitor = SidecarVisitor::default();
+
+        // Collect sidecars
+        visitor.visit_rows_of(batch.as_ref())?;
+
+        // If there are no sidecars, return the batch as is
+        if visitor.sidecars.is_empty() {
+            return Ok(Left(std::iter::once(Ok(batch))));
+        }
+
+        // Convert sidecar actions to sidecar file paths
+        let sidecar_files: Result<Vec<_>, _> = visitor
+            .sidecars
+            .iter()
+            .map(|sidecar| Self::sidecar_to_filemeta(sidecar, &log_root))
+            .collect();
+
+        let sidecar_read_schema = get_log_add_schema().clone();
+
+        // If sidecar files exist, read them and return the iterator of sidecar batches
+        // to replace the checkpoint batch in the top-level iterator
+        Ok(Right(parquet_handler.read_parquet_files(
+            &sidecar_files?,
+            sidecar_read_schema,
+            None,
+        )?))
+    }
+
+    // Helper function to read checkpoint files based on the file type
+    fn read_checkpoint_files(
+        engine: &dyn Engine,
+        checkpoint_parts: Vec<FileMeta>,
+        schema: SchemaRef,
+        predicate: Option<ExpressionRef>,
+        is_json: bool,
+    ) -> DeltaResult<FileDataReadResultIterator> {
+        if is_json {
+            engine
+                .get_json_handler()
+                .read_json_files(&checkpoint_parts, schema, predicate)
+        } else {
+            engine
+                .get_parquet_handler()
+
.read_parquet_files(&checkpoint_parts, schema, predicate)
+        }
+    }
+
+    // Helper function to convert a single sidecar action to a FileMeta
+    fn sidecar_to_filemeta(sidecar: &Sidecar, log_root: &Url) -> Result<FileMeta, Error> {
+        let location = log_root.join("_sidecars/")?.join(&sidecar.path)?;
+        Ok(FileMeta {
+            location,
+            last_modified: sidecar.modification_time,
+            size: sidecar.size_in_bytes as usize,
+        })
+    }
+
     // Get the most up-to-date Protocol and Metadata actions
     pub(crate) fn read_metadata(&self, engine: &dyn Engine) -> DeltaResult<(Metadata, Protocol)> {
         let data_batches = self.replay_for_metadata(engine)?;
diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs
index 5db1c4581..59e6837ef 100644
--- a/kernel/src/log_segment/tests.rs
+++ b/kernel/src/log_segment/tests.rs
@@ -108,7 +108,7 @@ fn build_log_with_paths_and_checkpoint(
 }

 #[test]
-fn build_snapshot_with_unsupported_uuid_checkpoint() {
+fn build_snapshot_with_uuid_checkpoint() {
     let (client, log_root) = build_log_with_paths_and_checkpoint(
         &[
             delta_path_for_version(0, "json"),
@@ -129,10 +129,10 @@ fn build_snapshot_with_unsupported_uuid_checkpoint() {
     let checkpoint_parts = log_segment.checkpoint_parts;

     assert_eq!(checkpoint_parts.len(), 1);
-    assert_eq!(checkpoint_parts[0].version, 3);
+    assert_eq!(checkpoint_parts[0].version, 5);

     let versions = commit_files.into_iter().map(|x| x.version).collect_vec();
-    let expected_versions = vec![4, 5, 6, 7];
+    let expected_versions = vec![6, 7];
     assert_eq!(versions, expected_versions);
 }

diff --git a/kernel/src/path.rs b/kernel/src/path.rs
index 23e7819de..58fbba657 100644
--- a/kernel/src/path.rs
+++ b/kernel/src/path.rs
@@ -166,7 +166,9 @@ impl ParsedLogPath {
         // TODO: Include UuidCheckpoint once we actually support v2 checkpoints
         matches!(
             self.file_type,
-            LogPathFileType::SinglePartCheckpoint | LogPathFileType::MultiPartCheckpoint { .. }
+            LogPathFileType::SinglePartCheckpoint
+                | LogPathFileType::MultiPartCheckpoint { .. }
+                | LogPathFileType::UuidCheckpoint(_)
         )
     }

@@ -175,10 +177,7 @@ impl ParsedLogPath {
     #[allow(dead_code)] // currently only used in tests, which don't "count"
     fn is_unknown(&self) -> bool {
         // TODO: Stop treating UuidCheckpoint as unknown once we support v2 checkpoints
-        matches!(
-            self.file_type,
-            LogPathFileType::Unknown | LogPathFileType::UuidCheckpoint(_)
-        )
+        matches!(self.file_type, LogPathFileType::Unknown)
     }
 }

@@ -357,10 +356,7 @@ mod tests {
             LogPathFileType::UuidCheckpoint(ref u) if u == "3a0d65cd-4056-49b8-937b-95f9e3ee90e5",
         ));
         assert!(!log_path.is_commit());
-
-        // TODO: Support v2 checkpoints! Until then we can't treat these as checkpoint files.
-        assert!(!log_path.is_checkpoint());
-        assert!(log_path.is_unknown());
+        assert!(log_path.is_checkpoint());

         let log_path = table_log_dir
             .join("00000000000000000002.checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.json")
@@ -377,10 +373,7 @@ mod tests {
             LogPathFileType::UuidCheckpoint(ref u) if u == "3a0d65cd-4056-49b8-937b-95f9e3ee90e5",
         ));
         assert!(!log_path.is_commit());
-
-        // TODO: Support v2 checkpoints! Until then we can't treat these as checkpoint files.
-        assert!(!log_path.is_checkpoint());
-        assert!(log_path.is_unknown());
+        assert!(log_path.is_checkpoint());

         let log_path = table_log_dir
             .join("00000000000000000002.checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.foo")
diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs
index 14e2ee50f..610bb3d40 100644
--- a/kernel/src/scan/mod.rs
+++ b/kernel/src/scan/mod.rs
@@ -11,7 +11,7 @@ use url::Url;
 use crate::actions::deletion_vector::{
     deletion_treemap_to_bools, split_vector, DeletionVectorDescriptor,
 };
-use crate::actions::{get_log_add_schema, get_log_schema, ADD_NAME, REMOVE_NAME};
+use crate::actions::{get_log_schema, ADD_NAME, REMOVE_NAME, SIDECAR_NAME};
 use crate::expressions::{ColumnName, Expression, ExpressionRef, ExpressionTransform, Scalar};
 use crate::predicates::{DefaultPredicateEvaluator, EmptyColumnResolver};
 use crate::scan::state::{DvInfo, Stats};
@@ -428,7 +428,7 @@ impl Scan {
         engine: &dyn Engine,
     ) -> DeltaResult<impl Iterator<Item = DeltaResult<(Box<dyn EngineData>, bool)>> + Send> {
         let commit_read_schema = get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?;
-        let checkpoint_read_schema = get_log_add_schema().clone();
+        let checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?;

         // NOTE: We don't pass any meta-predicate because we expect no meaningful row group skipping
         // when ~every checkpoint file will contain the adds and removes we are looking for.
diff --git a/kernel/src/schema/mod.rs b/kernel/src/schema/mod.rs
index 6086a7031..11befb34a 100644
--- a/kernel/src/schema/mod.rs
+++ b/kernel/src/schema/mod.rs
@@ -286,6 +286,10 @@ impl StructType {
         self.fields.values()
     }

+    pub fn contains(&self, name: impl AsRef<str>) -> bool {
+        self.fields.contains_key(name.as_ref())
+    }
+
     /// Extracts the name and type of all leaf columns, in schema order. Caller should pass Some
     /// `own_name` if this schema is embedded in a larger struct (e.g. `add.*`) and None if the
     /// schema is a top-level result (e.g. `*`).
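The `contains` helper added above is what the checkpoint-replay code in this series uses to gate sidecar handling on the projected read schema. A minimal sketch of that gating logic, using crate-internal names from these patches (the wrapper function itself is illustrative, not part of the series):

    use crate::actions::{get_log_schema, ADD_NAME, SIDECAR_NAME};
    use crate::DeltaResult;

    // Hypothetical helper: decide whether a checkpoint read needs a
    // sidecar-resolution pass, based on which columns were projected.
    fn needs_sidecar_pass() -> DeltaResult<bool> {
        let schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?;
        // File actions are requested iff the projection kept the add column...
        let need_file_actions = schema.contains(ADD_NAME);
        // ...and a schema that requests file actions must also keep the sidecar
        // column, because a v2 checkpoint may reference sidecar files instead of
        // inlining its add actions.
        Ok(need_file_actions && schema.contains(SIDECAR_NAME))
    }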
diff --git a/kernel/src/table_configuration.rs b/kernel/src/table_configuration.rs
index 565546d52..22447554b 100644
--- a/kernel/src/table_configuration.rs
+++ b/kernel/src/table_configuration.rs
@@ -262,8 +262,8 @@ mod test {
         let protocol = Protocol::try_new(
             3,
             7,
-            Some([ReaderFeatures::V2Checkpoint]),
-            Some([WriterFeatures::V2Checkpoint]),
+            Some([ReaderFeatures::UnsupportedFeature]),
+            Some([WriterFeatures::UnsupportedFeature]),
         )
         .unwrap();
         let table_root = Url::try_from("file:///").unwrap();
diff --git a/kernel/src/table_features/mod.rs b/kernel/src/table_features/mod.rs
index ee27fc17e..bc5499501 100644
--- a/kernel/src/table_features/mod.rs
+++ b/kernel/src/table_features/mod.rs
@@ -48,6 +48,8 @@ pub enum ReaderFeatures {
     /// vacuumProtocolCheck ReaderWriter feature ensures consistent application of reader and writer
     /// protocol checks during VACUUM operations
     VacuumProtocolCheck,
+    /// A dummy variant used to represent an unsupported feature for testing purposes
+    UnsupportedFeature,
 }

 /// Similar to reader features, writer features communicate capabilities that must be implemented
@@ -109,6 +111,8 @@ pub enum WriterFeatures {
     /// vacuumProtocolCheck ReaderWriter feature ensures consistent application of reader and writer
     /// protocol checks during VACUUM operations
     VacuumProtocolCheck,
+    /// A dummy variant used to represent an unsupported feature for testing purposes
+    UnsupportedFeature,
 }

 impl From<WriterFeatures> for String {
@@ -133,6 +137,7 @@ pub(crate) static SUPPORTED_READER_FEATURES: LazyLock<HashSet<ReaderFeatures>> =
         ReaderFeatures::TypeWidening,
         ReaderFeatures::TypeWideningPreview,
         ReaderFeatures::VacuumProtocolCheck,
+        ReaderFeatures::V2Checkpoint,
     ])
 });

@@ -154,6 +159,7 @@ mod tests {
             (ReaderFeatures::TypeWideningPreview, "typeWidening-preview"),
             (ReaderFeatures::V2Checkpoint, "v2Checkpoint"),
             (ReaderFeatures::VacuumProtocolCheck, "vacuumProtocolCheck"),
+            (ReaderFeatures::UnsupportedFeature, "unsupportedFeature"),
         ];

         assert_eq!(ReaderFeatures::VARIANTS.len(), cases.len());
@@ -192,6 +198,7 @@ mod tests {
             (WriterFeatures::IcebergCompatV1, "icebergCompatV1"),
             (WriterFeatures::IcebergCompatV2, "icebergCompatV2"),
             (WriterFeatures::VacuumProtocolCheck, "vacuumProtocolCheck"),
+            (WriterFeatures::UnsupportedFeature, "unsupportedFeature"),
         ];

         assert_eq!(WriterFeatures::VARIANTS.len(), cases.len());
diff --git a/kernel/tests/golden_tables.rs b/kernel/tests/golden_tables.rs
index 120271ef2..a7c271432 100644
--- a/kernel/tests/golden_tables.rs
+++ b/kernel/tests/golden_tables.rs
@@ -409,9 +409,8 @@ golden_test!("time-travel-start", latest_snapshot_test);
 golden_test!("time-travel-start-start20", latest_snapshot_test);
 golden_test!("time-travel-start-start20-start40", latest_snapshot_test);

-skip_test!("v2-checkpoint-json": "v2 checkpoint not supported");
-skip_test!("v2-checkpoint-parquet": "v2 checkpoint not supported");
-
+golden_test!("v2-checkpoint-json", latest_snapshot_test);
+golden_test!("v2-checkpoint-parquet", latest_snapshot_test);
 // BUG:
 // - AddFile: 'file:/some/unqualified/absolute/path'
 // - RemoveFile: '/some/unqualified/absolute/path'

From cfd5f4f9b31460fd6234bddfd6162f8b33136f36 Mon Sep 17 00:00:00 2001
From: sebastian tia
Date: Thu, 6 Feb 2025 14:07:01 -0800
Subject: [PATCH 02/37] refactor

---
 kernel/src/log_segment.rs | 87 +++++++++++++++++++++------------------
 1 file changed, 47 insertions(+), 40 deletions(-)

diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs
index 848aa49b7..74985996d 100644
--- a/kernel/src/log_segment.rs
+++ b/kernel/src/log_segment.rs
@@ -239,6
+239,16 @@ impl LogSegment { Ok(commit_stream.chain(checkpoint_stream)) } + /// Returns an iterator over checkpoint data, processing sidecar files when necessary. + /// + /// Checkpoint data is returned directly if: + /// - Processing multi-part checkpoints + /// - Schema doesn't contain file actions + /// + /// For single-part checkpoints, any referenced sidecar files are processed. These + /// sidecar files contain the actual add/remove actions that would otherwise be + /// stored directly in the checkpoint, providing a more efficient storage mechanism + /// for large change sets. fn create_checkpoint_stream( &self, engine: &dyn Engine, @@ -251,8 +261,17 @@ impl LogSegment { !checkpoint_parts.is_empty(), Error::generic("Checkpoint parts should not be empty") ); - let is_json_checkpoint = checkpoint_parts[0].location.path().ends_with(".json"); + let need_file_actions = checkpoint_read_schema.contains(ADD_NAME) + && checkpoint_read_schema.contains(SIDECAR_NAME); + require!( + !need_file_actions || checkpoint_read_schema.contains(SIDECAR_NAME), + Error::generic( + "If the checkpoint read schema contains file actions, it must contain the sidecar column" + ) + ); + + let is_json_checkpoint = checkpoint_parts[0].location.path().ends_with(".json"); let actions = Self::read_checkpoint_files( engine, checkpoint_parts, @@ -261,51 +280,39 @@ impl LogSegment { is_json_checkpoint, )?; - let need_file_actions = checkpoint_read_schema.contains(ADD_NAME) - && checkpoint_read_schema.contains(SIDECAR_NAME); - - require!( - !(need_file_actions && !checkpoint_read_schema.contains(SIDECAR_NAME)), - Error::generic( - "If the checkpoint read schema contains file actions, it must contain the sidecar column" - ) - ); + // 1. In the case where the schema does not contain add/remove actions, we return the checkpoint + // batch directly as sidecar files only have to be read when the schema contains add/remove actions. + // 2. Multi-part checkpoint batches never have sidecar actions, so the batch is returned as-is. + if !need_file_actions || self.checkpoint_parts.len() > 1 { + return Ok(Left(actions.map_ok(|batch| (batch, false)))); + } - let is_multi_part_checkpoint = self.checkpoint_parts.len() > 1; //maybe not self? let log_root = self.log_root.clone(); let parquet_handler = engine.get_parquet_handler().clone(); - // If the schema does not contain add/remove actions (e.g., metadata-only replay), - // then we return the checkpoint batch directly. - // - Multi-part checkpoints never have sidecar actions, so they are returned as-is. - // - Otherwise, we check for sidecar references and replace checkpoint batches accordingly. - let checkpoint_stream = if need_file_actions && !is_multi_part_checkpoint { - Left( - actions - // Flatten the new batches returned. 
The new batches could be:
-                    // - the checkpoint batch itself if no sidecar actions are present in the batch
-                    // - 1 or more sidecar batches referenced in the checkpoint batch by sidecar actions
-                    .flat_map(move |batch_result| match batch_result {
-                        Ok(checkpoint_batch) => Right(
-                            Self::process_checkpoint_batch(
-                                parquet_handler.clone(),
-                                log_root.clone(),
-                                checkpoint_batch,
-                            )
-                            .map_or_else(|e| Left(std::iter::once(Err(e))), Right)
-                            .map_ok(|batch| (batch, false)),
-                        ),
-                        Err(e) => Left(std::iter::once(Err(e))),
-                    }),
-            )
-        } else {
-            Right(actions.map_ok(|batch| (batch, false)))
-        };
-
-        Ok(checkpoint_stream)
+        // Process checkpoint batches with potential sidecar file references that need to be read
+        // to extract add/remove actions from the sidecar files.
+        Ok(Right(
+            actions
+                // Flatten the new batches returned. The new batches could be:
+                // - the checkpoint batch itself if no sidecar actions are present in the batch
+                // - one or more sidecar batches read from the sidecar files if sidecar actions are present
+                .flat_map(move |batch_result| match batch_result {
+                    Ok(checkpoint_batch) => Right(
+                        Self::process_single_checkpoint_batch(
+                            parquet_handler.clone(),
+                            log_root.clone(),
+                            checkpoint_batch,
+                        )
+                        .map_or_else(|e| Left(std::iter::once(Err(e))), Right)
+                        .map_ok(|batch| (batch, false)),
+                    ),
+                    Err(e) => Left(std::iter::once(Err(e))),
+                }),
+        ))
     }

-    fn process_checkpoint_batch(
+    fn process_single_checkpoint_batch(
         parquet_handler: Arc<dyn ParquetHandler>,
         log_root: Url,
         batch: Box<dyn EngineData>,

From 6139ddddfcc8bcceb36f526896d9a62c0c3429a4 Mon Sep 17 00:00:00 2001
From: sebastian tia
Date: Thu, 6 Feb 2025 23:38:11 -0800
Subject: [PATCH 03/37] mvp for tests

---
 kernel/src/actions/visitors.rs | 37 +++
 kernel/src/log_segment.rs | 12 +-
 kernel/src/log_segment/tests.rs | 546 +++++++++++++++++++++++++++++++-
 kernel/src/path.rs | 1 -
 kernel/src/scan/log_replay.rs | 7 +-
 kernel/src/scan/mod.rs | 35 +-
 kernel/src/scan/state.rs | 3 +-
 kernel/src/utils.rs | 86 ++++-
 8 files changed, 705 insertions(+), 22 deletions(-)

diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs
index 9f34bd2c5..9ebb330f5 100644
--- a/kernel/src/actions/visitors.rs
+++ b/kernel/src/actions/visitors.rs
@@ -477,6 +477,12 @@ impl RowVisitor for SidecarVisitor {
         for i in 0..row_count {
             // Since path column is required, use it to detect presence of a sidecar action
             if let Some(path) = getters[0].get_opt(i, "sidecar.path")? {
+                // We read checkpoint batches with the sidecar action. This results in empty paths
+                // if a row is not a sidecar action. We do not want to create a sidecar action for
+                // these rows.
+                if path == "" {
+                    continue;
+                }
                 self.sidecars.push(Self::visit_sidecar(i, path, getters)?);
             }
         }
@@ -608,6 +614,37 @@ mod tests {
         Ok(())
     }

+    #[test]
+    fn test_parse_sidecar_with_empty_string_as_path() -> DeltaResult<()> {
+        let engine = SyncEngine::new();
+        let json_handler = engine.get_json_handler();
+        let json_strings: StringArray = vec![
+            r#"{"sidecar":{"path":"","sizeInBytes":9268,"modificationTime":1714496113961,"tags":null}}"#,
+            r#"{"sidecar":{"path":"016ae953-37a9-438e-8683-9a9a4a79a395.parquet","sizeInBytes":9268,"modificationTime":1714496113961,"tags":null}}"#,
+
+        ]
+        .into();
+        let output_schema = get_log_schema().clone();
+        let batch = json_handler
+            .parse_json(string_array_to_engine_data(json_strings), output_schema)
+            .unwrap();
+
+        let mut visitor = SidecarVisitor::default();
+        visitor.visit_rows_of(batch.as_ref())?;
+
+        let sidecar1 = Sidecar {
+            path: "016ae953-37a9-438e-8683-9a9a4a79a395.parquet".into(),
+            size_in_bytes: 9268,
+            modification_time: 1714496113961,
+            tags: None,
+        };
+
+        assert_eq!(visitor.sidecars.len(), 1);
+        assert_eq!(visitor.sidecars[0], sidecar1);
+
+        Ok(())
+    }
+
     #[test]
     fn test_parse_metadata() -> DeltaResult<()> {
         let data = action_batch();
diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs
index 74985996d..0c251349a 100644
--- a/kernel/src/log_segment.rs
+++ b/kernel/src/log_segment.rs
@@ -228,11 +228,11 @@ impl LogSegment {
             Left(None.into_iter())
         } else {
             Right(Self::create_checkpoint_stream(
-                self,
                 engine,
                 checkpoint_read_schema,
                 meta_predicate,
                 checkpoint_parts,
+                self.log_root.clone(),
             )?)
         };

         Ok(commit_stream.chain(checkpoint_stream))
     }
@@ -250,11 +250,11 @@ impl LogSegment {
     /// stored directly in the checkpoint, providing a more efficient storage mechanism
     /// for large change sets.
     fn create_checkpoint_stream(
-        &self,
         engine: &dyn Engine,
         checkpoint_read_schema: SchemaRef,
         meta_predicate: Option<ExpressionRef>,
         checkpoint_parts: Vec<FileMeta>,
+        log_root: Url,
     ) -> DeltaResult<impl Iterator<Item = DeltaResult<(Box<dyn EngineData>, bool)>> + Send> {
         // We checked that checkpoint_parts is not empty before calling this function
         require!(
             !checkpoint_parts.is_empty(),
             Error::generic("Checkpoint parts should not be empty")
         );

-        let need_file_actions = checkpoint_read_schema.contains(ADD_NAME)
-            && checkpoint_read_schema.contains(SIDECAR_NAME);
+        let need_file_actions = checkpoint_read_schema.contains(ADD_NAME);
         require!(
             !need_file_actions || checkpoint_read_schema.contains(SIDECAR_NAME),
             Error::generic(
                 "If the checkpoint read schema contains file actions, it must contain the sidecar column"
             )
         );

         let is_json_checkpoint = checkpoint_parts[0].location.path().ends_with(".json");
         let actions = Self::read_checkpoint_files(
             engine,
-            checkpoint_parts,
+            checkpoint_parts.clone(),
             checkpoint_read_schema.clone(),
             meta_predicate,
             is_json_checkpoint,
         )?;

         // 1. In the case where the schema does not contain add/remove actions, we return the checkpoint
         // batch directly as sidecar files only have to be read when the schema contains add/remove actions.
         // 2. Multi-part checkpoint batches never have sidecar actions, so the batch is returned as-is.
- if !need_file_actions || self.checkpoint_parts.len() > 1 { + if !need_file_actions || checkpoint_parts.len() > 1 { return Ok(Left(actions.map_ok(|batch| (batch, false)))); } - let log_root = self.log_root.clone(); let parquet_handler = engine.get_parquet_handler().clone(); // Process checkpoint batches with potential sidecar file references that need to be read diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index 59e6837ef..96400c2c7 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -4,12 +4,19 @@ use itertools::Itertools; use object_store::{memory::InMemory, path::Path, ObjectStore}; use url::Url; +use crate::actions::{ + get_log_add_schema, get_log_schema, Add, Sidecar, ADD_NAME, METADATA_NAME, SIDECAR_NAME, +}; use crate::engine::default::executor::tokio::TokioBackgroundExecutor; use crate::engine::default::filesystem::ObjectStoreFileSystemClient; use crate::engine::sync::SyncEngine; use crate::log_segment::LogSegment; +use crate::scan::test_utils::{ + add_batch_simple, add_batch_with_remove, sidecar_batch_with_given_paths, +}; use crate::snapshot::CheckpointMetadata; -use crate::{FileSystemClient, Table}; +use crate::utils::test_utils::{into_record_batch, Action, LocalMockTable}; +use crate::{Engine, FileMeta, FileSystemClient, Table}; use test_utils::delta_path_for_version; // NOTE: In addition to testing the meta-predicate for metadata replay, this test also verifies @@ -108,7 +115,7 @@ fn build_log_with_paths_and_checkpoint( } #[test] -fn build_snapshot_with_uuid_checkpoint() { +fn build_snapshot_with_uuid_checkpoint_parquet() { let (client, log_root) = build_log_with_paths_and_checkpoint( &[ delta_path_for_version(0, "json"), @@ -136,6 +143,75 @@ fn build_snapshot_with_uuid_checkpoint() { assert_eq!(versions, expected_versions); } +#[test] +fn build_snapshot_with_uuid_checkpoint_json() { + let (client, log_root) = build_log_with_paths_and_checkpoint( + &[ + delta_path_for_version(0, "json"), + delta_path_for_version(1, "checkpoint.parquet"), + delta_path_for_version(2, "json"), + delta_path_for_version(3, "checkpoint.parquet"), + delta_path_for_version(4, "json"), + delta_path_for_version(5, "json"), + delta_path_for_version(5, "checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.json"), + delta_path_for_version(6, "json"), + delta_path_for_version(7, "json"), + ], + None, + ); + + let log_segment = LogSegment::for_snapshot(client.as_ref(), log_root, None, None).unwrap(); + let commit_files = log_segment.ascending_commit_files; + let checkpoint_parts = log_segment.checkpoint_parts; + + assert_eq!(checkpoint_parts.len(), 1); + assert_eq!(checkpoint_parts[0].version, 5); + + let versions = commit_files.into_iter().map(|x| x.version).collect_vec(); + let expected_versions = vec![6, 7]; + assert_eq!(versions, expected_versions); +} + +#[test] +fn build_snapshot_with_correct_last_uuid_checkpoint() { + let checkpoint_metadata = CheckpointMetadata { + version: 5, + size: 10, + parts: Some(1), + size_in_bytes: None, + num_of_add_files: None, + checkpoint_schema: None, + checksum: None, + }; + + let (client, log_root) = build_log_with_paths_and_checkpoint( + &[ + delta_path_for_version(0, "json"), + delta_path_for_version(1, "checkpoint.parquet"), + delta_path_for_version(1, "json"), + delta_path_for_version(2, "json"), + delta_path_for_version(3, "checkpoint.parquet"), + delta_path_for_version(3, "json"), + delta_path_for_version(4, "json"), + delta_path_for_version(5, 
"checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.parquet"), + delta_path_for_version(5, "json"), + delta_path_for_version(6, "json"), + delta_path_for_version(7, "json"), + ], + Some(&checkpoint_metadata), + ); + + let log_segment = + LogSegment::for_snapshot(client.as_ref(), log_root, checkpoint_metadata, None).unwrap(); + let commit_files = log_segment.ascending_commit_files; + let checkpoint_parts = log_segment.checkpoint_parts; + + assert_eq!(checkpoint_parts.len(), 1); + assert_eq!(commit_files.len(), 2); + assert_eq!(checkpoint_parts[0].version, 5); + assert_eq!(commit_files[0].version, 6); + assert_eq!(commit_files[1].version, 7); +} #[test] fn build_snapshot_with_multiple_incomplete_multipart_checkpoints() { let (client, log_root) = build_log_with_paths_and_checkpoint( @@ -620,3 +696,469 @@ fn table_changes_fails_with_larger_start_version_than_end() { let log_segment_res = LogSegment::for_table_changes(client.as_ref(), log_root, 1, Some(0)); assert!(log_segment_res.is_err()); } +#[test] +fn test_sidecar_to_filemeta() { + let log_root = Url::parse("file:///var/_delta_log/").unwrap(); + let test_cases = [ + ( + "example.parquet", + "file:///var/_delta_log/_sidecars/example.parquet", + ), + ( + "file:///var/_delta_log/_sidecars/example.parquet", + "file:///var/_delta_log/_sidecars/example.parquet", + ), + // This forms a valid URL but represents an invalid path since sidecar files + // are restricted to the `_delta_log/_sidecars` directory. + // Attempting to read this file will fail because it does not exist. + ( + "_delta_log/_sidecars/example.parquet", + "file:///var/_delta_log/_sidecars/_delta_log/_sidecars/example.parquet", + ), + ]; + for (input_path, expected_url) in test_cases { + let sidecar = Sidecar { + path: input_path.into(), + modification_time: 0, + size_in_bytes: 1000, + tags: None, + }; + + let result = LogSegment::sidecar_to_filemeta(&sidecar, &log_root).unwrap(); + + assert_eq!( + result.location.as_str(), + expected_url, + "Mismatch for input path: {}", + input_path + ); + } +} + +#[test] +fn test_checkpoint_batch_with_no_sidecars_returns_checkpoint_batch_as_is() { + let engine = Arc::new(SyncEngine::new()); + let log_root = Url::parse("s3://example-bucket/logs/").unwrap(); + let checkpoint_batch = add_batch_simple(get_log_schema().clone()); + + let result = LogSegment::process_single_checkpoint_batch( + engine.get_parquet_handler(), + log_root, + checkpoint_batch, + ) + .unwrap(); + let mut iter = result.into_iter(); + + // Assert that the batch is returned as is when no sidecars are present + let returned_batch = iter.next().unwrap().unwrap(); + assert_eq!( + into_record_batch(returned_batch), + into_record_batch(add_batch_simple(get_log_schema().clone())) + ); + + // No extra batches should exist + assert!(iter.next().is_none()); +} + +#[tokio::test] +async fn test_checkpoint_batch_with_sidecars_returns_sidecar_batches() { + let mut mock_table = LocalMockTable::new(); + let sidecar_schema = get_log_add_schema().clone(); + mock_table + .sidecar( + add_batch_simple(sidecar_schema.clone()), + "sidecarfile1.parquet", + ) + .await; + mock_table + .sidecar( + add_batch_with_remove(sidecar_schema.clone()), + "sidecarfile2.parquet", + ) + .await; + + let engine = Arc::new(SyncEngine::new()); + let checkpoint_batch = sidecar_batch_with_given_paths( + vec!["sidecarfile1.parquet", "sidecarfile2.parquet"], + get_log_schema().project(&[ADD_NAME, SIDECAR_NAME]).unwrap(), + ); + + let result = LogSegment::process_single_checkpoint_batch( + engine.get_parquet_handler(), + 
mock_table.log_root(),
        checkpoint_batch,
    )
    .unwrap();

    let mut iter = result.into_iter();

    // Assert that the first batch returned is from reading sidecarfile1
    let first_batch = iter.next().unwrap().unwrap();
    assert_eq!(
        into_record_batch(first_batch),
        into_record_batch(add_batch_simple(sidecar_schema.clone()))
    );

    // Assert that the second batch returned is from reading sidecarfile2
    let second_batch = iter.next().unwrap().unwrap();
    assert_eq!(
        into_record_batch(second_batch),
        into_record_batch(add_batch_with_remove(sidecar_schema.clone()))
    );

    // No extra batches should exist
    assert!(iter.next().is_none());
}

#[test]
fn test_checkpoint_batch_with_sidecar_files_that_do_not_exist() {
    let mock_table = LocalMockTable::new();
    let engine = Arc::new(SyncEngine::new());
    let checkpoint_batch = sidecar_batch_with_given_paths(
        vec!["sidecarfile1.parquet", "sidecarfile2.parquet"],
        get_log_schema().clone(),
    );

    let result = LogSegment::process_single_checkpoint_batch(
        engine.get_parquet_handler(),
        mock_table.log_root(),
        checkpoint_batch,
    )
    .unwrap();
    let mut iter = result.into_iter();

    // Assert that an error is returned when trying to read sidecar files that do not exist
    assert!(iter.next().unwrap().is_err());
    if let Err(e) = iter.next().unwrap() {
        assert!(e.to_string().contains("No such file or directory"));
    }
}

#[test]
fn test_create_checkpoint_stream_errors_when_checkpoint_parts_is_empty() {
    let engine = SyncEngine::new();

    let stream = LogSegment::create_checkpoint_stream(
        &engine,
        get_log_schema().clone(),
        None,
        vec![],
        Url::parse("s3://example-bucket/logs/").unwrap(),
    );

    assert!(stream.is_err());
    if let Err(e) = stream {
        assert!(e
            .to_string()
            .contains("Checkpoint parts should not be empty"));
    }
}

fn create_file_meta(path: &str) -> FileMeta {
    FileMeta {
        location: Url::parse(path).expect("Invalid file URL"),
        last_modified: 0,
        size: 0,
    }
}

#[test]
fn test_create_checkpoint_stream_errors_when_schema_has_add_but_no_sidecar_action() {
    let engine = SyncEngine::new();

    let stream = LogSegment::create_checkpoint_stream(
        &engine,
        get_log_add_schema().clone(),
        None,
        vec![create_file_meta("file:///1.parquet")],
        Url::parse("s3://example-bucket/logs/").unwrap(),
    );

    assert!(stream.is_err());
    if let Err(e) = stream {
        assert!(e
            .to_string()
            .contains("If the checkpoint read schema contains file actions, it must contain the sidecar column"));
    }
}

#[tokio::test]
async fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schema_has_no_file_actions(
) {
    let engine = SyncEngine::new();
    let v2_checkpoint_read_schema = get_log_schema().project(&[METADATA_NAME]).unwrap();
    let mut mock_table = LocalMockTable::new();
    mock_table
        .parquet_checkpoint(
            add_batch_simple(v2_checkpoint_read_schema.clone()),
            "00001.checkpoint.parquet",
        )
        .await;

    let checkpoint_one_file = mock_table
        .log_root()
        .join("00001.checkpoint.parquet")
        .unwrap()
        .to_string();

    let stream = LogSegment::create_checkpoint_stream(
        &engine,
        v2_checkpoint_read_schema.clone(),
        None,
        vec![create_file_meta(&checkpoint_one_file)],
        mock_table.log_root(),
    )
    .unwrap();

    let mut iter = stream.into_iter();

    // Assert that the first batch returned is from reading checkpoint file 1
    let (first_batch, is_log_batch) = iter.next().unwrap().unwrap();
    assert!(!is_log_batch);
    assert_eq!(
        into_record_batch(first_batch),
into_record_batch(add_batch_simple(v2_checkpoint_read_schema.clone()))
    );

    // No extra batches should exist
    assert!(iter.next().is_none());
}

#[tokio::test]
async fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_is_multi_part() {
    let engine = SyncEngine::new();
    let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME]).unwrap();
    let mut mock_table = LocalMockTable::new();

    // Multi-part checkpoints can never have sidecar actions.
    // We place batches with sidecar actions present in multi-part checkpoints to verify we do not read the actions,
    // as we should short-circuit and return the checkpoint batches as-is when encountering multi-part checkpoints.
    mock_table
        .parquet_checkpoint(
            sidecar_batch_with_given_paths(
                vec!["sidecar1.parquet"],
                v2_checkpoint_read_schema.clone(),
            ),
            "00001.checkpoint.1.2.parquet",
        )
        .await;
    mock_table
        .parquet_checkpoint(
            sidecar_batch_with_given_paths(
                vec!["sidecar2.parquet"],
                v2_checkpoint_read_schema.clone(),
            ),
            "00001.checkpoint.2.2.parquet",
        )
        .await;

    let checkpoint_one_file = mock_table
        .log_root()
        .join("00001.checkpoint.1.2.parquet")
        .unwrap()
        .to_string();

    let checkpoint_two_file = mock_table
        .log_root()
        .join("00001.checkpoint.2.2.parquet")
        .unwrap()
        .to_string();

    let stream = LogSegment::create_checkpoint_stream(
        &engine,
        v2_checkpoint_read_schema.clone(),
        None,
        vec![
            create_file_meta(&checkpoint_one_file),
            create_file_meta(&checkpoint_two_file),
        ],
        mock_table.log_root(),
    )
    .unwrap();

    let mut iter = stream.into_iter();

    // Assert that the first batch returned is from reading checkpoint file 1
    let (first_batch, is_log_batch) = iter.next().unwrap().unwrap();
    assert!(!is_log_batch);
    assert_eq!(
        into_record_batch(first_batch),
        into_record_batch(sidecar_batch_with_given_paths(
            vec!["sidecar1.parquet"],
            v2_checkpoint_read_schema.clone(),
        ))
    );

    // Assert that the second batch returned is from reading checkpoint file 2
    let (second_batch, is_log_batch) = iter.next().unwrap().unwrap();
    assert!(!is_log_batch);
    assert_eq!(
        into_record_batch(second_batch),
        into_record_batch(sidecar_batch_with_given_paths(
            vec!["sidecar2.parquet"],
            v2_checkpoint_read_schema.clone(),
        ))
    );

    // No extra batches should exist
    assert!(iter.next().is_none());
}

// Test showcases weird behavior where reading a batch with a missing column causes the reader to
// insert the empty-string in string fields. This is seen in this test where we find two instances of Sidecars with
// empty-string path fields after visiting the batch with the SidecarVisitor due to the batch being read with
// the schema which includes the Sidecar column.
+#[tokio::test] +async fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch() { + let engine = SyncEngine::new(); + let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME]).unwrap(); + let mut mock_table = LocalMockTable::new(); + + mock_table + .parquet_checkpoint( + add_batch_simple(v2_checkpoint_read_schema.clone()), + "00001.checkpoint.parquet", + ) + .await; + + let checkpoint_one_file = mock_table + .log_root() + .join("00001.checkpoint.parquet") + .unwrap() + .to_string(); + + let stream = LogSegment::create_checkpoint_stream( + &engine, + v2_checkpoint_read_schema.clone(), + None, + vec![create_file_meta(&checkpoint_one_file)], + mock_table.log_root(), + ) + .unwrap(); + + let mut iter = stream.into_iter(); + + // Assert that the first batch returned is from reading checkpoint file 1 + let (first_batch, is_log_batch) = iter.next().unwrap().unwrap(); + assert!(!is_log_batch); + assert_eq!( + into_record_batch(first_batch), + into_record_batch(add_batch_simple(v2_checkpoint_read_schema.clone())) + ); + + // No extra batches should exist + assert!(iter.next().is_none()); +} + +#[tokio::test] +async fn test_create_checkpoint_stream_reads_json_checkpoint_batch() { + let engine = SyncEngine::new(); + let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME]).unwrap(); + let mut mock_table = LocalMockTable::new(); + + mock_table + .json_checkpoint( + [Action::Add(Add { + path: "fake_path_1".into(), + data_change: true, + ..Default::default() + })], + "00001.checkpoint.json", + ) + .await; + + let checkpoint_one_file = mock_table + .log_root() + .join("00001.checkpoint.json") + .unwrap() + .to_string(); + + let stream = LogSegment::create_checkpoint_stream( + &engine, + v2_checkpoint_read_schema.clone(), + None, + vec![create_file_meta(&checkpoint_one_file)], + mock_table.log_root(), + ) + .unwrap(); + + let mut iter = stream.into_iter(); + + // Assert that the first batch returned is from reading checkpoint file 1 + let (_first_batch, is_log_batch) = iter.next().unwrap().unwrap(); + assert!(!is_log_batch); + // TODO: Convert JSON checkpoint to RecordBatch and assert that it is as expected + + // No extra batches should exist + assert!(iter.next().is_none()); +} + +// More integration test than unit test below: + +// Encapsulates logic that has already been tested but tests the interaction between the functions, +// such as performing a map operation on the returned sidecar batches to include the is_log_batch flag +#[tokio::test] +async fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batches() { + let engine = SyncEngine::new(); + let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME]).unwrap(); + let sidecar_schema = get_log_add_schema(); + let mut mock_table = LocalMockTable::new(); + + mock_table + .parquet_checkpoint( + sidecar_batch_with_given_paths( + vec!["sidecarfile1.parquet", "sidecarfile2.parquet"], + get_log_schema().project(&[ADD_NAME, SIDECAR_NAME]).unwrap(), + ), + "00001.checkpoint.parquet", + ) + .await; + mock_table + .sidecar( + add_batch_simple(sidecar_schema.clone()), + "sidecarfile1.parquet", + ) + .await; + mock_table + .sidecar( + add_batch_with_remove(sidecar_schema.clone()), + "sidecarfile2.parquet", + ) + .await; + + let checkpoint_one_file = mock_table + .log_root() + .join("00001.checkpoint.parquet") + .unwrap() + .to_string(); + + let stream = LogSegment::create_checkpoint_stream( + &engine, + v2_checkpoint_read_schema.clone(), + None, + 
vec![create_file_meta(&checkpoint_one_file)],
        mock_table.log_root(),
    )
    .unwrap();

    let mut iter = stream.into_iter();

    // Assert that the first batch returned is from reading sidecarfile1
    let (first_batch, is_log_batch) = iter.next().unwrap().unwrap();
    assert!(!is_log_batch);
    assert_eq!(
        into_record_batch(first_batch),
        into_record_batch(add_batch_simple(sidecar_schema.clone()))
    );

    // Assert that the second batch returned is from reading sidecarfile2
    let (second_batch, is_log_batch) = iter.next().unwrap().unwrap();
    assert!(!is_log_batch);
    assert_eq!(
        into_record_batch(second_batch),
        into_record_batch(add_batch_with_remove(sidecar_schema.clone()))
    );

    // No extra batches should exist
    assert!(iter.next().is_none());
}
diff --git a/kernel/src/path.rs b/kernel/src/path.rs
index 58fbba657..200eb0d09 100644
--- a/kernel/src/path.rs
+++ b/kernel/src/path.rs
@@ -176,7 +176,6 @@ impl ParsedLogPath {
     #[cfg_attr(not(feature = "developer-visibility"), visibility::make(pub(crate)))]
     #[allow(dead_code)] // currently only used in tests, which don't "count"
     fn is_unknown(&self) -> bool {
-        // TODO: Stop treating UuidCheckpoint as unknown once we support v2 checkpoints
         matches!(self.file_type, LogPathFileType::Unknown)
     }
 }
diff --git a/kernel/src/scan/log_replay.rs b/kernel/src/scan/log_replay.rs
index 177996a80..cebce5b6c 100644
--- a/kernel/src/scan/log_replay.rs
+++ b/kernel/src/scan/log_replay.rs
@@ -323,6 +323,7 @@ pub(crate) fn scan_action_iter(
 mod tests {
     use std::{collections::HashMap, sync::Arc};

+    use crate::actions::get_log_schema;
     use crate::expressions::{column_name, Scalar};
     use crate::scan::state::{DvInfo, Stats};
     use crate::scan::test_utils::{
@@ -364,7 +365,7 @@ mod tests {
     #[test]
     fn test_scan_action_iter() {
         run_with_validate_callback(
-            vec![add_batch_simple()],
+            vec![add_batch_simple(get_log_schema().clone())],
            None, // not testing schema
            None, // not testing transform
             &[true, false],
@@ -376,7 +377,7 @@ mod tests {
     #[test]
     fn test_scan_action_iter_with_remove() {
         run_with_validate_callback(
-            vec![add_batch_with_remove()],
+            vec![add_batch_with_remove(get_log_schema().clone())],
            None, // not testing schema
            None, // not testing transform
             &[false, false, true, false],
@@ -387,7 +388,7 @@ mod tests {

     #[test]
     fn test_no_transforms() {
-        let batch = vec![add_batch_simple()];
+        let batch = vec![add_batch_simple(get_log_schema().clone())];
         let logical_schema = Arc::new(crate::schema::StructType::new(vec![]));
         let iter = scan_action_iter(
             &SyncEngine::new(),
diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs
index 610bb3d40..dab4a7571 100644
--- a/kernel/src/scan/mod.rs
+++ b/kernel/src/scan/mod.rs
@@ -690,23 +690,47 @@ pub(crate) mod test_utils {
         Box::new(ArrowEngineData::new(batch))
     }

-    // simple add
-    pub(crate) fn add_batch_simple() -> Box<dyn EngineData> {
+    // simple sidecar
+    pub(crate) fn sidecar_batch_with_given_paths(
+        paths: Vec<&str>,
+        output_schema: SchemaRef,
+    ) -> Box<dyn EngineData> {
+        let handler = SyncJsonHandler {};
+
+        let json_strings: StringArray = paths
+            .iter()
+            .map(|path| {
+                format!(
+                    r#"{{"sidecar":{{"path":"{}","sizeInBytes":9268,"modificationTime":1714496113961,"tags":{{"tag_foo":"tag_bar"}}}}}}"#,
+                    path
+                )
+            })
+            .collect::<Vec<_>>()
+            .into();
+
+        let parsed = handler
+            .parse_json(string_array_to_engine_data(json_strings), output_schema)
+            .unwrap();
+
+        ArrowEngineData::try_from_engine_data(parsed).unwrap()
+    }
+
+    // A simple add batch parsed with the schema provided
+    pub(crate) fn add_batch_simple(output_schema: SchemaRef) -> Box<dyn EngineData>
{
        let handler = SyncJsonHandler {};
        let json_strings: StringArray = vec![
            r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues": {"date": "2017-12-10"},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}}"#,
            r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}}"#,
        ]
        .into();
-        let output_schema = get_log_schema().clone();
        let parsed = handler
            .parse_json(string_array_to_engine_data(json_strings), output_schema)
            .unwrap();
        ArrowEngineData::try_from_engine_data(parsed).unwrap()
    }

-    // add batch with a removed file
-    pub(crate) fn add_batch_with_remove() -> Box<dyn EngineData> {
+    // An add batch with a removed file parsed with the schema provided
+    pub(crate) fn add_batch_with_remove(output_schema: SchemaRef) -> Box<dyn EngineData> {
        let handler = SyncJsonHandler {};
        let json_strings: StringArray = vec![
            r#"{"remove":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c001.snappy.parquet","deletionTimestamp":1677811194426,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":635,"tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#,
            r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}}"#,
        ]
        .into();
-        let output_schema = get_log_schema().clone();
        let parsed = handler
            .parse_json(string_array_to_engine_data(json_strings), output_schema)
            .unwrap();
diff --git a/kernel/src/scan/state.rs b/kernel/src/scan/state.rs
index 85eb6e4a7..0dfecc4ee 100644
--- a/kernel/src/scan/state.rs
+++ b/kernel/src/scan/state.rs
@@ -243,6 +243,7 @@ impl RowVisitor for ScanFileVisitor<'_, T> {
 mod tests {
     use std::collections::HashMap;

+    use crate::actions::get_log_schema;
     use crate::scan::test_utils::{add_batch_simple, run_with_validate_callback};
     use crate::ExpressionRef;

@@ -282,7 +283,7 @@ mod tests {
     fn test_simple_visit_scan_data() {
         let context = TestContext { id: 2 };
         run_with_validate_callback(
-            vec![add_batch_simple()],
+            vec![add_batch_simple(get_log_schema().clone())],
            None, // not testing schema
            None, // not testing transform
             &[true, false],
diff --git a/kernel/src/utils.rs b/kernel/src/utils.rs
index 8f4fcf818..4bae71e50 100644
--- a/kernel/src/utils.rs
+++ b/kernel/src/utils.rs
@@ -13,15 +13,22 @@ pub(crate) use require;

 #[cfg(test)]
 pub(crate) mod test_utils {
+    use arrow_array::RecordBatch;
     use itertools::Itertools;
-    use object_store::local::LocalFileSystem;
     use object_store::ObjectStore;
+    use
object_store::{local::LocalFileSystem, path};
+    use parquet::arrow::ArrowWriter;
     use serde::Serialize;
     use std::{path::Path, sync::Arc};
     use tempfile::TempDir;
     use test_utils::delta_path_for_version;
+    use url::Url;

-    use crate::actions::{Add, Cdc, CommitInfo, Metadata, Protocol, Remove};
+    use crate::{
+        actions::{Add, Cdc, CommitInfo, Metadata, Protocol, Remove},
+        engine::arrow_data::ArrowEngineData,
+        EngineData,
+    };

     #[derive(Serialize)]
     pub(crate) enum Action {
@@ -73,9 +80,84 @@
             .await
             .expect("put log file in store");
         }
+
+        /// Writes all `actions` to a new json checkpoint in the log
+        pub(crate) async fn json_checkpoint(
+            &mut self,
+            actions: impl IntoIterator<Item = Action>,
+            filename: &str,
+        ) {
+            let data = actions
+                .into_iter()
+                .map(|action| serde_json::to_string(&action).unwrap())
+                .join("\n");
+
+            let path = format!("_delta_log/{filename:020}");
+
+            self.store
+                .put(&path::Path::from(path), data.into())
+                .await
+                .expect("put checkpoint file in store");
+        }
+
+        /// Writes the given `data` to a new parquet checkpoint in the log
+        pub(crate) async fn parquet_checkpoint(
+            &mut self,
+            data: Box<dyn EngineData>,
+            filename: &str,
+        ) {
+            let batch: Box<_> = ArrowEngineData::try_from_engine_data(data).unwrap();
+            let record_batch = batch.record_batch();
+
+            let mut buffer = vec![];
+            let mut writer =
+                ArrowWriter::try_new(&mut buffer, record_batch.schema(), None).unwrap();
+            writer.write(record_batch).unwrap();
+            writer.close().unwrap(); // writer must be closed to write footer
+
+            let path = format!("_delta_log/{filename:020}");
+
+            self.store
+                .put(&path::Path::from(path), buffer.into())
+                .await
+                .expect("put checkpoint file in store");
+        }
+
+        /// Writes the given `data` to a new sidecar file in the `_delta_log/_sidecars` directory
+        pub(crate) async fn sidecar(&mut self, data: Box<dyn EngineData>, filename: &str) {
+            let batch: Box<_> = ArrowEngineData::try_from_engine_data(data).unwrap();
+            let record_batch = batch.record_batch();
+
+            let mut buffer = vec![];
+            let mut writer =
+                ArrowWriter::try_new(&mut buffer, record_batch.schema(), None).unwrap();
+            writer.write(record_batch).unwrap();
+            writer.close().unwrap(); // writer must be closed to write footer
+
+            let path = format!("_delta_log/_sidecars/{filename:020}");
+
+            self.store
+                .put(&path::Path::from(path), buffer.into())
+                .await
+                .expect("put sidecar file in store");
+        }
+
         /// Get the path to the root of the table.
         pub(crate) fn table_root(&self) -> &Path {
             self.dir.path()
         }
+
+        /// Get the path to the root of the log in the table.
+        pub(crate) fn log_root(&self) -> Url {
+            Url::from_directory_path(self.dir.path().join("_delta_log")).unwrap()
+        }
+    }
+
+    /// Try to convert an `EngineData` into a `RecordBatch`.
Panics if not using `ArrowEngineData` from
+    /// the default module
+    pub(crate) fn into_record_batch(engine_data: Box<dyn EngineData>) -> RecordBatch {
+        ArrowEngineData::try_from_engine_data(engine_data)
+            .unwrap()
+            .into()
+    }
 }

From 8a8c5fb3a9d1d7a38c1d748bcd153aaa33f85528 Mon Sep 17 00:00:00 2001
From: sebastian tia
Date: Thu, 6 Feb 2025 23:55:06 -0800
Subject: [PATCH 04/37] note

---
 kernel/src/actions/visitors.rs | 5 +++--
 kernel/src/log_segment/tests.rs | 2 ++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs
index 9ebb330f5..be6242250 100644
--- a/kernel/src/actions/visitors.rs
+++ b/kernel/src/actions/visitors.rs
@@ -476,11 +476,12 @@ impl RowVisitor for SidecarVisitor {
         );
         for i in 0..row_count {
             // Since path column is required, use it to detect presence of a sidecar action
-            if let Some(path) = getters[0].get_opt(i, "sidecar.path")? {
+            let opt_path: Option<String> = getters[0].get_opt(i, "sidecar.path")?;
+            if let Some(path) = opt_path {
                 // We read checkpoint batches with the sidecar action. This results in empty paths
                 // if a row is not a sidecar action. We do not want to create a sidecar action for
                 // these rows.
-                if path == "" {
+                if path.is_empty() {
                     continue;
                 }
                 self.sidecars.push(Self::visit_sidecar(i, path, getters)?);
             }
         }
diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs
index 96400c2c7..f3c7f31fc 100644
--- a/kernel/src/log_segment/tests.rs
+++ b/kernel/src/log_segment/tests.rs
@@ -882,6 +882,8 @@
 }

+// TODO: Use a batch of sidecar actions to test that we do not visit batches when the schema has no file actions
+// See the multi-part checkpoint test for more details.
 #[tokio::test]
 async fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schema_has_no_file_actions(

From 62dd034f30abc42d041b728c6ce0c804aa52349c Mon Sep 17 00:00:00 2001
From: sebastian tia
Date: Fri, 7 Feb 2025 10:41:32 -0800
Subject: [PATCH 05/37] return result from tests to use shortcut ?

---
 kernel/src/log_segment/tests.rs | 263 ++++++++++++++------------------
 kernel/src/utils.rs | 6 +-
 2 files changed, 123 insertions(+), 146 deletions(-)

diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs
index f3c7f31fc..03e97fd9b 100644
--- a/kernel/src/log_segment/tests.rs
+++ b/kernel/src/log_segment/tests.rs
@@ -15,8 +15,8 @@ use crate::scan::test_utils::{
     add_batch_simple, add_batch_with_remove, sidecar_batch_with_given_paths,
 };
 use crate::snapshot::CheckpointMetadata;
-use crate::utils::test_utils::{into_record_batch, Action, LocalMockTable};
-use crate::{Engine, FileMeta, FileSystemClient, Table};
+use crate::utils::test_utils::{assert_batch_matches, Action, LocalMockTable};
+use crate::{DeltaResult, Engine, FileMeta, FileSystemClient, Table};
 use test_utils::delta_path_for_version;

 // NOTE: In addition to testing the meta-predicate for metadata replay, this test also verifies
@@ -697,8 +697,8 @@
 assert!(log_segment_res.is_err());
 }
 #[test]
-fn test_sidecar_to_filemeta() {
+fn test_sidecar_to_filemeta() -> DeltaResult<()> {
-    let log_root = Url::parse("file:///var/_delta_log/").unwrap();
+    let log_root = Url::parse("file:///var/_delta_log/")?;
     let test_cases = [
         (
             "example.parquet",
             "file:///var/_delta_log/_sidecars/example.parquet",
         ),
         (
             "file:///var/_delta_log/_sidecars/example.parquet",
             "file:///var/_delta_log/_sidecars/example.parquet",
         ),
         // This forms a valid URL but represents an invalid path since sidecar files
         // are restricted to the `_delta_log/_sidecars` directory.
// Attempting to read this file will fail because it does not exist. ( - "_delta_log/_sidecars/example.parquet", - "file:///var/_delta_log/_sidecars/_delta_log/_sidecars/example.parquet", + "test/test/example.parquet", + "file:///var/_delta_log/_sidecars/test/test/example.parquet", ), ]; for (input_path, expected_url) in test_cases { @@ -724,7 +724,7 @@ fn test_sidecar_to_filemeta() { tags: None, }; - let result = LogSegment::sidecar_to_filemeta(&sidecar, &log_root).unwrap(); + let result = LogSegment::sidecar_to_filemeta(&sidecar, &log_root)?; assert_eq!( result.location.as_str(), @@ -733,35 +733,35 @@ fn test_sidecar_to_filemeta() { input_path ); } + + Ok(()) } #[test] -fn test_checkpoint_batch_with_no_sidecars_returns_checkpoint_batch_as_is() { +fn test_checkpoint_batch_with_no_sidecars_returns_checkpoint_batch_as_is() -> DeltaResult<()> { let engine = Arc::new(SyncEngine::new()); - let log_root = Url::parse("s3://example-bucket/logs/").unwrap(); + let log_root = Url::parse("s3://example-bucket/logs/")?; let checkpoint_batch = add_batch_simple(get_log_schema().clone()); - let result = LogSegment::process_single_checkpoint_batch( + let mut iter = LogSegment::process_single_checkpoint_batch( engine.get_parquet_handler(), log_root, checkpoint_batch, - ) - .unwrap(); - let mut iter = result.into_iter(); + )? + .into_iter(); - // Assert that the batch is returned as is when no sidecars are present - let returned_batch = iter.next().unwrap().unwrap(); - assert_eq!( - into_record_batch(returned_batch), - into_record_batch(add_batch_simple(get_log_schema().clone())) + // Assert the correctness of batches returned + assert_batch_matches( + iter.next().unwrap()?, + add_batch_simple(get_log_schema().clone()), ); - - // No extra batches should exist assert!(iter.next().is_none()); + + Ok(()) } #[tokio::test] -async fn test_checkpoint_batch_with_sidecars_returns_sidecar_batches() { +async fn test_checkpoint_batch_with_sidecars_returns_sidecar_batches() -> DeltaResult<()> { let mut mock_table = LocalMockTable::new(); let sidecar_schema = get_log_add_schema().clone(); mock_table @@ -780,38 +780,32 @@ async fn test_checkpoint_batch_with_sidecars_returns_sidecar_batches() { let engine = Arc::new(SyncEngine::new()); let checkpoint_batch = sidecar_batch_with_given_paths( vec!["sidecarfile1.parquet", "sidecarfile2.parquet"], - get_log_schema().project(&[ADD_NAME, SIDECAR_NAME]).unwrap(), + get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?, ); - let result = LogSegment::process_single_checkpoint_batch( + let mut iter = LogSegment::process_single_checkpoint_batch( engine.get_parquet_handler(), mock_table.log_root(), checkpoint_batch, - ) - .unwrap(); - - let mut iter = result.into_iter(); + )? 
+    .into_iter();

-    // Assert that the first batch returned is from reading sidecarfile1
-    let first_batch = iter.next().unwrap().unwrap();
-    assert_eq!(
-        into_record_batch(first_batch),
-        into_record_batch(add_batch_simple(sidecar_schema.clone()))
+    // Assert the correctness of batches returned
+    assert_batch_matches(
+        iter.next().unwrap()?,
+        add_batch_simple(sidecar_schema.clone()),
     );
-
-    // Assert that the second batch returned is from reading sidecarfile2
-    let second_batch = iter.next().unwrap().unwrap();
-    assert_eq!(
-        into_record_batch(second_batch),
-        into_record_batch(add_batch_with_remove(sidecar_schema.clone()))
+    assert_batch_matches(
+        iter.next().unwrap()?,
+        add_batch_with_remove(sidecar_schema.clone()),
     );
-
-    // No extra batches should exist
     assert!(iter.next().is_none());
+
+    Ok(())
 }

 #[test]
-fn test_checkpoint_batch_with_sidecar_files_that_do_not_exist() {
+fn test_checkpoint_batch_with_sidecar_files_that_do_not_exist() -> DeltaResult<()> {
     let mock_table = LocalMockTable::new();
     let engine = Arc::new(SyncEngine::new());
     let checkpoint_batch = sidecar_batch_with_given_paths(
         vec!["sidecarfile1.parquet", "sidecarfile2.parquet"],
         get_log_schema().clone(),
     );

     let result = LogSegment::process_single_checkpoint_batch(
         engine.get_parquet_handler(),
         mock_table.log_root(),
         checkpoint_batch,
-    )
-    .unwrap();
+    )?;
+
     let mut iter = result.into_iter();

     // Assert that an error is returned when trying to read sidecar files that do not exist
     assert!(iter.next().unwrap().is_err());
     if let Err(e) = iter.next().unwrap() {
         assert!(e.to_string().contains("No such file or directory"));
     }
+
+    Ok(())
 }

 #[test]
-fn test_create_checkpoint_stream_errors_when_checkpoint_parts_is_empty() {
+fn test_create_checkpoint_stream_errors_when_checkpoint_parts_is_empty() -> DeltaResult<()> {
     let engine = SyncEngine::new();

     let stream = LogSegment::create_checkpoint_stream(
         &engine,
         get_log_schema().clone(),
         None,
         vec![],
-        Url::parse("s3://example-bucket/logs/").unwrap(),
+        Url::parse("s3://example-bucket/logs/")?,
     );

     assert!(stream.is_err());
     if let Err(e) = stream {
         assert!(e
             .to_string()
             .contains("Checkpoint parts should not be empty"));
     }
+
+    Ok(())
 }

 fn create_file_meta(path: &str) -> FileMeta {
     FileMeta {
         location: Url::parse(path).expect("Invalid file URL"),
         last_modified: 0,
         size: 0,
     }
 }

 #[test]
-fn test_create_checkpoint_stream_errors_when_schema_has_add_but_no_sidecar_action() {
+fn test_create_checkpoint_stream_errors_when_schema_has_add_but_no_sidecar_action(
+) -> DeltaResult<()> {
     let engine = SyncEngine::new();

     let stream = LogSegment::create_checkpoint_stream(
         &engine,
         get_log_add_schema().clone(),
         None,
         vec![create_file_meta("file:///1.parquet")],
-        Url::parse("s3://example-bucket/logs/").unwrap(),
+        Url::parse("s3://example-bucket/logs/")?,
     );

     assert!(stream.is_err());
     if let Err(e) = stream {
         assert!(e
             .to_string()
             .contains("If the checkpoint read schema contains file actions, it must contain the sidecar column"));
     }
+
+    Ok(())
 }

 // TODO: Use a batch of sidecar actions to test that we do not visit batches when the schema has no file actions
 // See the multi-part checkpoint test for more details.
#[tokio::test] async fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schema_has_no_file_actions( -) { +) -> DeltaResult<()> { let engine = SyncEngine::new(); - let v2_checkpoint_read_schema = get_log_schema().project(&[METADATA_NAME]).unwrap(); + let v2_checkpoint_read_schema = get_log_schema().project(&[METADATA_NAME])?; let mut mock_table = LocalMockTable::new(); mock_table .parquet_checkpoint( @@ -899,37 +900,35 @@ async fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schem let checkpoint_one_file = mock_table .log_root() - .join("00001.checkpoint.parquet") - .unwrap() + .join("00001.checkpoint.parquet")? .to_string(); - let stream = LogSegment::create_checkpoint_stream( + let mut iter = LogSegment::create_checkpoint_stream( &engine, v2_checkpoint_read_schema.clone(), None, vec![create_file_meta(&checkpoint_one_file)], mock_table.log_root(), - ) - .unwrap(); - - let mut iter = stream.into_iter(); + )? + .into_iter(); // Assert that the first batch returned is from reading checkpoint file 1 - let (first_batch, is_log_batch) = iter.next().unwrap().unwrap(); + let (first_batch, is_log_batch) = iter.next().unwrap()?; assert!(!is_log_batch); - assert_eq!( - into_record_batch(first_batch), - into_record_batch(add_batch_simple(v2_checkpoint_read_schema.clone())) + assert_batch_matches( + first_batch, + add_batch_simple(v2_checkpoint_read_schema.clone()), ); - - // No extra batches should exist assert!(iter.next().is_none()); + + Ok(()) } #[tokio::test] -async fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_is_multi_part() { +async fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_is_multi_part( +) -> DeltaResult<()> { let engine = SyncEngine::new(); - let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME]).unwrap(); + let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; let mut mock_table = LocalMockTable::new(); // Multi-part checkpoints can never have sidecar actions. @@ -956,17 +955,15 @@ async fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_ let checkpoint_one_file = mock_table .log_root() - .join("00001.checkpoint.1.2.parquet") - .unwrap() + .join("00001.checkpoint.1.2.parquet")? .to_string(); let checkpoint_two_file = mock_table .log_root() - .join("00001.checkpoint.2.2.parquet") - .unwrap() + .join("00001.checkpoint.2.2.parquet")? 
.to_string(); - let stream = LogSegment::create_checkpoint_stream( + let mut iter = LogSegment::create_checkpoint_stream( &engine, v2_checkpoint_read_schema.clone(), None, @@ -975,35 +972,24 @@ async fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_ create_file_meta(&checkpoint_two_file), ], mock_table.log_root(), - ) - .unwrap(); - - let mut iter = stream.into_iter(); - - // Assert that the first batch returned is from reading checkpoint file 1 - let (first_batch, is_log_batch) = iter.next().unwrap().unwrap(); - assert!(!is_log_batch); - assert_eq!( - into_record_batch(first_batch), - into_record_batch(sidecar_batch_with_given_paths( - vec!["sidecar1.parquet"], - v2_checkpoint_read_schema.clone(), - )) - ); - - // Assert that the first batch returned is from reading checkpoint file 2 - let (second_batch, is_log_batch) = iter.next().unwrap().unwrap(); - assert!(!is_log_batch); - assert_eq!( - into_record_batch(second_batch), - into_record_batch(sidecar_batch_with_given_paths( - vec!["sidecar2.parquet"], - v2_checkpoint_read_schema.clone(), - )) - ); - - // No extra batches should exist + )? + .into_iter(); + + // Assert the correctness of batches returned + for expected_sidecar in ["sidecar1.parquet", "sidecar2.parquet"].iter() { + let (batch, is_log_batch) = iter.next().unwrap()?; + assert!(!is_log_batch); + assert_batch_matches( + batch, + sidecar_batch_with_given_paths( + vec![expected_sidecar], + v2_checkpoint_read_schema.clone(), + ), + ); + } assert!(iter.next().is_none()); + + Ok(()) } // Test showcases weird behavior where reading a batch with a missing column causes the reader to @@ -1011,9 +997,9 @@ async fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_ // empty-string path fields after visiting the batch with the SidecarVisitor due to the batch being read with // the schema which includes the Sidecar column. #[tokio::test] -async fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch() { +async fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch() -> DeltaResult<()> { let engine = SyncEngine::new(); - let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME]).unwrap(); + let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; let mut mock_table = LocalMockTable::new(); mock_table @@ -1025,37 +1011,34 @@ async fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch() { let checkpoint_one_file = mock_table .log_root() - .join("00001.checkpoint.parquet") - .unwrap() + .join("00001.checkpoint.parquet")? .to_string(); - let stream = LogSegment::create_checkpoint_stream( + let mut iter = LogSegment::create_checkpoint_stream( &engine, v2_checkpoint_read_schema.clone(), None, vec![create_file_meta(&checkpoint_one_file)], mock_table.log_root(), - ) - .unwrap(); - - let mut iter = stream.into_iter(); + )? 
+ .into_iter(); // Assert that the first batch returned is from reading checkpoint file 1 - let (first_batch, is_log_batch) = iter.next().unwrap().unwrap(); + let (first_batch, is_log_batch) = iter.next().unwrap()?; assert!(!is_log_batch); - assert_eq!( - into_record_batch(first_batch), - into_record_batch(add_batch_simple(v2_checkpoint_read_schema.clone())) + assert_batch_matches( + first_batch, + add_batch_simple(v2_checkpoint_read_schema.clone()), ); - - // No extra batches should exist assert!(iter.next().is_none()); + + Ok(()) } #[tokio::test] -async fn test_create_checkpoint_stream_reads_json_checkpoint_batch() { +async fn test_create_checkpoint_stream_reads_json_checkpoint_batch() -> DeltaResult<()> { let engine = SyncEngine::new(); - let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME]).unwrap(); + let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; let mut mock_table = LocalMockTable::new(); mock_table @@ -1071,28 +1054,25 @@ async fn test_create_checkpoint_stream_reads_json_checkpoint_batch() { let checkpoint_one_file = mock_table .log_root() - .join("00001.checkpoint.json") - .unwrap() + .join("00001.checkpoint.json")? .to_string(); - let stream = LogSegment::create_checkpoint_stream( + let mut iter = LogSegment::create_checkpoint_stream( &engine, v2_checkpoint_read_schema.clone(), None, vec![create_file_meta(&checkpoint_one_file)], mock_table.log_root(), - ) - .unwrap(); - - let mut iter = stream.into_iter(); + )? + .into_iter(); // Assert that the first batch returned is from reading checkpoint file 1 - let (_first_batch, is_log_batch) = iter.next().unwrap().unwrap(); + let (_first_batch, is_log_batch) = iter.next().unwrap()?; assert!(!is_log_batch); // TODO: Convert JSON checkpoint to RecordBatch and assert that it is as expected - - // No extra batches should exist assert!(iter.next().is_none()); + + Ok(()) } // More integration test than unit test below: @@ -1100,9 +1080,10 @@ async fn test_create_checkpoint_stream_reads_json_checkpoint_batch() { // Encapsulates logic that has already been tested but tests the interaction between the functions, // such as performing a map operation on the returned sidecar batches to include the is_log_batch flag #[tokio::test] -async fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batches() { +async fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batches( +) -> DeltaResult<()> { let engine = SyncEngine::new(); - let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME]).unwrap(); + let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; let sidecar_schema = get_log_add_schema(); let mut mock_table = LocalMockTable::new(); @@ -1110,7 +1091,7 @@ async fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar .parquet_checkpoint( sidecar_batch_with_given_paths( vec!["sidecarfile1.parquet", "sidecarfile2.parquet"], - get_log_schema().project(&[ADD_NAME, SIDECAR_NAME]).unwrap(), + get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?, ), "00001.checkpoint.parquet", ) @@ -1130,37 +1111,29 @@ async fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar let checkpoint_one_file = mock_table .log_root() - .join("00001.checkpoint.parquet") - .unwrap() + .join("00001.checkpoint.parquet")? 
.to_string(); - let stream = LogSegment::create_checkpoint_stream( + let mut iter = LogSegment::create_checkpoint_stream( &engine, v2_checkpoint_read_schema.clone(), None, vec![create_file_meta(&checkpoint_one_file)], mock_table.log_root(), - ) - .unwrap(); - - let mut iter = stream.into_iter(); + )? + .into_iter(); // Assert that the first batch returned is from reading sidecarfile1 - let (first_batch, is_log_batch) = iter.next().unwrap().unwrap(); + let (first_batch, is_log_batch) = iter.next().unwrap()?; assert!(!is_log_batch); - assert_eq!( - into_record_batch(first_batch), - into_record_batch(add_batch_simple(sidecar_schema.clone())) - ); + assert_batch_matches(first_batch, add_batch_simple(sidecar_schema.clone())); // Assert that the second batch returned is from reading sidecarfile2 - let (second_batch, is_log_batch) = iter.next().unwrap().unwrap(); + let (second_batch, is_log_batch) = iter.next().unwrap()?; assert!(!is_log_batch); - assert_eq!( - into_record_batch(second_batch), - into_record_batch(add_batch_with_remove(sidecar_schema.clone())) - ); + assert_batch_matches(second_batch, add_batch_with_remove(sidecar_schema.clone())); - // No extra batches should exist assert!(iter.next().is_none()); + + Ok(()) } diff --git a/kernel/src/utils.rs b/kernel/src/utils.rs index 4bae71e50..7f7b24c0e 100644 --- a/kernel/src/utils.rs +++ b/kernel/src/utils.rs @@ -155,9 +155,13 @@ pub(crate) mod test_utils { /// Try to convert an `EngineData` into a `RecordBatch`. Panics if not using `ArrowEngineData` from /// the default module - pub(crate) fn into_record_batch(engine_data: Box) -> RecordBatch { + fn into_record_batch(engine_data: Box) -> RecordBatch { ArrowEngineData::try_from_engine_data(engine_data) .unwrap() .into() } + + pub(crate) fn assert_batch_matches(actual: Box, expected: Box) { + assert_eq!(into_record_batch(actual), into_record_batch(expected)); + } } From c2ed82e8386ee245109d8acec841a539ad8e56b6 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 7 Feb 2025 11:50:53 -0800 Subject: [PATCH 06/37] remove snapshot creation with v2checkpoints --- kernel/src/actions/mod.rs | 8 +-- kernel/src/log_segment.rs | 8 +-- kernel/src/log_segment/tests.rs | 87 ++++--------------------------- kernel/src/path.rs | 20 ++++--- kernel/src/table_changes/mod.rs | 2 +- kernel/src/table_configuration.rs | 4 +- kernel/src/table_features/mod.rs | 7 --- kernel/tests/golden_tables.rs | 4 +- 8 files changed, 37 insertions(+), 103 deletions(-) diff --git a/kernel/src/actions/mod.rs b/kernel/src/actions/mod.rs index 3e7d36438..8bcb5df50 100644 --- a/kernel/src/actions/mod.rs +++ b/kernel/src/actions/mod.rs @@ -783,7 +783,7 @@ mod tests { } #[test] - fn test_v2_checkpoint_supported() { + fn test_v2_checkpoint_unsupported() { let protocol = Protocol::try_new( 3, 7, @@ -791,7 +791,7 @@ mod tests { Some([ReaderFeatures::V2Checkpoint]), ) .unwrap(); - assert!(protocol.ensure_read_supported().is_ok()); + assert!(protocol.ensure_read_supported().is_err()); let protocol = Protocol::try_new( 4, @@ -821,7 +821,7 @@ mod tests { Some(&empty_features), ) .unwrap(); - assert!(protocol.ensure_read_supported().is_ok()); + assert!(protocol.ensure_read_supported().is_err()); let protocol = Protocol::try_new( 3, @@ -839,7 +839,7 @@ mod tests { Some([WriterFeatures::V2Checkpoint]), ) .unwrap(); - assert!(protocol.ensure_read_supported().is_ok()); + assert!(protocol.ensure_read_supported().is_err()); let protocol = Protocol { min_reader_version: 1, diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs index 
0c251349a..39dfb9aa7 100644 --- a/kernel/src/log_segment.rs +++ b/kernel/src/log_segment.rs @@ -242,13 +242,13 @@ impl LogSegment { /// Returns an iterator over checkpoint data, processing sidecar files when necessary. /// /// Checkpoint data is returned directly if: - /// - Processing multi-part checkpoints - /// - Schema doesn't contain file actions + /// - Processing a multi-part checkpoint + /// - Schema does not contain file actions /// /// For single-part checkpoints, any referenced sidecar files are processed. These /// sidecar files contain the actual add/remove actions that would otherwise be - /// stored directly in the checkpoint, providing a more efficient storage mechanism - /// for large change sets. + /// stored directly in the checkpoint. The sidecar file batches replace the checkpoint + /// batch in the top level iterator to be returned. fn create_checkpoint_stream( engine: &dyn Engine, checkpoint_read_schema: SchemaRef, diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index 03e97fd9b..f66846cfb 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -115,7 +115,7 @@ fn build_log_with_paths_and_checkpoint( } #[test] -fn build_snapshot_with_uuid_checkpoint_parquet() { +fn build_snapshot_with_unsupported_uuid_checkpoint() { let (client, log_root) = build_log_with_paths_and_checkpoint( &[ delta_path_for_version(0, "json"), @@ -130,88 +130,18 @@ fn build_snapshot_with_uuid_checkpoint_parquet() { ], None, ); - let log_segment = LogSegment::for_snapshot(client.as_ref(), log_root, None, None).unwrap(); let commit_files = log_segment.ascending_commit_files; let checkpoint_parts = log_segment.checkpoint_parts; assert_eq!(checkpoint_parts.len(), 1); - assert_eq!(checkpoint_parts[0].version, 5); - - let versions = commit_files.into_iter().map(|x| x.version).collect_vec(); - let expected_versions = vec![6, 7]; - assert_eq!(versions, expected_versions); -} - -#[test] -fn build_snapshot_with_uuid_checkpoint_json() { - let (client, log_root) = build_log_with_paths_and_checkpoint( - &[ - delta_path_for_version(0, "json"), - delta_path_for_version(1, "checkpoint.parquet"), - delta_path_for_version(2, "json"), - delta_path_for_version(3, "checkpoint.parquet"), - delta_path_for_version(4, "json"), - delta_path_for_version(5, "json"), - delta_path_for_version(5, "checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.json"), - delta_path_for_version(6, "json"), - delta_path_for_version(7, "json"), - ], - None, - ); - - let log_segment = LogSegment::for_snapshot(client.as_ref(), log_root, None, None).unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; - - assert_eq!(checkpoint_parts.len(), 1); - assert_eq!(checkpoint_parts[0].version, 5); + assert_eq!(checkpoint_parts[0].version, 3); let versions = commit_files.into_iter().map(|x| x.version).collect_vec(); - let expected_versions = vec![6, 7]; + let expected_versions = vec![4, 5, 6, 7]; assert_eq!(versions, expected_versions); } -#[test] -fn build_snapshot_with_correct_last_uuid_checkpoint() { - let checkpoint_metadata = CheckpointMetadata { - version: 5, - size: 10, - parts: Some(1), - size_in_bytes: None, - num_of_add_files: None, - checkpoint_schema: None, - checksum: None, - }; - - let (client, log_root) = build_log_with_paths_and_checkpoint( - &[ - delta_path_for_version(0, "json"), - delta_path_for_version(1, "checkpoint.parquet"), - delta_path_for_version(1, "json"), - delta_path_for_version(2, "json"), - 
delta_path_for_version(3, "checkpoint.parquet"), - delta_path_for_version(3, "json"), - delta_path_for_version(4, "json"), - delta_path_for_version(5, "checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.parquet"), - delta_path_for_version(5, "json"), - delta_path_for_version(6, "json"), - delta_path_for_version(7, "json"), - ], - Some(&checkpoint_metadata), - ); - - let log_segment = - LogSegment::for_snapshot(client.as_ref(), log_root, checkpoint_metadata, None).unwrap(); - let commit_files = log_segment.ascending_commit_files; - let checkpoint_parts = log_segment.checkpoint_parts; - - assert_eq!(checkpoint_parts.len(), 1); - assert_eq!(commit_files.len(), 2); - assert_eq!(checkpoint_parts[0].version, 5); - assert_eq!(commit_files[0].version, 6); - assert_eq!(commit_files[1].version, 7); -} #[test] fn build_snapshot_with_multiple_incomplete_multipart_checkpoints() { let (client, log_root) = build_log_with_paths_and_checkpoint( @@ -711,6 +641,8 @@ fn test_sidecar_to_filemeta() -> DeltaResult<()> { // This forms a valid URL but represents an invalid path since sidecar files // are restricted to the `_delta_log/_sidecars` directory. // Attempting to read this file will fail because it does not exist. + + // TODO: Catch this error earlier to return a more informative error message. ( "test/test/example.parquet", "file:///var/_delta_log/_sidecars/test/test/example.parquet", @@ -884,7 +816,7 @@ fn test_create_checkpoint_stream_errors_when_schema_has_add_but_no_sidecar_actio } // TODO: Use a batch of sidecar actions to test that we do not visit batches when the schema has no file actions -// View multi_part checkpoint test for more details. +// View multi-part checkpoint test for more details. #[tokio::test] async fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schema_has_no_file_actions( ) -> DeltaResult<()> { @@ -1078,7 +1010,8 @@ async fn test_create_checkpoint_stream_reads_json_checkpoint_batch() -> DeltaRes // More integration test than unit test below: // Encapsulates logic that has already been tested but tests the interaction between the functions, -// such as performing a map operation on the returned sidecar batches to include the is_log_batch flag +// such as performing a map operation on the returned sidecar batches from `process_single_checkpoint_batch` +// to include the is_log_batch flag #[tokio::test] async fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batches( ) -> DeltaResult<()> { @@ -1109,7 +1042,7 @@ async fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar ) .await; - let checkpoint_one_file = mock_table + let checkpoint_file_path = mock_table .log_root() .join("00001.checkpoint.parquet")? .to_string(); @@ -1118,7 +1051,7 @@ async fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar &engine, v2_checkpoint_read_schema.clone(), None, - vec![create_file_meta(&checkpoint_one_file)], + vec![create_file_meta(&checkpoint_file_path)], mock_table.log_root(), )? .into_iter(); diff --git a/kernel/src/path.rs b/kernel/src/path.rs index 200eb0d09..23e7819de 100644 --- a/kernel/src/path.rs +++ b/kernel/src/path.rs @@ -166,9 +166,7 @@ impl ParsedLogPath { // TODO: Include UuidCheckpoint once we actually support v2 checkpoints matches!( self.file_type, - LogPathFileType::SinglePartCheckpoint - | LogPathFileType::MultiPartCheckpoint { .. } - | LogPathFileType::UuidCheckpoint(_) + LogPathFileType::SinglePartCheckpoint | LogPathFileType::MultiPartCheckpoint { .. 
} ) } @@ -176,7 +174,11 @@ impl ParsedLogPath { #[cfg_attr(not(feature = "developer-visibility"), visibility::make(pub(crate)))] #[allow(dead_code)] // currently only used in tests, which don't "count" fn is_unknown(&self) -> bool { - matches!(self.file_type, LogPathFileType::Unknown) + // TODO: Stop treating UuidCheckpoint as unknown once we support v2 checkpoints + matches!( + self.file_type, + LogPathFileType::Unknown | LogPathFileType::UuidCheckpoint(_) + ) } } @@ -355,7 +357,10 @@ mod tests { LogPathFileType::UuidCheckpoint(ref u) if u == "3a0d65cd-4056-49b8-937b-95f9e3ee90e5", )); assert!(!log_path.is_commit()); - assert!(log_path.is_checkpoint()); + + // TODO: Support v2 checkpoints! Until then we can't treat these as checkpoint files. + assert!(!log_path.is_checkpoint()); + assert!(log_path.is_unknown()); let log_path = table_log_dir .join("00000000000000000002.checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.json") @@ -372,7 +377,10 @@ mod tests { LogPathFileType::UuidCheckpoint(ref u) if u == "3a0d65cd-4056-49b8-937b-95f9e3ee90e5", )); assert!(!log_path.is_commit()); - assert!(log_path.is_checkpoint()); + + // TODO: Support v2 checkpoints! Until then we can't treat these as checkpoint files. + assert!(!log_path.is_checkpoint()); + assert!(log_path.is_unknown()); let log_path = table_log_dir .join("00000000000000000002.checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.foo") diff --git a/kernel/src/table_changes/mod.rs b/kernel/src/table_changes/mod.rs index e65b0ae53..c80745d3e 100644 --- a/kernel/src/table_changes/mod.rs +++ b/kernel/src/table_changes/mod.rs @@ -117,7 +117,7 @@ pub struct TableChanges { } impl TableChanges { - /// Creates a new [`TableChanges`] instance for the given version range. This function checks +/// Creates a new [`TableChanges`] instance for the given version range. This function checks /// these properties: /// - The change data feed table feature must be enabled in both the start or end versions. /// - Other than the deletion vector reader feature, no other reader features are enabled for the table. 
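For context on the `is_checkpoint`/`is_unknown` flips above: the parser distinguishes checkpoint files purely by file-name shape. Below is a rough standalone classifier for the UUID-named (v2) form, illustrative only and not the kernel's `ParsedLogPath` logic:

// File-name shapes referenced in the tests above:
//   00000000000000000002.checkpoint.parquet                        single-part
//   00000000000000000002.checkpoint.0000000001.0000000002.parquet  multi-part
//   00000000000000000002.checkpoint.<uuid>.parquet (or .json)      uuid / v2
fn is_uuid_checkpoint_name(name: &str) -> bool {
    let parts: Vec<&str> = name.split('.').collect();
    matches!(
        parts.as_slice(),
        [version, "checkpoint", uuid, "parquet" | "json"]
            if version.len() == 20 && uuid.len() == 36
    )
}

fn main() {
    assert!(is_uuid_checkpoint_name(
        "00000000000000000002.checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.parquet"
    ));
    // Single-part names lack the extra token and stay a regular checkpoint.
    assert!(!is_uuid_checkpoint_name("00000000000000000002.checkpoint.parquet"));
}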
diff --git a/kernel/src/table_configuration.rs b/kernel/src/table_configuration.rs index 22447554b..565546d52 100644 --- a/kernel/src/table_configuration.rs +++ b/kernel/src/table_configuration.rs @@ -262,8 +262,8 @@ mod test { let protocol = Protocol::try_new( 3, 7, - Some([ReaderFeatures::UnsupportedFeature]), - Some([WriterFeatures::UnsupportedFeature]), + Some([ReaderFeatures::V2Checkpoint]), + Some([WriterFeatures::V2Checkpoint]), ) .unwrap(); let table_root = Url::try_from("file:///").unwrap(); diff --git a/kernel/src/table_features/mod.rs b/kernel/src/table_features/mod.rs index bc5499501..ee27fc17e 100644 --- a/kernel/src/table_features/mod.rs +++ b/kernel/src/table_features/mod.rs @@ -48,8 +48,6 @@ pub enum ReaderFeatures { /// vacuumProtocolCheck ReaderWriter feature ensures consistent application of reader and writer /// protocol checks during VACUUM operations VacuumProtocolCheck, - /// A dummy variant used to represent an unsupported feature for testing purposes - UnsupportedFeature, } /// Similar to reader features, writer features communicate capabilities that must be implemented @@ -111,8 +109,6 @@ pub enum WriterFeatures { /// vacuumProtocolCheck ReaderWriter feature ensures consistent application of reader and writer /// protocol checks during VACUUM operations VacuumProtocolCheck, - /// A dummy variant used to represent an unsupported feature for testing purposes - UnsupportedFeature, } impl From for String { @@ -137,7 +133,6 @@ pub(crate) static SUPPORTED_READER_FEATURES: LazyLock> = ReaderFeatures::TypeWidening, ReaderFeatures::TypeWideningPreview, ReaderFeatures::VacuumProtocolCheck, - ReaderFeatures::V2Checkpoint, ]) }); @@ -159,7 +154,6 @@ mod tests { (ReaderFeatures::TypeWideningPreview, "typeWidening-preview"), (ReaderFeatures::V2Checkpoint, "v2Checkpoint"), (ReaderFeatures::VacuumProtocolCheck, "vacuumProtocolCheck"), - (ReaderFeatures::UnsupportedFeature, "unsupportedFeature"), ]; assert_eq!(ReaderFeatures::VARIANTS.len(), cases.len()); @@ -198,7 +192,6 @@ mod tests { (WriterFeatures::IcebergCompatV1, "icebergCompatV1"), (WriterFeatures::IcebergCompatV2, "icebergCompatV2"), (WriterFeatures::VacuumProtocolCheck, "vacuumProtocolCheck"), - (WriterFeatures::UnsupportedFeature, "unsupportedFeature"), ]; assert_eq!(WriterFeatures::VARIANTS.len(), cases.len()); diff --git a/kernel/tests/golden_tables.rs b/kernel/tests/golden_tables.rs index a7c271432..58b1ab187 100644 --- a/kernel/tests/golden_tables.rs +++ b/kernel/tests/golden_tables.rs @@ -409,8 +409,8 @@ golden_test!("time-travel-start", latest_snapshot_test); golden_test!("time-travel-start-start20", latest_snapshot_test); golden_test!("time-travel-start-start20-start40", latest_snapshot_test); -golden_test!("v2-checkpoint-json", latest_snapshot_test); -golden_test!("v2-checkpoint-parquet", latest_snapshot_test); +skip_test!("v2-checkpoint-json": "v2 checkpoint not supported"); +skip_test!("v2-checkpoint-parquet": "v2 checkpoint not supported"); // BUG: // - AddFile: 'file:/some/unqualified/absolute/path' // - RemoveFile: '/some/unqualified/absolute/path' From d5ee495655b97915a37215efb0b89b61752243d7 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 7 Feb 2025 15:31:39 -0800 Subject: [PATCH 07/37] refactors --- kernel/src/log_segment.rs | 67 +++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs index 39dfb9aa7..6b4102d90 100644 --- a/kernel/src/log_segment.rs +++ b/kernel/src/log_segment.rs @@ -224,17 
+224,19 @@ impl LogSegment { .iter() .map(|f| f.location.clone()) .collect(); - let checkpoint_stream = if checkpoint_parts.is_empty() { - Left(None.into_iter()) - } else { - Right(Self::create_checkpoint_stream( - engine, - checkpoint_read_schema, - meta_predicate, - checkpoint_parts, - self.log_root.clone(), - )?) - }; + let checkpoint_stream = (!checkpoint_parts.is_empty()) + .then(|| { + Self::create_checkpoint_stream( + engine, + checkpoint_read_schema, + meta_predicate, + checkpoint_parts, + self.log_root.clone(), + ) + }) + .transpose()? + .into_iter() + .flatten(); Ok(commit_stream.chain(checkpoint_stream)) } @@ -290,24 +292,27 @@ impl LogSegment { // Process checkpoint batches with potential sidecar file references that need to be read // to extract add/remove actions from the sidecar files. - Ok(Right( - actions - // Flatten the new batches returned. The new batches could be: - // - the checkpoint batch itself if no sidecar actions are present in the batch - // - one or more sidecar batches read from the sidecar files if sidecar actions are present - .flat_map(move |batch_result| match batch_result { - Ok(checkpoint_batch) => Right( - Self::process_single_checkpoint_batch( - parquet_handler.clone(), - log_root.clone(), - checkpoint_batch, - ) - .map_or_else(|e| Left(std::iter::once(Err(e))), Right) - .map_ok(|batch| (batch, false)), - ), + let actions_iter = actions + // Flatten the new batches returned. The new batches could be: + // - the checkpoint batch itself if no sidecar actions are present in the batch + // - one or more sidecar batches read from the sidecar files if sidecar actions are present + .flat_map(move |batch_result| { + let checkpoint_batch = match batch_result { + Ok(batch) => batch, + Err(e) => return Left(std::iter::once(Err(e))), + }; + + match Self::process_single_checkpoint_batch( + parquet_handler.clone(), + log_root.clone(), + checkpoint_batch, + ) { + Ok(iter) => Right(iter.map(|result| result.map(|batch| (batch, false)))), Err(e) => Left(std::iter::once(Err(e))), - }), - )) + } + }); + + return Ok(Right(actions_iter)); } fn process_single_checkpoint_batch( @@ -326,18 +331,18 @@ impl LogSegment { } // Convert sidecar actions to sidecar file paths - let sidecar_files: Result, _> = visitor + let sidecar_files: Vec<_> = visitor .sidecars .iter() .map(|sidecar| Self::sidecar_to_filemeta(sidecar, &log_root)) - .collect(); + .try_collect()?; let sidecar_read_schema = get_log_add_schema().clone(); // If sidecars files exist, read the sidecar files and return the iterator of sidecar batches // to replace the checkpoint batch in the top level iterator Ok(Right(parquet_handler.read_parquet_files( - &sidecar_files?, + &sidecar_files, sidecar_read_schema, None, )?)) From f5bbf00185138a61e42dff627e8b1c5567bd58be Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 7 Feb 2025 20:13:23 -0800 Subject: [PATCH 08/37] leverage flatten_ok() --- kernel/src/log_segment.rs | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs index 6b4102d90..c8302ca3e 100644 --- a/kernel/src/log_segment.rs +++ b/kernel/src/log_segment.rs @@ -296,21 +296,19 @@ impl LogSegment { // Flatten the new batches returned. 
The new batches could be: // - the checkpoint batch itself if no sidecar actions are present in the batch // - one or more sidecar batches read from the sidecar files if sidecar actions are present - .flat_map(move |batch_result| { - let checkpoint_batch = match batch_result { - Ok(batch) => batch, - Err(e) => return Left(std::iter::once(Err(e))), - }; + // After flattenning, perform a map operation to return the actions and a boolean flag indicating + // that the batch was not read from a log file. + .map(move |batch_result| { + let checkpoint_batch = batch_result?; - match Self::process_single_checkpoint_batch( + Self::process_single_checkpoint_batch( parquet_handler.clone(), log_root.clone(), checkpoint_batch, - ) { - Ok(iter) => Right(iter.map(|result| result.map(|batch| (batch, false)))), - Err(e) => Left(std::iter::once(Err(e))), - } - }); + ) + }) + .flatten_ok() + .map(|result| result.and_then(|inner| inner.map(|actions| (actions, false)))); return Ok(Right(actions_iter)); } From 65abd95c10952811ed4d7ba623171dd659077351 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 9 Feb 2025 19:52:15 -0800 Subject: [PATCH 09/37] address feedback --- kernel/src/log_segment.rs | 185 ++++++++++++++++---------------- kernel/src/log_segment/tests.rs | 167 +++++++++++++++------------- kernel/src/scan/mod.rs | 3 +- kernel/src/table_changes/mod.rs | 2 +- kernel/src/utils.rs | 4 +- 5 files changed, 192 insertions(+), 169 deletions(-) diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs index c8302ca3e..e596b38a1 100644 --- a/kernel/src/log_segment.rs +++ b/kernel/src/log_segment.rs @@ -11,13 +11,13 @@ use crate::schema::SchemaRef; use crate::snapshot::CheckpointMetadata; use crate::utils::require; use crate::{ - DeltaResult, Engine, EngineData, Error, Expression, ExpressionRef, FileDataReadResultIterator, - FileMeta, FileSystemClient, ParquetHandler, RowVisitor, Version, + DeltaResult, Engine, EngineData, Error, Expression, ExpressionRef, FileMeta, FileSystemClient, + ParquetHandler, RowVisitor, Version, }; -use itertools::Either::{Left, Right}; use itertools::Itertools; use std::collections::HashMap; use std::convert::identity; +use std::path::Path; use std::sync::{Arc, LazyLock}; use tracing::warn; use url::Url; @@ -219,24 +219,15 @@ impl LogSegment { .read_json_files(&commit_files, commit_read_schema, meta_predicate.clone())? .map_ok(|batch| (batch, true)); - let checkpoint_parts: Vec<_> = self - .checkpoint_parts - .iter() - .map(|f| f.location.clone()) - .collect(); - let checkpoint_stream = (!checkpoint_parts.is_empty()) - .then(|| { - Self::create_checkpoint_stream( - engine, - checkpoint_read_schema, - meta_predicate, - checkpoint_parts, - self.log_root.clone(), - ) - }) - .transpose()? - .into_iter() - .flatten(); + let checkpoint_stream = Self::create_checkpoint_stream( + engine, + checkpoint_read_schema, + meta_predicate, + self.checkpoint_parts.clone(), + self.log_root.clone(), + ) + .into_iter() + .flatten(); Ok(commit_stream.chain(checkpoint_stream)) } @@ -249,21 +240,15 @@ impl LogSegment { /// /// For single-part checkpoints, any referenced sidecar files are processed. These /// sidecar files contain the actual add/remove actions that would otherwise be - /// stored directly in the checkpoint. The sidecar file batches replace the checkpoint - /// batch in the top level iterator to be returned. + /// stored directly in the checkpoint. The sidecar file batches are chained to the + /// checkpoint batch in the top level iterator to be returned. 
fn create_checkpoint_stream( engine: &dyn Engine, checkpoint_read_schema: SchemaRef, meta_predicate: Option, - checkpoint_parts: Vec, + checkpoint_parts: Vec, log_root: Url, ) -> DeltaResult, bool)>> + Send> { - // We checked that checkpoint_parts is not empty before calling this function - require!( - !checkpoint_parts.is_empty(), - Error::generic("Checkpoint parts should not be empty") - ); - let need_file_actions = checkpoint_read_schema.contains(ADD_NAME); require!( !need_file_actions || checkpoint_read_schema.contains(SIDECAR_NAME), @@ -272,60 +257,74 @@ impl LogSegment { ) ); - let is_json_checkpoint = checkpoint_parts[0].location.path().ends_with(".json"); - let actions = Self::read_checkpoint_files( - engine, - checkpoint_parts.clone(), - checkpoint_read_schema.clone(), - meta_predicate, - is_json_checkpoint, - )?; + let checkpoint_file_meta: Vec = checkpoint_parts + .iter() + .map(|log| log.location.clone()) + .collect(); + + let actions = if checkpoint_parts + .first() + .is_some_and(|p| p.extension == "json") + { + engine.get_json_handler().read_json_files( + &checkpoint_file_meta, + checkpoint_read_schema, + meta_predicate, + )? + } else { + engine.get_parquet_handler().read_parquet_files( + &checkpoint_file_meta, + checkpoint_read_schema, + meta_predicate, + )? + }; // 1. In the case where the schema does not contain add/remove actions, we return the checkpoint // batch directly as sidecar files only have to be read when the schema contains add/remove actions. // 2. Multi-part checkpoint batches never have sidecar actions, so the batch is returned as-is. - if !need_file_actions || checkpoint_parts.len() > 1 { - return Ok(Left(actions.map_ok(|batch| (batch, false)))); - } + let skip_sidecar_search = !need_file_actions || checkpoint_parts.len() > 1; - let parquet_handler = engine.get_parquet_handler().clone(); - - // Process checkpoint batches with potential sidecar file references that need to be read - // to extract add/remove actions from the sidecar files. - let actions_iter = actions - // Flatten the new batches returned. The new batches could be: - // - the checkpoint batch itself if no sidecar actions are present in the batch - // - one or more sidecar batches read from the sidecar files if sidecar actions are present - // After flattenning, perform a map operation to return the actions and a boolean flag indicating - // that the batch was not read from a log file. + let parquet_handler = engine.get_parquet_handler(); + Ok(actions .map(move |batch_result| { - let checkpoint_batch = batch_result?; - - Self::process_single_checkpoint_batch( - parquet_handler.clone(), - log_root.clone(), - checkpoint_batch, - ) + batch_result.and_then(|checkpoint_batch| { + // This closure maps the checkpoint batch to an iterator of batches + // by chaining the checkpoint batch with sidecar batches if they exist. 
+ let sidecar_content = Self::process_sidecars( + parquet_handler.clone(), + log_root.clone(), + checkpoint_batch.as_ref(), + skip_sidecar_search, + )?; + + Ok(std::iter::once(Ok((checkpoint_batch, false))).chain( + sidecar_content + .into_iter() + .flatten() + .map_ok(|sidecar_batch| (sidecar_batch, false)), + )) + }) }) .flatten_ok() - .map(|result| result.and_then(|inner| inner.map(|actions| (actions, false)))); - - return Ok(Right(actions_iter)); + .map(|result| result?)) } - fn process_single_checkpoint_batch( + fn process_sidecars( parquet_handler: Arc, log_root: Url, - batch: Box, - ) -> DeltaResult>> + Send> { - let mut visitor = SidecarVisitor::default(); + batch: &dyn EngineData, + skip_sidecar_search: bool, + ) -> DeltaResult>> + Send>> { + if skip_sidecar_search { + return Ok(None); + } - // Collect sidecars - visitor.visit_rows_of(batch.as_ref())?; + let mut visitor = SidecarVisitor::default(); + visitor.visit_rows_of(batch)?; - // If there are no sidecars, return the batch as is + // If there are no sidecars, return None if visitor.sidecars.is_empty() { - return Ok(Left(std::iter::once(Ok(batch)))); + return Ok(None); } // Convert sidecar actions to sidecar file paths @@ -337,36 +336,38 @@ impl LogSegment { let sidecar_read_schema = get_log_add_schema().clone(); - // If sidecars files exist, read the sidecar files and return the iterator of sidecar batches - // to replace the checkpoint batch in the top level iterator - Ok(Right(parquet_handler.read_parquet_files( + // Read sidecar files and return as Some(Iterator) + Ok(Some(parquet_handler.read_parquet_files( &sidecar_files, sidecar_read_schema, None, )?)) } - // Helper function to read checkpoint files based on the file type - fn read_checkpoint_files( - engine: &dyn Engine, - checkpoint_parts: Vec, - schema: SchemaRef, - predicate: Option, - is_json: bool, - ) -> DeltaResult { - if is_json { - engine - .get_json_handler() - .read_json_files(&checkpoint_parts, schema, predicate) - } else { - engine - .get_parquet_handler() - .read_parquet_files(&checkpoint_parts, schema, predicate) + /// Convert a Sidecar record to a FileMeta. + /// + /// This helper first builds the URL by joining the provided log_root with + /// the "_sidecars/" folder and the given sidecar path. + /// + /// To catch a possible error earlier, if the sidecar.path is relative then + /// it is checked to ensure it is simply a file name (no directory components). + fn sidecar_to_filemeta(sidecar: &Sidecar, log_root: &Url) -> DeltaResult { + // If sidecar.path is relative (does not contain "://"), require that it is + // just a file name. This will catch cases like "test/test/example.parquet". + if !sidecar.path.contains("://") { + let path = Path::new(&sidecar.path); + // If there is any parent (i.e. directory component), consider it invalid. + if let Some(parent) = path.parent() { + // If the parent is not empty, then we have extra components. + if parent != Path::new("") { + return Err(Error::Generic(format!( + "Sidecar path '{}' is invalid: sidecar files must be in the `_delta_log/_sidecars/` directory as a file name only", + sidecar.path + ))); + } + } } - } - - // Helper function to convert a single sidecar action to a FileMeta - fn sidecar_to_filemeta(sidecar: &Sidecar, log_root: &Url) -> Result { + // Now build the full location by joining log_root, "_sidecars/", and the given path. 
let location = log_root.join("_sidecars/")?.join(&sidecar.path)?; Ok(FileMeta { location, diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index f66846cfb..87fa7ce9e 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -11,6 +11,7 @@ use crate::engine::default::executor::tokio::TokioBackgroundExecutor; use crate::engine::default::filesystem::ObjectStoreFileSystemClient; use crate::engine::sync::SyncEngine; use crate::log_segment::LogSegment; +use crate::path::ParsedLogPath; use crate::scan::test_utils::{ add_batch_simple, add_batch_with_remove, sidecar_batch_with_given_paths, }; @@ -641,12 +642,6 @@ fn test_sidecar_to_filemeta() -> DeltaResult<()> { // This forms a valid URL but represents an invalid path since sidecar files // are restricted to the `_delta_log/_sidecars` directory. // Attempting to read this file will fail because it does not exist. - - // TODO: Catch this error earlier to return a more informative error message. - ( - "test/test/example.parquet", - "file:///var/_delta_log/_sidecars/test/test/example.parquet", - ), ]; for (input_path, expected_url) in test_cases { let sidecar = Sidecar { @@ -666,27 +661,39 @@ fn test_sidecar_to_filemeta() -> DeltaResult<()> { ); } + let bad_sidecar = Sidecar { + // This is an invalid sidecar path because it is not in the `_delta_log/_sidecars` directory + path: "test/test/example.parquet".into(), + modification_time: 0, + size_in_bytes: 1000, + tags: None, + }; + let result = LogSegment::sidecar_to_filemeta(&bad_sidecar, &log_root); + + assert!(result.is_err()); + if let Err(e) = result { + assert!(e.to_string().contains("Sidecar path 'test/test/example.parquet' is invalid: sidecar files must be in the `_delta_log/_sidecars/` directory as a file name only")); + } + Ok(()) } #[test] -fn test_checkpoint_batch_with_no_sidecars_returns_checkpoint_batch_as_is() -> DeltaResult<()> { +fn test_checkpoint_batch_with_no_sidecars_returns_none() -> DeltaResult<()> { let engine = Arc::new(SyncEngine::new()); let log_root = Url::parse("s3://example-bucket/logs/")?; let checkpoint_batch = add_batch_simple(get_log_schema().clone()); - let mut iter = LogSegment::process_single_checkpoint_batch( + let mut iter = LogSegment::process_sidecars( engine.get_parquet_handler(), log_root, - checkpoint_batch, + checkpoint_batch.as_ref(), + false, )? - .into_iter(); + .into_iter() + .flatten(); - // Assert the correctness of batches returned - assert_batch_matches( - iter.next().unwrap()?, - add_batch_simple(get_log_schema().clone()), - ); + // Assert no batches are returned assert!(iter.next().is_none()); Ok(()) @@ -715,12 +722,14 @@ async fn test_checkpoint_batch_with_sidecars_returns_sidecar_batches() -> DeltaR get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?, ); - let mut iter = LogSegment::process_single_checkpoint_batch( + let mut iter = LogSegment::process_sidecars( engine.get_parquet_handler(), mock_table.log_root(), - checkpoint_batch, + checkpoint_batch.as_ref(), + false, )? - .into_iter(); + .into_iter() + .flatten(); // Assert the correctness of batches returned assert_batch_matches( @@ -745,13 +754,14 @@ fn test_checkpoint_batch_with_sidecar_files_that_do_not_exist() -> DeltaResult<( get_log_schema().clone(), ); - let result = LogSegment::process_single_checkpoint_batch( + let mut iter = LogSegment::process_sidecars( engine.get_parquet_handler(), mock_table.log_root(), - checkpoint_batch, - )?; - - let mut iter = result.into_iter(); + checkpoint_batch.as_ref(), + false, + )? 
+ .into_iter() + .flatten(); // Assert that an error is returned when trying to read sidecar files that do not exist let _ = iter.next().unwrap().is_err(); @@ -763,33 +773,33 @@ fn test_checkpoint_batch_with_sidecar_files_that_do_not_exist() -> DeltaResult<( } #[test] -fn test_create_checkpoint_stream_errors_when_checkpoint_parts_is_empty() -> DeltaResult<()> { +fn test_create_checkpoint_stream_returns_none_if_checkpoint_parts_is_empty() -> DeltaResult<()> { let engine = SyncEngine::new(); - let stream = LogSegment::create_checkpoint_stream( + let mut iter = LogSegment::create_checkpoint_stream( &engine, get_log_schema().clone(), None, vec![], Url::parse("s3://example-bucket/logs/")?, - ); + ) + .into_iter() + .flatten(); - assert!(stream.is_err()); - if let Err(e) = stream { - assert!(e - .to_string() - .contains("Checkpoint parts should not be empty")); - } + // Assert no batches are returned + assert!(iter.next().is_none()); Ok(()) } -fn create_file_meta(path: &str) -> FileMeta { - FileMeta { +fn create_log_path(path: &str) -> ParsedLogPath { + ParsedLogPath::try_from(FileMeta { location: Url::parse(path).expect("Invalid file URL"), last_modified: 0, size: 0, - } + }) + .unwrap() + .unwrap() } #[test] @@ -797,26 +807,26 @@ fn test_create_checkpoint_stream_errors_when_schema_has_add_but_no_sidecar_actio ) -> DeltaResult<()> { let engine = SyncEngine::new(); - let stream = LogSegment::create_checkpoint_stream( + // Create the stream over checkpoint batches. + let result = LogSegment::create_checkpoint_stream( &engine, get_log_add_schema().clone(), None, - vec![create_file_meta("file:///1.parquet")], + vec![create_log_path("file:///00000000000000000001.parquet")], Url::parse("s3://example-bucket/logs/")?, ); - assert!(stream.is_err()); - if let Err(e) = stream { + assert!(result.is_err(), "Expected an error, but got an Ok value."); + + if let Err(e) = result { assert!(e .to_string() .contains("If the checkpoint read schema contains file actions, it must contain the sidecar column")); - } + }; Ok(()) } -// TODO: Use a batch of sidecar actions to test that we do not visit batches when the schema has no file actions -// View multi-part checkpoint test for more details. #[tokio::test] async fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schema_has_no_file_actions( ) -> DeltaResult<()> { @@ -825,21 +835,25 @@ async fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schem let mut mock_table = LocalMockTable::new(); mock_table .parquet_checkpoint( - add_batch_simple(v2_checkpoint_read_schema.clone()), - "00001.checkpoint.parquet", + // Create a checkpoint batch with sidecar actions to verify that the sidecar actions are not read. + sidecar_batch_with_given_paths( + vec!["sidecar1.parquet"], + v2_checkpoint_read_schema.clone(), + ), + "00000000000000000001.checkpoint.parquet", ) .await; let checkpoint_one_file = mock_table .log_root() - .join("00001.checkpoint.parquet")? + .join("00000000000000000001.checkpoint.parquet")? .to_string(); let mut iter = LogSegment::create_checkpoint_stream( &engine, v2_checkpoint_read_schema.clone(), None, - vec![create_file_meta(&checkpoint_one_file)], + vec![create_log_path(&checkpoint_one_file)], mock_table.log_root(), )? 
.into_iter(); @@ -849,7 +863,7 @@ async fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schem assert!(!is_log_batch); assert_batch_matches( first_batch, - add_batch_simple(v2_checkpoint_read_schema.clone()), + sidecar_batch_with_given_paths(vec!["sidecar1.parquet"], v2_checkpoint_read_schema.clone()), ); assert!(iter.next().is_none()); @@ -864,15 +878,15 @@ async fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_ let mut mock_table = LocalMockTable::new(); // Multi-part checkpoints can never have sidecar actions. - // We place batches with sidecar actions present in multi-part checkpoints to verify we do not read the actions, - // as we should short-circuit and return the checkpoint batches as-is when encountering multi-part checkpoints. + // We place batches with sidecar actions in multi-part checkpoints to verify we do not read the actions, as we + // should instead short-circuit and return the checkpoint batches as-is when encountering multi-part checkpoints. mock_table .parquet_checkpoint( sidecar_batch_with_given_paths( vec!["sidecar1.parquet"], v2_checkpoint_read_schema.clone(), ), - "00001.checkpoint.1.2.parquet", + "00000000000000000001.checkpoint.0000000001.0000000002.parquet", ) .await; mock_table @@ -881,18 +895,18 @@ async fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_ vec!["sidecar2.parquet"], v2_checkpoint_read_schema.clone(), ), - "00001.checkpoint.2.2.parquet", + "00000000000000000001.checkpoint.0000000002.0000000002.parquet", ) .await; let checkpoint_one_file = mock_table .log_root() - .join("00001.checkpoint.1.2.parquet")? + .join("00000000000000000001.checkpoint.0000000001.0000000002.parquet")? .to_string(); let checkpoint_two_file = mock_table .log_root() - .join("00001.checkpoint.2.2.parquet")? + .join("00000000000000000001.checkpoint.0000000002.0000000002.parquet")? .to_string(); let mut iter = LogSegment::create_checkpoint_stream( @@ -900,8 +914,8 @@ async fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_ v2_checkpoint_read_schema.clone(), None, vec![ - create_file_meta(&checkpoint_one_file), - create_file_meta(&checkpoint_two_file), + create_log_path(&checkpoint_one_file), + create_log_path(&checkpoint_two_file), ], mock_table.log_root(), )? @@ -925,7 +939,7 @@ async fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_ } // Test showcases weird behavior where reading a batch with a missing column causes the reader to -// insert the empty-string in string fields. This is seen in this test where we find two instance of Sidecars with +// insert the empty-string in string fields. This is seen in this test where we find two instances of Sidecars with // empty-string path fields after visiting the batch with the SidecarVisitor due to the batch being read with // the schema which includes the Sidecar column. #[tokio::test] @@ -937,20 +951,20 @@ async fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch() -> Delta mock_table .parquet_checkpoint( add_batch_simple(v2_checkpoint_read_schema.clone()), - "00001.checkpoint.parquet", + "00000000000000000001.checkpoint.parquet", ) .await; let checkpoint_one_file = mock_table .log_root() - .join("00001.checkpoint.parquet")? + .join("00000000000000000001.checkpoint.parquet")? 
.to_string(); let mut iter = LogSegment::create_checkpoint_stream( &engine, v2_checkpoint_read_schema.clone(), None, - vec![create_file_meta(&checkpoint_one_file)], + vec![create_log_path(&checkpoint_one_file)], mock_table.log_root(), )? .into_iter(); @@ -980,20 +994,20 @@ async fn test_create_checkpoint_stream_reads_json_checkpoint_batch() -> DeltaRes data_change: true, ..Default::default() })], - "00001.checkpoint.json", + "00000000000000000001.checkpoint.json", ) .await; let checkpoint_one_file = mock_table .log_root() - .join("00001.checkpoint.json")? + .join("00000000000000000001.checkpoint.json")? .to_string(); let mut iter = LogSegment::create_checkpoint_stream( &engine, v2_checkpoint_read_schema.clone(), None, - vec![create_file_meta(&checkpoint_one_file)], + vec![create_log_path(&checkpoint_one_file)], mock_table.log_root(), )? .into_iter(); @@ -1007,10 +1021,8 @@ async fn test_create_checkpoint_stream_reads_json_checkpoint_batch() -> DeltaRes Ok(()) } -// More integration test than unit test below: - // Encapsulates logic that has already been tested but tests the interaction between the functions, -// such as performing a map operation on the returned sidecar batches from `process_single_checkpoint_batch` +// such as performing a map operation on the returned sidecar batches from `process_sidecars` // to include the is_log_batch flag #[tokio::test] async fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batches( @@ -1026,7 +1038,7 @@ async fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar vec!["sidecarfile1.parquet", "sidecarfile2.parquet"], get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?, ), - "00001.checkpoint.parquet", + "00000000000000000001.checkpoint.parquet", ) .await; mock_table @@ -1044,27 +1056,38 @@ async fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar let checkpoint_file_path = mock_table .log_root() - .join("00001.checkpoint.parquet")? + .join("00000000000000000001.checkpoint.parquet")? .to_string(); let mut iter = LogSegment::create_checkpoint_stream( &engine, v2_checkpoint_read_schema.clone(), None, - vec![create_file_meta(&checkpoint_file_path)], + vec![create_log_path(&checkpoint_file_path)], mock_table.log_root(), )? 
.into_iter(); - // Assert that the first batch returned is from reading sidecarfile1 + // Assert that the first batch returned is from reading checkpoint file 1 let (first_batch, is_log_batch) = iter.next().unwrap()?; assert!(!is_log_batch); - assert_batch_matches(first_batch, add_batch_simple(sidecar_schema.clone())); + assert_batch_matches( + first_batch, + sidecar_batch_with_given_paths( + vec!["sidecarfile1.parquet", "sidecarfile2.parquet"], + get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?, + ), + ); + + // Assert that the second batch returned is from reading sidecarfile1 + let (second_batch, is_log_batch) = iter.next().unwrap()?; + assert!(!is_log_batch); + assert_batch_matches(second_batch, add_batch_simple(sidecar_schema.clone())); // Assert that the second batch returned is from reading sidecarfile2 - let (second_batch, is_log_batch) = iter.next().unwrap()?; + let (third_batch, is_log_batch) = iter.next().unwrap()?; assert!(!is_log_batch); - assert_batch_matches(second_batch, add_batch_with_remove(sidecar_schema.clone())); + assert_batch_matches(third_batch, add_batch_with_remove(sidecar_schema.clone())); assert!(iter.next().is_none()); diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index dab4a7571..22f8c3a02 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -701,8 +701,7 @@ pub(crate) mod test_utils { .iter() .map(|path| { format!( - r#"{{"sidecar":{{"path":"{}","sizeInBytes":9268,"modificationTime":1714496113961,"tags":{{"tag_foo":"tag_bar"}}}}}}"#, - path + r#"{{"sidecar":{{"path":"{path}","sizeInBytes":9268,"modificationTime":1714496113961,"tags":{{"tag_foo":"tag_bar"}}}}}}"#, ) }) .collect::>() diff --git a/kernel/src/table_changes/mod.rs b/kernel/src/table_changes/mod.rs index c80745d3e..e65b0ae53 100644 --- a/kernel/src/table_changes/mod.rs +++ b/kernel/src/table_changes/mod.rs @@ -117,7 +117,7 @@ pub struct TableChanges { } impl TableChanges { -/// Creates a new [`TableChanges`] instance for the given version range. This function checks + /// Creates a new [`TableChanges`] instance for the given version range. This function checks /// these properties: /// - The change data feed table feature must be enabled in both the start or end versions. /// - Other than the deletion vector reader feature, no other reader features are enabled for the table. 
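The `.into_iter().flatten()` calls threaded through the tests above, and the `then(...).transpose()?` chain introduced in the earlier replay refactor, are the same `Option`-as-iterator idiom. A toy sketch of it, with a stand-in `make_stream` rather than any kernel API:

fn make_stream(parts: &[u32]) -> Result<impl Iterator<Item = u32> + '_, String> {
    Ok(parts.iter().copied())
}

fn stream_or_empty(parts: &[u32]) -> Result<Vec<u32>, String> {
    let items = (!parts.is_empty())
        .then(|| make_stream(parts)) // Option<Result<Iter, _>>
        .transpose()?                // -> Option<Iter>, propagating any error
        .into_iter()                 // Option yields its value 0 or 1 times...
        .flatten()                   // ...so None becomes an empty stream
        .collect();
    Ok(items)
}

fn main() {
    assert_eq!(stream_or_empty(&[]).unwrap(), Vec::<u32>::new());
    assert_eq!(stream_or_empty(&[1, 2]).unwrap(), vec![1, 2]);
}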
diff --git a/kernel/src/utils.rs b/kernel/src/utils.rs index 7f7b24c0e..1844566fe 100644 --- a/kernel/src/utils.rs +++ b/kernel/src/utils.rs @@ -92,7 +92,7 @@ pub(crate) mod test_utils { .map(|action| serde_json::to_string(&action).unwrap()) .join("\n"); - let path = format!("_delta_log/{filename:020}"); + let path = format!("_delta_log/{filename}"); self.store .put(&path::Path::from(path), data.into()) @@ -115,7 +115,7 @@ pub(crate) mod test_utils { writer.write(record_batch).unwrap(); writer.close().unwrap(); // writer must be closed to write footer - let path = format!("_delta_log/{filename:020}"); + let path = format!("_delta_log/{filename}"); self.store .put(&path::Path::from(path), buffer.into()) From 1c101ea32d9347f3209ded7bbda340788c5b8bf9 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 9 Feb 2025 20:17:00 -0800 Subject: [PATCH 10/37] assertions for error refactor --- kernel/src/log_segment/tests.rs | 93 ++++++++++++++++----------------- kernel/src/utils.rs | 20 +++++++ 2 files changed, 64 insertions(+), 49 deletions(-) diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index 87fa7ce9e..240275bb5 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -16,7 +16,9 @@ use crate::scan::test_utils::{ add_batch_simple, add_batch_with_remove, sidecar_batch_with_given_paths, }; use crate::snapshot::CheckpointMetadata; -use crate::utils::test_utils::{assert_batch_matches, Action, LocalMockTable}; +use crate::utils::test_utils::{ + assert_batch_matches, assert_error_contains, Action, LocalMockTable, +}; use crate::{DeltaResult, Engine, FileMeta, FileSystemClient, Table}; use test_utils::delta_path_for_version; @@ -628,7 +630,7 @@ fn table_changes_fails_with_larger_start_version_than_end() { assert!(log_segment_res.is_err()); } #[test] -fn test_sidecar_to_filemeta() -> DeltaResult<()> { +fn test_sidecar_to_filemeta_valid_paths() -> DeltaResult<()> { let log_root = Url::parse("file:///var/_delta_log/")?; let test_cases = [ ( @@ -639,42 +641,31 @@ fn test_sidecar_to_filemeta() -> DeltaResult<()> { "file:///var/_delta_log/_sidecars/example.parquet", "file:///var/_delta_log/_sidecars/example.parquet", ), - // This forms a valid URL but represents an invalid path since sidecar files - // are restricted to the `_delta_log/_sidecars` directory. - // Attempting to read this file will fail because it does not exist. 
]; - for (input_path, expected_url) in test_cases { - let sidecar = Sidecar { - path: input_path.into(), - modification_time: 0, - size_in_bytes: 1000, - tags: None, - }; - - let result = LogSegment::sidecar_to_filemeta(&sidecar, &log_root)?; + for (input_path, expected_url) in test_cases.iter() { + let sidecar = create_sidecar(*input_path); + let filemeta = LogSegment::sidecar_to_filemeta(&sidecar, &log_root)?; assert_eq!( - result.location.as_str(), - expected_url, + filemeta.location.as_str(), + *expected_url, "Mismatch for input path: {}", input_path ); } + Ok(()) +} - let bad_sidecar = Sidecar { - // This is an invalid sidecar path because it is not in the `_delta_log/_sidecars` directory - path: "test/test/example.parquet".into(), - modification_time: 0, - size_in_bytes: 1000, - tags: None, - }; +#[test] +fn test_sidecar_to_filemeta_invalid_path() -> DeltaResult<()> { + let log_root = Url::parse("file:///var/_delta_log/")?; + let bad_sidecar = create_sidecar("test/test/example.parquet"); let result = LogSegment::sidecar_to_filemeta(&bad_sidecar, &log_root); - assert!(result.is_err()); - if let Err(e) = result { - assert!(e.to_string().contains("Sidecar path 'test/test/example.parquet' is invalid: sidecar files must be in the `_delta_log/_sidecars/` directory as a file name only")); - } - + assert_error_contains( + result, + "Sidecar path 'test/test/example.parquet' is invalid: sidecar files must be in the `_delta_log/_sidecars/` directory", + ); Ok(()) } @@ -764,10 +755,7 @@ fn test_checkpoint_batch_with_sidecar_files_that_do_not_exist() -> DeltaResult<( .flatten(); // Assert that an error is returned when trying to read sidecar files that do not exist - let _ = iter.next().unwrap().is_err(); - if let Err(e) = iter.next().unwrap() { - assert!(e.to_string().contains("No such file or directory")); - } + assert_error_contains(iter.next().unwrap(), "No such file or directory"); Ok(()) } @@ -792,16 +780,6 @@ fn test_create_checkpoint_stream_returns_none_if_checkpoint_parts_is_empty() -> Ok(()) } -fn create_log_path(path: &str) -> ParsedLogPath { - ParsedLogPath::try_from(FileMeta { - location: Url::parse(path).expect("Invalid file URL"), - last_modified: 0, - size: 0, - }) - .unwrap() - .unwrap() -} - #[test] fn test_create_checkpoint_stream_errors_when_schema_has_add_but_no_sidecar_action( ) -> DeltaResult<()> { @@ -816,13 +794,10 @@ fn test_create_checkpoint_stream_errors_when_schema_has_add_but_no_sidecar_actio Url::parse("s3://example-bucket/logs/")?, ); - assert!(result.is_err(), "Expected an error, but got an Ok value."); - - if let Err(e) = result { - assert!(e - .to_string() - .contains("If the checkpoint read schema contains file actions, it must contain the sidecar column")); - }; + assert_error_contains( + result, + "If the checkpoint read schema contains file actions, it must contain the sidecar column", + ); Ok(()) } @@ -1093,3 +1068,23 @@ async fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar Ok(()) } + +/// Creates a Sidecar instance with default modification time, file size and no tags. 
+fn create_sidecar<P: Into<String>>(path: P) -> Sidecar {
+    Sidecar {
+        path: path.into(),
+        modification_time: 0,
+        size_in_bytes: 1000,
+        tags: None,
+    }
+}
+
+fn create_log_path(path: &str) -> ParsedLogPath {
+    ParsedLogPath::try_from(FileMeta {
+        location: Url::parse(path).expect("Invalid file URL"),
+        last_modified: 0,
+        size: 0,
+    })
+    .unwrap()
+    .unwrap()
+}
diff --git a/kernel/src/utils.rs b/kernel/src/utils.rs
index 1844566fe..d94029f97 100644
--- a/kernel/src/utils.rs
+++ b/kernel/src/utils.rs
@@ -161,7 +161,27 @@ pub(crate) mod test_utils {
             .into()
     }
 
+    /// Checks that two `EngineData` objects are equal by converting them to `RecordBatch` and comparing.
     pub(crate) fn assert_batch_matches(actual: Box<dyn EngineData>, expected: Box<dyn EngineData>) {
         assert_eq!(into_record_batch(actual), into_record_batch(expected));
     }
+
+    /// Checks that the error message from a Result contains the expected substring.
+    pub(crate) fn assert_error_contains<T, E: std::fmt::Display>(result: Result<T, E>, expected: &str) {
+        match result {
+            Ok(_) => panic!(
+                "Expected an error containing \"{}\", but the result was Ok",
+                expected
+            ),
+            Err(e) => {
+                let err_str = e.to_string();
+                if !err_str.contains(expected) {
+                    panic!(
+                        "Error \"{}\" did not contain expected substring \"{}\"",
+                        err_str, expected
+                    );
+                }
+            }
+        }
+    }
 }
From 7cc3c810ec4560851e175e7c6cf39c1317e16170 Mon Sep 17 00:00:00 2001
From: sebastian tia 
Date: Sun, 9 Feb 2025 20:41:43 -0800
Subject: [PATCH 11/37] remove redundant type conversions

---
 kernel/src/log_segment/tests.rs | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs
index 240275bb5..f47ea3ba5 100644
--- a/kernel/src/log_segment/tests.rs
+++ b/kernel/src/log_segment/tests.rs
@@ -830,8 +830,7 @@ async fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schem
         None,
         vec![create_log_path(&checkpoint_one_file)],
         mock_table.log_root(),
-    )?
-    .into_iter();
+    )?;
 
     // Assert that the first batch returned is from reading checkpoint file 1
     let (first_batch, is_log_batch) = iter.next().unwrap()?;
@@ -893,8 +892,7 @@ async fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_
             create_log_path(&checkpoint_two_file),
         ],
         mock_table.log_root(),
-    )?
-    .into_iter();
+    )?;
 
     // Assert the correctness of batches returned
     for expected_sidecar in ["sidecar1.parquet", "sidecar2.parquet"].iter() {
@@ -984,12 +982,14 @@ async fn test_create_checkpoint_stream_reads_json_checkpoint_batch() -> DeltaRes
         None,
         vec![create_log_path(&checkpoint_one_file)],
         mock_table.log_root(),
-    )?
-    .into_iter();
+    )?;
 
     // Assert that the first batch returned is from reading checkpoint file 1
     let (_first_batch, is_log_batch) = iter.next().unwrap()?;
     assert!(!is_log_batch);
+    // Although we do not assert the contents, we know the JSON checkpoint is read correctly as
+    // a single batch is returned and no errors are thrown.
+    // TODO: Convert JSON checkpoint to RecordBatch and assert that it is as expected
 
     assert!(iter.next().is_none());
 
@@ -1040,8 +1040,7 @@ async fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar
         None,
         vec![create_log_path(&checkpoint_file_path)],
         mock_table.log_root(),
-    )?
- .into_iter(); + )?; // Assert that the first batch returned is from reading checkpoint file 1 let (first_batch, is_log_batch) = iter.next().unwrap()?; From fc403c2f337ce16b933f20622ea6cc45afbddb8d Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 9 Feb 2025 21:05:19 -0800 Subject: [PATCH 12/37] refactor --- kernel/src/log_segment.rs | 11 ++++++++--- kernel/tests/golden_tables.rs | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs index e596b38a1..e11f9073d 100644 --- a/kernel/src/log_segment.rs +++ b/kernel/src/log_segment.rs @@ -291,6 +291,7 @@ impl LogSegment { // This closure maps the checkpoint batch to an iterator of batches // by chaining the checkpoint batch with sidecar batches if they exist. let sidecar_content = Self::process_sidecars( + // cheap Arc clone parquet_handler.clone(), log_root.clone(), checkpoint_batch.as_ref(), @@ -309,6 +310,10 @@ impl LogSegment { .map(|result| result?)) } + /// Processes sidecar files for the given checkpoint batch. + /// + /// This function extracts any sidecar file references from the provided batch. + /// Each sidecar file is read and an iterator of sidecar file batches is returned fn process_sidecars( parquet_handler: Arc, log_root: Url, @@ -319,15 +324,15 @@ impl LogSegment { return Ok(None); } + // Visit the rows of the checkpoint batch to extract sidecar file references let mut visitor = SidecarVisitor::default(); visitor.visit_rows_of(batch)?; - // If there are no sidecars, return None if visitor.sidecars.is_empty() { return Ok(None); } - // Convert sidecar actions to sidecar file paths + // Convert sidecar records to FileMeta for reading let sidecar_files: Vec<_> = visitor .sidecars .iter() @@ -336,7 +341,7 @@ impl LogSegment { let sidecar_read_schema = get_log_add_schema().clone(); - // Read sidecar files and return as Some(Iterator) + // Read the sidecar files and return an iterator of sidecar file batches Ok(Some(parquet_handler.read_parquet_files( &sidecar_files, sidecar_read_schema, diff --git a/kernel/tests/golden_tables.rs b/kernel/tests/golden_tables.rs index 58b1ab187..120271ef2 100644 --- a/kernel/tests/golden_tables.rs +++ b/kernel/tests/golden_tables.rs @@ -411,6 +411,7 @@ golden_test!("time-travel-start-start20-start40", latest_snapshot_test); skip_test!("v2-checkpoint-json": "v2 checkpoint not supported"); skip_test!("v2-checkpoint-parquet": "v2 checkpoint not supported"); + // BUG: // - AddFile: 'file:/some/unqualified/absolute/path' // - RemoveFile: '/some/unqualified/absolute/path' From 4d4e6011374bed0f57129d6c66d758444914bd79 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 9 Feb 2025 21:52:58 -0800 Subject: [PATCH 13/37] remove redundant .into_iter --- kernel/src/log_segment/tests.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index f47ea3ba5..5dd6912c2 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -939,8 +939,7 @@ async fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch() -> Delta None, vec![create_log_path(&checkpoint_one_file)], mock_table.log_root(), - )? 
-    .into_iter();
+    )?;
 
     // Assert that the first batch returned is from reading checkpoint file 1
     let (first_batch, is_log_batch) = iter.next().unwrap()?;
From 415c2f4b451e2778c26391882d2d9ed5438c34cb Mon Sep 17 00:00:00 2001
From: sebastian tia 
Date: Sun, 9 Feb 2025 22:28:49 -0800
Subject: [PATCH 14/37] handle errors from windows os

---
 kernel/src/log_segment/tests.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs
index 5dd6912c2..b9460d75e 100644
--- a/kernel/src/log_segment/tests.rs
+++ b/kernel/src/log_segment/tests.rs
@@ -755,7 +755,12 @@ fn test_checkpoint_batch_with_sidecar_files_that_do_not_exist() -> DeltaResult<()> {
     .flatten();
 
     // Assert that an error is returned when trying to read sidecar files that do not exist
-    assert_error_contains(iter.next().unwrap(), "No such file or directory");
+    let err = iter.next().unwrap();
+    if cfg!(windows) {
+        assert_error_contains(err, "The system cannot find the path specified");
+    } else {
+        assert_error_contains(err, "No such file or directory");
+    }
 
     Ok(())
 }
From 2547c4255c2c680256755c4b507c42fc0306a45d Mon Sep 17 00:00:00 2001
From: sebastian tia 
Date: Sun, 9 Feb 2025 23:49:35 -0800
Subject: [PATCH 15/37] remove unnecessary empty path check

---
 kernel/src/actions/mod.rs       |  1 -
 kernel/src/actions/visitors.rs  | 41 +--------------------------------
 kernel/src/log_segment/tests.rs |  8 ++++---
 3 files changed, 6 insertions(+), 44 deletions(-)

diff --git a/kernel/src/actions/mod.rs b/kernel/src/actions/mod.rs
index 8bcb5df50..02d30163a 100644
--- a/kernel/src/actions/mod.rs
+++ b/kernel/src/actions/mod.rs
@@ -515,7 +515,6 @@ pub struct SetTransaction {
 /// file actions. This action is only allowed in checkpoints following the V2 spec.
 ///
 /// [More info]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#sidecar-file-information
-#[allow(unused)] //TODO: Remove once we implement V2 checkpoint file processing
 #[derive(Schema, Debug, PartialEq)]
 #[cfg_attr(feature = "developer-visibility", visibility::make(pub))]
 pub(crate) struct Sidecar {
diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs
index be6242250..095ae89e1 100644
--- a/kernel/src/actions/visitors.rs
+++ b/kernel/src/actions/visitors.rs
@@ -438,7 +438,6 @@ impl RowVisitor for SetTransactionVisitor {
     }
 }
 
-#[allow(unused)] //TODO: Remove once we implement V2 checkpoint file processing
 #[derive(Default)]
 #[cfg_attr(feature = "developer-visibility", visibility::make(pub))]
 pub(crate) struct SidecarVisitor {
@@ -476,14 +475,7 @@ impl RowVisitor for SidecarVisitor {
         );
         for i in 0..row_count {
             // Since path column is required, use it to detect presence of a sidecar action
-            let opt_path: Option<String> = getters[0].get_opt(i, "sidecar.path")?;
-            if let Some(path) = opt_path {
-                // We read checkpoint batches with the sidecar action. This results in empty paths
-                // if a row is not a sidecar action. We do not want to create a sidecar action for
-                // these rows.
-                if path.is_empty() {
-                    continue;
-                }
+            if let Some(path) = getters[0].get_opt(i, "cdc.path")?
{ self.sidecars.push(Self::visit_sidecar(i, path, getters)?); } } @@ -615,37 +607,6 @@ mod tests { Ok(()) } - #[test] - fn test_parse_sidecar_with_empty_string_as_path() -> DeltaResult<()> { - let engine = SyncEngine::new(); - let json_handler = engine.get_json_handler(); - let json_strings: StringArray = vec![ - r#"{"sidecar":{"path":"","sizeInBytes":9268,"modificationTime":1714496113961,"tags":null}}"#, - r#"{"sidecar":{"path":"016ae953-37a9-438e-8683-9a9a4a79a395.parquet","sizeInBytes":9268,"modificationTime":1714496113961,"tags":null}}"#, - - ] - .into(); - let output_schema = get_log_schema().clone(); - let batch = json_handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) - .unwrap(); - - let mut visitor = SidecarVisitor::default(); - visitor.visit_rows_of(batch.as_ref())?; - - let sidecar1 = Sidecar { - path: "016ae953-37a9-438e-8683-9a9a4a79a395.parquet".into(), - size_in_bytes: 9268, - modification_time: 1714496113961, - tags: None, - }; - - assert_eq!(visitor.sidecars.len(), 1); - assert_eq!(visitor.sidecars[0], sidecar1); - - Ok(()) - } - #[test] fn test_parse_metadata() -> DeltaResult<()> { let data = action_batch(); diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index b9460d75e..0b5fac5f9 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -921,14 +921,15 @@ async fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_ // empty-string path fields after visiting the batch with the SidecarVisitor due to the batch being read with // the schema which includes the Sidecar column. #[tokio::test] -async fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch() -> DeltaResult<()> { +async fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch_without_sidecars( +) -> DeltaResult<()> { let engine = SyncEngine::new(); let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; let mut mock_table = LocalMockTable::new(); mock_table .parquet_checkpoint( - add_batch_simple(v2_checkpoint_read_schema.clone()), + add_batch_simple(get_log_add_schema().clone()), "00000000000000000001.checkpoint.parquet", ) .await; @@ -959,7 +960,8 @@ async fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch() -> Delta } #[tokio::test] -async fn test_create_checkpoint_stream_reads_json_checkpoint_batch() -> DeltaResult<()> { +async fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidecars( +) -> DeltaResult<()> { let engine = SyncEngine::new(); let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; let mut mock_table = LocalMockTable::new(); From ea7349a90b43a77670b6f2459d1ac5b98c598de9 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 9 Feb 2025 23:52:23 -0800 Subject: [PATCH 16/37] typo --- kernel/src/actions/visitors.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 095ae89e1..d1af804ae 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -475,7 +475,7 @@ impl RowVisitor for SidecarVisitor { ); for i in 0..row_count { // Since path column is required, use it to detect presence of a sidecar action - if let Some(path) = getters[0].get_opt(i, "cdc.path")? { + if let Some(path) = getters[0].get_opt(i, "sidecar.path")? 
{
                 self.sidecars.push(Self::visit_sidecar(i, path, getters)?);
             }
         }
From 39a145136032fe5abc6262acf7d75f70594df2c9 Mon Sep 17 00:00:00 2001
From: sebastian tia 
Date: Sun, 9 Feb 2025 23:56:37 -0800
Subject: [PATCH 17/37] nits

---
 kernel/src/log_segment.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs
index e11f9073d..84cca730e 100644
--- a/kernel/src/log_segment.rs
+++ b/kernel/src/log_segment.rs
@@ -259,7 +259,7 @@ impl LogSegment {
         let checkpoint_file_meta: Vec<FileMeta> = checkpoint_parts
             .iter()
-            .map(|log| log.location.clone())
+            .map(|f: &ParsedLogPath| f.location.clone())
             .collect();
 
         let actions = if checkpoint_parts
@@ -320,6 +320,8 @@ impl LogSegment {
         batch: &dyn EngineData,
         skip_sidecar_search: bool,
     ) -> DeltaResult<Option<impl Iterator<Item = DeltaResult<Box<dyn EngineData>>> + Send>> {
+        // If the schema does not contain add/remove actions or we have a multi-part checkpoint,
+        // we return early as sidecar files are not needed.
         if skip_sidecar_search {
             return Ok(None);
         }
@@ -328,11 +330,11 @@ impl LogSegment {
         let mut visitor = SidecarVisitor::default();
         visitor.visit_rows_of(batch)?;
 
+        // If there are no sidecar files, return early
         if visitor.sidecars.is_empty() {
             return Ok(None);
         }
 
-        // Convert sidecar records to FileMeta for reading
         let sidecar_files: Vec<_> = visitor
             .sidecars
             .iter()
From ab9ef11254043a5adb59d022b7f1d64839606539 Mon Sep 17 00:00:00 2001
From: sebastian tia 
Date: Mon, 10 Feb 2025 10:09:05 -0800
Subject: [PATCH 18/37] infer type

---
 kernel/src/log_segment.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs
index 84cca730e..30a9ad1f5 100644
--- a/kernel/src/log_segment.rs
+++ b/kernel/src/log_segment.rs
@@ -259,7 +259,7 @@ impl LogSegment {
         let checkpoint_file_meta: Vec<FileMeta> = checkpoint_parts
             .iter()
-            .map(|f: &ParsedLogPath| f.location.clone())
+            .map(|f| f.location.clone())
             .collect();
From 302efed9a36fc9f08bea3bfdfbb7cb127d750c24 Mon Sep 17 00:00:00 2001
From: sebastian tia 
Date: Mon, 10 Feb 2025 11:26:43 -0800
Subject: [PATCH 19/37] review & nits

---
 kernel/src/log_segment.rs       |  24 +--
 kernel/src/log_segment/tests.rs |  17 +-
 kernel/src/scan/iterator.rs     | 340 ++++++++++++++++++++++++++++++++
 3 files changed, 346 insertions(+), 35 deletions(-)
 create mode 100644 kernel/src/scan/iterator.rs

diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs
index 30a9ad1f5..13ff771f5 100644
--- a/kernel/src/log_segment.rs
+++ b/kernel/src/log_segment.rs
@@ -17,7 +17,6 @@ use crate::{
 use itertools::Itertools;
 use std::collections::HashMap;
 use std::convert::identity;
-use std::path::Path;
 use std::sync::{Arc, LazyLock};
 use tracing::warn;
 use url::Url;
@@ -307,6 +306,7 @@ impl LogSegment {
                 })
             })
             .flatten_ok()
+            // Map converts Result<Result<T, _>, _> to Result<T, _>
            .map(|result| result?))
     }
 
@@ -355,29 +355,9 @@ impl LogSegment {
     ///
     /// This helper first builds the URL by joining the provided log_root with
     /// the "_sidecars/" folder and the given sidecar path.
-    ///
-    /// To catch a possible error earlier, if the sidecar.path is relative then
-    /// it is checked to ensure it is simply a file name (no directory components).
     fn sidecar_to_filemeta(sidecar: &Sidecar, log_root: &Url) -> DeltaResult<FileMeta> {
-        // If sidecar.path is relative (does not contain "://"), require that it is
-        // just a file name. This will catch cases like "test/test/example.parquet".
-        if !sidecar.path.contains("://") {
-            let path = Path::new(&sidecar.path);
-            // If there is any parent (i.e.
directory component), consider it invalid. - if let Some(parent) = path.parent() { - // If the parent is not empty, then we have extra components. - if parent != Path::new("") { - return Err(Error::Generic(format!( - "Sidecar path '{}' is invalid: sidecar files must be in the `_delta_log/_sidecars/` directory as a file name only", - sidecar.path - ))); - } - } - } - // Now build the full location by joining log_root, "_sidecars/", and the given path. - let location = log_root.join("_sidecars/")?.join(&sidecar.path)?; Ok(FileMeta { - location, + location: log_root.join("_sidecars/")?.join(&sidecar.path)?, last_modified: sidecar.modification_time, size: sidecar.size_in_bytes as usize, }) diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index 0b5fac5f9..bf70803fe 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -641,6 +641,10 @@ fn test_sidecar_to_filemeta_valid_paths() -> DeltaResult<()> { "file:///var/_delta_log/_sidecars/example.parquet", "file:///var/_delta_log/_sidecars/example.parquet", ), + ( + "test/test/example.parquet", + "file:///var/_delta_log/_sidecars/test/test/example.parquet", + ), ]; for (input_path, expected_url) in test_cases.iter() { @@ -656,19 +660,6 @@ fn test_sidecar_to_filemeta_valid_paths() -> DeltaResult<()> { Ok(()) } -#[test] -fn test_sidecar_to_filemeta_invalid_path() -> DeltaResult<()> { - let log_root = Url::parse("file:///var/_delta_log/")?; - let bad_sidecar = create_sidecar("test/test/example.parquet"); - let result = LogSegment::sidecar_to_filemeta(&bad_sidecar, &log_root); - - assert_error_contains( - result, - "Sidecar path 'test/test/example.parquet' is invalid: sidecar files must be in the `_delta_log/_sidecars/` directory", - ); - Ok(()) -} - #[test] fn test_checkpoint_batch_with_no_sidecars_returns_none() -> DeltaResult<()> { let engine = Arc::new(SyncEngine::new()); diff --git a/kernel/src/scan/iterator.rs b/kernel/src/scan/iterator.rs new file mode 100644 index 000000000..21ccfc5ea --- /dev/null +++ b/kernel/src/scan/iterator.rs @@ -0,0 +1,340 @@ +use std::clone::Clone; +use std::collections::{HashSet, VecDeque}; +use std::sync::{Arc, LazyLock}; + +use itertools::Itertools; +use tracing::debug; + +use super::data_skipping::DataSkippingFilter; +use super::ScanData; +use crate::actions::{get_log_add_schema, get_log_schema, ADD_NAME, REMOVE_NAME}; +use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; +use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; +use crate::path::{self, LogPathFileType, ParsedLogPath}; +use crate::scan::DeletionVectorDescriptor; +use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; +use crate::utils::require; +use crate::{ + DeltaResult, Engine, EngineData, Error, ExpressionEvaluator, JsonHandler, ParquetHandler, +}; + +#[derive(Debug, Hash, Eq, PartialEq)] +struct FileActionKey { + path: String, + dv_unique_id: Option, +} +impl FileActionKey { + fn new(path: impl Into, dv_unique_id: Option) -> Self { + let path = path.into(); + Self { path, dv_unique_id } + } +} + +struct LogReplayScanner { + filter: Option, + + /// A set of (data file path, dv_unique_id) pairs that have been seen thus + /// far in the log. This is used to filter out files with Remove actions as + /// well as duplicate entries in the log. + seen: HashSet, +} + +/// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds. 
Log +/// replay visits actions newest-first, so once we've seen a file action for a given (path, dvId) +/// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the +/// first action for a given file is a remove, then that file does not show up in the result at all. +struct AddRemoveDedupVisitor<'seen> { + seen: &'seen mut HashSet, + selection_vector: Vec, + is_log_batch: bool, +} + +impl AddRemoveDedupVisitor<'_> { + /// Checks if log replay already processed this logical file (in which case the current action + /// should be ignored). If not already seen, register it so we can recognize future duplicates. + fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { + // Note: each (add.path + add.dv_unique_id()) pair has a + // unique Add + Remove pair in the log. For example: + // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json + + if self.seen.contains(&key) { + debug!( + "Ignoring duplicate ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + true + } else { + debug!( + "Including ({}, {:?}) in scan, is log {}", + key.path, key.dv_unique_id, self.is_log_batch + ); + if self.is_log_batch { + // Remember file actions from this batch so we can ignore duplicates as we process + // batches from older commit and/or checkpoint files. We don't track checkpoint + // batches because they are already the oldest actions and never replace anything. + self.seen.insert(key); + } + false + } + } + + /// True if this row contains an Add action that should survive log replay. Skip it if the row + /// is not an Add action, or the file has already been seen previously. + fn is_valid_add<'a>(&mut self, i: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult { + // Add will have a path at index 0 if it is valid; otherwise, if it is a log batch, we may + // have a remove with a path at index 4. In either case, extract the three dv getters at + // indexes that immediately follow a valid path index. + let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { + (path, &getters[1..4], true) + } else if !self.is_log_batch { + return Ok(false); + } else if let Some(path) = getters[4].get_opt(i, "remove.path")? { + (path, &getters[5..8], false) + } else { + return Ok(false); + }; + + let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { + Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( + storage_type, + dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, + dv_getters[2].get_opt(i, "deletionVector.offset")?, + )), + None => None, + }; + + // Process both adds and removes, but only return not already-seen adds + let file_key = FileActionKey::new(path, dv_unique_id); + Ok(!self.check_and_record_seen(file_key) && is_add) + } +} + +impl RowVisitor for AddRemoveDedupVisitor<'_> { + fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { + // NOTE: The visitor assumes a schema with adds first and removes optionally afterward. 
+ static NAMES_AND_TYPES: LazyLock = LazyLock::new(|| { + const STRING: DataType = DataType::STRING; + const INTEGER: DataType = DataType::INTEGER; + let types_and_names = vec![ + (STRING, column_name!("add.path")), + (STRING, column_name!("add.deletionVector.storageType")), + (STRING, column_name!("add.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("add.deletionVector.offset")), + (STRING, column_name!("remove.path")), + (STRING, column_name!("remove.deletionVector.storageType")), + (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), + (INTEGER, column_name!("remove.deletionVector.offset")), + ]; + let (types, names) = types_and_names.into_iter().unzip(); + (names, types).into() + }); + let (names, types) = NAMES_AND_TYPES.as_ref(); + if self.is_log_batch { + (names, types) + } else { + // All checkpoint actions are already reconciled and Remove actions in checkpoint files + // only serve as tombstones for vacuum jobs. So we only need to examine the adds here. + (&names[..4], &types[..4]) + } + } + + fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { + let expected_getters = if self.is_log_batch { 8 } else { 4 }; + require!( + getters.len() == expected_getters, + Error::InternalError(format!( + "Wrong number of AddRemoveDedupVisitor getters: {}", + getters.len() + )) + ); + + for i in 0..row_count { + if self.selection_vector[i] { + self.selection_vector[i] = self.is_valid_add(i, getters)?; + } + } + Ok(()) + } +} + +// NB: If you update this schema, ensure you update the comment describing it in the doc comment +// for `scan_row_schema` in scan/mod.rs! You'll also need to update ScanFileVisitor as the +// indexes will be off, and [`get_add_transform_expr`] below to match it. +pub(crate) static SCAN_ROW_SCHEMA: LazyLock> = LazyLock::new(|| { + // Note that fields projected out of a nullable struct must be nullable + let partition_values = MapType::new(DataType::STRING, DataType::STRING, true); + let file_constant_values = + StructType::new([StructField::new("partitionValues", partition_values, true)]); + let deletion_vector = StructType::new([ + StructField::new("storageType", DataType::STRING, true), + StructField::new("pathOrInlineDv", DataType::STRING, true), + StructField::new("offset", DataType::INTEGER, true), + StructField::new("sizeInBytes", DataType::INTEGER, true), + StructField::new("cardinality", DataType::LONG, true), + ]); + Arc::new(StructType::new([ + StructField::new("path", DataType::STRING, true), + StructField::new("size", DataType::LONG, true), + StructField::new("modificationTime", DataType::LONG, true), + StructField::new("stats", DataType::STRING, true), + StructField::new("deletionVector", deletion_vector, true), + StructField::new("fileConstantValues", file_constant_values, true), + ])) +}); + +pub(crate) static SCAN_ROW_DATATYPE: LazyLock = + LazyLock::new(|| SCAN_ROW_SCHEMA.clone().into()); + +fn get_add_transform_expr() -> Expression { + Expression::Struct(vec![ + column_expr!("add.path"), + column_expr!("add.size"), + column_expr!("add.modificationTime"), + column_expr!("add.stats"), + column_expr!("add.deletionVector"), + Expression::Struct(vec![column_expr!("add.partitionValues")]), + ]) +} + +impl LogReplayScanner { + /// Create a new [`LogReplayScanner`] instance + fn new(engine: &dyn Engine, physical_predicate: Option<(ExpressionRef, SchemaRef)>) -> Self { + Self { + filter: DataSkippingFilter::new(engine, physical_predicate), + seen: Default::default(), + } + } + + fn 
process_scan_batch(
        &mut self,
        add_transform: &dyn ExpressionEvaluator,
        actions: &dyn EngineData,
        is_log_batch: bool,
    ) -> DeltaResult {
        // Apply data skipping to get back a selection vector for actions that passed skipping. We
        // will update the vector below as log replay identifies duplicates that should be ignored.
        let selection_vector = match &self.filter {
            Some(filter) => filter.apply(actions)?,
            None => vec![true; actions.len()],
        };
        assert_eq!(selection_vector.len(), actions.len());

        let mut visitor = AddRemoveDedupVisitor {
            seen: &mut self.seen,
            selection_vector,
            is_log_batch,
        };
        visitor.visit_rows_of(actions)?;

        // TODO: Teach expression eval to respect the selection vector we just computed so carefully!
        let selection_vector = visitor.selection_vector;
        let result = add_transform.evaluate(actions)?;
        Ok((result, selection_vector))
    }
}

pub struct DeltaScanIterator {
    // engine: &dyn Engine,
    files_list: VecDeque,
    log_scanner: LogReplayScanner,
    physical_predicate: Option<(ExpressionRef, SchemaRef)>,
    commit_read_schema: SchemaRef,
    checkpoint_read_schema: SchemaRef,
    add_transform: Arc,
    json_handler: Arc,
    parquet_handler: Arc,
}

impl DeltaScanIterator {
    pub(crate) fn new(
        engine: &dyn Engine,
        commit_files: Vec,
        checkpoint_files: Vec,
        physical_predicate: Option<(ExpressionRef, SchemaRef)>,
    ) -> Self {
        let mut files_list = VecDeque::new();

        // Add commit files in reverse order
        for path in commit_files.into_iter().rev() {
            files_list.push_back(path);
        }

        // Add checkpoint files
        for path in checkpoint_files {
            files_list.push_back(path);
        }

        let log_scanner = LogReplayScanner::new(engine, physical_predicate.clone());
        let commit_read_schema = get_log_schema().project(&[ADD_NAME, REMOVE_NAME]).unwrap();
        let checkpoint_read_schema = get_log_add_schema().clone();

        let add_transform: Arc =
            engine.get_expression_handler().get_evaluator(
                get_log_add_schema().clone(),
                get_add_transform_expr(),
                SCAN_ROW_DATATYPE.clone(),
            );

        Self {
            files_list,
            log_scanner,
            physical_predicate,
            commit_read_schema,
            checkpoint_read_schema,
            add_transform,
            json_handler: engine.get_json_handler(),
            parquet_handler: engine.get_parquet_handler(),
        }
    }
}

impl Iterator for DeltaScanIterator {
    type Item = DeltaResult;

    fn next(&mut self) -> Option {
        if let Some(path) = self.files_list.pop_front() {
            // check if commit or checkpoint file or ...
            match path.file_type {
                LogPathFileType::Commit => {
                    let action_iter = self
                        .json_handler
                        .read_json_files(
                            &[path.location.clone()],
                            self.commit_read_schema.clone(),
                            None,
                        )
                        .unwrap();

                    result
                    //we are returning an iterator here, but `next()` must yield a single ScanData
                    //problem stems from the fact that our file reader returns batches, and we somehow
                    //need to turn batches of EngineData into a single ScanData; it is not yet clear
                    //whether that is possible with this design
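Note: the WIP comment above pinpoints why this iterator is deleted in the next patch: each file read yields an iterator of batches, while `next()` must yield one item at a time. One conceivable direction (a sketch only; `Buffered` is a placeholder type, not a kernel API) is to buffer the batches of the most recently opened file and drain them across calls:

use std::collections::VecDeque;

// Sketch: hold the batches produced by the last file read and hand them out
// one per `next()` call before opening the next log file.
struct Buffered<T> {
    pending: VecDeque<T>,
}

impl<T> Buffered<T> {
    fn refill(&mut self, batches: impl Iterator<Item = T>) {
        self.pending.extend(batches);
    }
    fn pop(&mut self) -> Option<T> {
        self.pending.pop_front()
    }
}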
+ } + LogPathFileType::UuidCheckpoint(uuid) => { + let action_iter = self + .json_handler + .read_json_files( + &[path.location.clone()], + self.commit_read_schema.clone(), + None, + ) + .unwrap(); + + let result = action_iter.map(move |actions| -> DeltaResult<_> { + let actions = actions?; + + self.log_scanner.process_scan_batch( + self.add_transform.as_ref(), + actions.as_ref(), + false, + ) + }); + Some(Ok(result.flatten().next().unwrap())) + } + _ => None, + } + } else { + None + } + } +} From 06e5c92fbb331b888b9971512ef93582c748c857 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Mon, 10 Feb 2025 11:32:00 -0800 Subject: [PATCH 20/37] remove test iterator --- kernel/src/scan/iterator.rs | 340 ------------------------------------ 1 file changed, 340 deletions(-) delete mode 100644 kernel/src/scan/iterator.rs diff --git a/kernel/src/scan/iterator.rs b/kernel/src/scan/iterator.rs deleted file mode 100644 index 21ccfc5ea..000000000 --- a/kernel/src/scan/iterator.rs +++ /dev/null @@ -1,340 +0,0 @@ -use std::clone::Clone; -use std::collections::{HashSet, VecDeque}; -use std::sync::{Arc, LazyLock}; - -use itertools::Itertools; -use tracing::debug; - -use super::data_skipping::DataSkippingFilter; -use super::ScanData; -use crate::actions::{get_log_add_schema, get_log_schema, ADD_NAME, REMOVE_NAME}; -use crate::engine_data::{GetData, RowVisitor, TypedGetData as _}; -use crate::expressions::{column_expr, column_name, ColumnName, Expression, ExpressionRef}; -use crate::path::{self, LogPathFileType, ParsedLogPath}; -use crate::scan::DeletionVectorDescriptor; -use crate::schema::{ColumnNamesAndTypes, DataType, MapType, SchemaRef, StructField, StructType}; -use crate::utils::require; -use crate::{ - DeltaResult, Engine, EngineData, Error, ExpressionEvaluator, JsonHandler, ParquetHandler, -}; - -#[derive(Debug, Hash, Eq, PartialEq)] -struct FileActionKey { - path: String, - dv_unique_id: Option, -} -impl FileActionKey { - fn new(path: impl Into, dv_unique_id: Option) -> Self { - let path = path.into(); - Self { path, dv_unique_id } - } -} - -struct LogReplayScanner { - filter: Option, - - /// A set of (data file path, dv_unique_id) pairs that have been seen thus - /// far in the log. This is used to filter out files with Remove actions as - /// well as duplicate entries in the log. - seen: HashSet, -} - -/// A visitor that deduplicates a stream of add and remove actions into a stream of valid adds. Log -/// replay visits actions newest-first, so once we've seen a file action for a given (path, dvId) -/// pair, we should ignore all subsequent (older) actions for that same (path, dvId) pair. If the -/// first action for a given file is a remove, then that file does not show up in the result at all. -struct AddRemoveDedupVisitor<'seen> { - seen: &'seen mut HashSet, - selection_vector: Vec, - is_log_batch: bool, -} - -impl AddRemoveDedupVisitor<'_> { - /// Checks if log replay already processed this logical file (in which case the current action - /// should be ignored). If not already seen, register it so we can recognize future duplicates. - fn check_and_record_seen(&mut self, key: FileActionKey) -> bool { - // Note: each (add.path + add.dv_unique_id()) pair has a - // unique Add + Remove pair in the log. 
For example: - // https://github.com/delta-io/delta/blob/master/spark/src/test/resources/delta/table-with-dv-large/_delta_log/00000000000000000001.json - - if self.seen.contains(&key) { - debug!( - "Ignoring duplicate ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - true - } else { - debug!( - "Including ({}, {:?}) in scan, is log {}", - key.path, key.dv_unique_id, self.is_log_batch - ); - if self.is_log_batch { - // Remember file actions from this batch so we can ignore duplicates as we process - // batches from older commit and/or checkpoint files. We don't track checkpoint - // batches because they are already the oldest actions and never replace anything. - self.seen.insert(key); - } - false - } - } - - /// True if this row contains an Add action that should survive log replay. Skip it if the row - /// is not an Add action, or the file has already been seen previously. - fn is_valid_add<'a>(&mut self, i: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult { - // Add will have a path at index 0 if it is valid; otherwise, if it is a log batch, we may - // have a remove with a path at index 4. In either case, extract the three dv getters at - // indexes that immediately follow a valid path index. - let (path, dv_getters, is_add) = if let Some(path) = getters[0].get_str(i, "add.path")? { - (path, &getters[1..4], true) - } else if !self.is_log_batch { - return Ok(false); - } else if let Some(path) = getters[4].get_opt(i, "remove.path")? { - (path, &getters[5..8], false) - } else { - return Ok(false); - }; - - let dv_unique_id = match dv_getters[0].get_opt(i, "deletionVector.storageType")? { - Some(storage_type) => Some(DeletionVectorDescriptor::unique_id_from_parts( - storage_type, - dv_getters[1].get(i, "deletionVector.pathOrInlineDv")?, - dv_getters[2].get_opt(i, "deletionVector.offset")?, - )), - None => None, - }; - - // Process both adds and removes, but only return not already-seen adds - let file_key = FileActionKey::new(path, dv_unique_id); - Ok(!self.check_and_record_seen(file_key) && is_add) - } -} - -impl RowVisitor for AddRemoveDedupVisitor<'_> { - fn selected_column_names_and_types(&self) -> (&'static [ColumnName], &'static [DataType]) { - // NOTE: The visitor assumes a schema with adds first and removes optionally afterward. - static NAMES_AND_TYPES: LazyLock = LazyLock::new(|| { - const STRING: DataType = DataType::STRING; - const INTEGER: DataType = DataType::INTEGER; - let types_and_names = vec![ - (STRING, column_name!("add.path")), - (STRING, column_name!("add.deletionVector.storageType")), - (STRING, column_name!("add.deletionVector.pathOrInlineDv")), - (INTEGER, column_name!("add.deletionVector.offset")), - (STRING, column_name!("remove.path")), - (STRING, column_name!("remove.deletionVector.storageType")), - (STRING, column_name!("remove.deletionVector.pathOrInlineDv")), - (INTEGER, column_name!("remove.deletionVector.offset")), - ]; - let (types, names) = types_and_names.into_iter().unzip(); - (names, types).into() - }); - let (names, types) = NAMES_AND_TYPES.as_ref(); - if self.is_log_batch { - (names, types) - } else { - // All checkpoint actions are already reconciled and Remove actions in checkpoint files - // only serve as tombstones for vacuum jobs. So we only need to examine the adds here. 
- (&names[..4], &types[..4]) - } - } - - fn visit<'a>(&mut self, row_count: usize, getters: &[&'a dyn GetData<'a>]) -> DeltaResult<()> { - let expected_getters = if self.is_log_batch { 8 } else { 4 }; - require!( - getters.len() == expected_getters, - Error::InternalError(format!( - "Wrong number of AddRemoveDedupVisitor getters: {}", - getters.len() - )) - ); - - for i in 0..row_count { - if self.selection_vector[i] { - self.selection_vector[i] = self.is_valid_add(i, getters)?; - } - } - Ok(()) - } -} - -// NB: If you update this schema, ensure you update the comment describing it in the doc comment -// for `scan_row_schema` in scan/mod.rs! You'll also need to update ScanFileVisitor as the -// indexes will be off, and [`get_add_transform_expr`] below to match it. -pub(crate) static SCAN_ROW_SCHEMA: LazyLock> = LazyLock::new(|| { - // Note that fields projected out of a nullable struct must be nullable - let partition_values = MapType::new(DataType::STRING, DataType::STRING, true); - let file_constant_values = - StructType::new([StructField::new("partitionValues", partition_values, true)]); - let deletion_vector = StructType::new([ - StructField::new("storageType", DataType::STRING, true), - StructField::new("pathOrInlineDv", DataType::STRING, true), - StructField::new("offset", DataType::INTEGER, true), - StructField::new("sizeInBytes", DataType::INTEGER, true), - StructField::new("cardinality", DataType::LONG, true), - ]); - Arc::new(StructType::new([ - StructField::new("path", DataType::STRING, true), - StructField::new("size", DataType::LONG, true), - StructField::new("modificationTime", DataType::LONG, true), - StructField::new("stats", DataType::STRING, true), - StructField::new("deletionVector", deletion_vector, true), - StructField::new("fileConstantValues", file_constant_values, true), - ])) -}); - -pub(crate) static SCAN_ROW_DATATYPE: LazyLock = - LazyLock::new(|| SCAN_ROW_SCHEMA.clone().into()); - -fn get_add_transform_expr() -> Expression { - Expression::Struct(vec![ - column_expr!("add.path"), - column_expr!("add.size"), - column_expr!("add.modificationTime"), - column_expr!("add.stats"), - column_expr!("add.deletionVector"), - Expression::Struct(vec![column_expr!("add.partitionValues")]), - ]) -} - -impl LogReplayScanner { - /// Create a new [`LogReplayScanner`] instance - fn new(engine: &dyn Engine, physical_predicate: Option<(ExpressionRef, SchemaRef)>) -> Self { - Self { - filter: DataSkippingFilter::new(engine, physical_predicate), - seen: Default::default(), - } - } - - fn process_scan_batch( - &mut self, - add_transform: &dyn ExpressionEvaluator, - actions: &dyn EngineData, - is_log_batch: bool, - ) -> DeltaResult { - // Apply data skipping to get back a selection vector for actions that passed skipping. We - // will update the vector below as log replay identifies duplicates that should be ignored. - let selection_vector = match &self.filter { - Some(filter) => filter.apply(actions)?, - None => vec![true; actions.len()], - }; - assert_eq!(selection_vector.len(), actions.len()); - - let mut visitor = AddRemoveDedupVisitor { - seen: &mut self.seen, - selection_vector, - is_log_batch, - }; - visitor.visit_rows_of(actions)?; - - // TODO: Teach expression eval to respect the selection vector we just computed so carefully! 
-        let selection_vector = visitor.selection_vector;
-        let result = add_transform.evaluate(actions)?;
-        Ok((result, selection_vector))
-    }
-}
-
-pub struct DeltaScanIterator {
-    // engine: &dyn Engine,
-    files_list: VecDeque,
-    log_scanner: LogReplayScanner,
-    physical_predicate: Option<(ExpressionRef, SchemaRef)>,
-    commit_read_schema: SchemaRef,
-    checkpoint_read_schema: SchemaRef,
-    add_transform: Arc,
-    json_handler: Arc,
-    parquet_handler: Arc,
-}
-
-impl DeltaScanIterator {
-    pub(crate) fn new(
-        engine: &dyn Engine,
-        commit_files: Vec,
-        checkpoint_files: Vec,
-        physical_predicate: Option<(ExpressionRef, SchemaRef)>,
-    ) -> Self {
-        let mut files_list = VecDeque::new();
-
-        // Add commit files in reverse order
-        for path in commit_files.into_iter().rev() {
-            files_list.push_back(path);
-        }
-
-        // Add checkpoint files
-        for path in checkpoint_files {
-            files_list.push_back(path);
-        }
-
-        let log_scanner = LogReplayScanner::new(engine, physical_predicate.clone());
-        let commit_read_schema = get_log_schema().project(&[ADD_NAME, REMOVE_NAME]).unwrap();
-        let checkpoint_read_schema = get_log_add_schema().clone();
-
-        let add_transform: Arc =
-            engine.get_expression_handler().get_evaluator(
-                get_log_add_schema().clone(),
-                get_add_transform_expr(),
-                SCAN_ROW_DATATYPE.clone(),
-            );
-
-        Self {
-            files_list,
-            log_scanner,
-            physical_predicate,
-            commit_read_schema,
-            checkpoint_read_schema,
-            add_transform,
-            json_handler: engine.get_json_handler(),
-            parquet_handler: engine.get_parquet_handler(),
-        }
-    }
-}
-
-impl Iterator for DeltaScanIterator {
-    type Item = DeltaResult;
-
-    fn next(&mut self) -> Option {
-        if let Some(path) = self.files_list.pop_front() {
-            // check if commit or checkpoint file or ...
-            match path.file_type {
-                LogPathFileType::Commit => {
-                    let action_iter = self
-                        .json_handler
-                        .read_json_files(
-                            &[path.location.clone()],
-                            self.commit_read_schema.clone(),
-                            None,
-                        )
-                        .unwrap();
-
-                    result
-                    //we are returning an iterator here, but `next()` must yield a single ScanData
-                    //problem stems from the fact that our file reader returns batches, and we somehow
-                    //need to turn batches of EngineData into a single ScanData; it is not yet clear
-                    //whether that is possible with this design
-                }
-                LogPathFileType::UuidCheckpoint(uuid) => {
-                    let action_iter = self
-                        .json_handler
-                        .read_json_files(
-                            &[path.location.clone()],
-                            self.commit_read_schema.clone(),
-                            None,
-                        )
-                        .unwrap();
-
-                    let result = action_iter.map(move |actions| -> DeltaResult<_> {
-                        let actions = actions?;
-
-                        self.log_scanner.process_scan_batch(
-                            self.add_transform.as_ref(),
-                            actions.as_ref(),
-                            false,
-                        )
-                    });
-                    Some(Ok(result.flatten().next().unwrap()))
-                }
-                _ => None,
-            }
-        } else {
-            None
-        }
-    }
-}
From 0e57bae7854572c800d54e3018f6115db71e1283 Mon Sep 17 00:00:00 2001
From: sebastian tia 
Date: Tue, 11 Feb 2025 12:12:45 -0800
Subject: [PATCH 21/37] review

---
 kernel/src/log_segment.rs       |  43 ++--
 kernel/src/log_segment/tests.rs | 428 +++++++++++++++++++-------------
 kernel/src/utils.rs             |  89 +------
 3 files changed, 271 insertions(+), 289 deletions(-)

diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs
index 13ff771f5..9ae5fbc78 100644
--- a/kernel/src/log_segment.rs
+++ b/kernel/src/log_segment.rs
@@ -224,9 +224,7 @@ impl LogSegment {
             meta_predicate,
             self.checkpoint_parts.clone(),
             self.log_root.clone(),
-        )
-        .into_iter()
-        .flatten();
+        )?;
 
         Ok(commit_stream.chain(checkpoint_stream))
     }
@@ -238,7 +236,7 @@ impl LogSegment {
     /// - Schema does not contain file actions
     ///
    /// For single-part checkpoints, any referenced sidecar files are processed.
These - /// sidecar files contain the actual add/remove actions that would otherwise be + /// sidecar files contain the actual add actions that would otherwise be /// stored directly in the checkpoint. The sidecar file batches are chained to the /// checkpoint batch in the top level iterator to be returned. fn create_checkpoint_stream( @@ -248,9 +246,9 @@ impl LogSegment { checkpoint_parts: Vec, log_root: Url, ) -> DeltaResult, bool)>> + Send> { - let need_file_actions = checkpoint_read_schema.contains(ADD_NAME); + let need_add_actions = checkpoint_read_schema.contains(ADD_NAME); require!( - !need_file_actions || checkpoint_read_schema.contains(SIDECAR_NAME), + !need_add_actions || checkpoint_read_schema.contains(SIDECAR_NAME), Error::generic( "If the checkpoint read schema contains file actions, it must contain the sidecar column" ) @@ -278,24 +276,26 @@ impl LogSegment { )? }; - // 1. In the case where the schema does not contain add/remove actions, we return the checkpoint - // batch directly as sidecar files only have to be read when the schema contains add/remove actions. - // 2. Multi-part checkpoint batches never have sidecar actions, so the batch is returned as-is. - let skip_sidecar_search = !need_file_actions || checkpoint_parts.len() > 1; - let parquet_handler = engine.get_parquet_handler(); Ok(actions .map(move |batch_result| { batch_result.and_then(|checkpoint_batch| { // This closure maps the checkpoint batch to an iterator of batches // by chaining the checkpoint batch with sidecar batches if they exist. - let sidecar_content = Self::process_sidecars( - // cheap Arc clone - parquet_handler.clone(), - log_root.clone(), - checkpoint_batch.as_ref(), - skip_sidecar_search, - )?; + + // 1. In the case where the schema does not contain add/remove actions, we return the checkpoint + // batch directly as sidecar files only have to be read when the schema contains add/remove actions. + // 2. Multi-part checkpoint batches never have sidecar actions, so the batch is returned as-is. + let sidecar_content = if !need_add_actions || checkpoint_parts.len() > 1 { + None + } else { + Self::process_sidecars( + // cheap Arc clone + parquet_handler.clone(), + log_root.clone(), + checkpoint_batch.as_ref(), + )? + }; Ok(std::iter::once(Ok((checkpoint_batch, false))).chain( sidecar_content @@ -318,14 +318,7 @@ impl LogSegment { parquet_handler: Arc, log_root: Url, batch: &dyn EngineData, - skip_sidecar_search: bool, ) -> DeltaResult>> + Send>> { - // If the schema does not contain add/remove actions or we have a multi-part checkpoint, - // we return early as sidecar files are not needed. 
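Note: for reference, `sidecar_to_filemeta` resolves sidecar paths purely through `Url::join`, which is also why the earlier "file name only" validation could be dropped. A short demonstration of the behavior the tests in this patch rely on (paths mirror those test cases):

use url::Url;

fn main() -> Result<(), url::ParseError> {
    let log_root = Url::parse("file:///var/_delta_log/")?;

    // A relative path resolves under _delta_log/_sidecars/ ...
    let rel = log_root.join("_sidecars/")?.join("example.parquet")?;
    assert_eq!(rel.as_str(), "file:///var/_delta_log/_sidecars/example.parquet");

    // ... while an absolute URL in `sidecar.path` simply replaces the base.
    let abs = log_root
        .join("_sidecars/")?
        .join("file:///var/_delta_log/_sidecars/example.parquet")?;
    assert_eq!(abs.as_str(), "file:///var/_delta_log/_sidecars/example.parquet");
    Ok(())
}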
-        if skip_sidecar_search {
-            return Ok(None);
-        }
-
         // Visit the rows of the checkpoint batch to extract sidecar file references
         let mut visitor = SidecarVisitor::default();
         visitor.visit_rows_of(batch)?;
diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs
index bf70803fe..b12d30618 100644
--- a/kernel/src/log_segment/tests.rs
+++ b/kernel/src/log_segment/tests.rs
@@ -2,13 +2,16 @@ use std::{path::PathBuf, sync::Arc};
 
 use itertools::Itertools;
 use object_store::{memory::InMemory, path::Path, ObjectStore};
+use parquet::arrow::ArrowWriter;
 use url::Url;
 
 use crate::actions::{
     get_log_add_schema, get_log_schema, Add, Sidecar, ADD_NAME, METADATA_NAME, SIDECAR_NAME,
 };
+use crate::engine::arrow_data::ArrowEngineData;
 use crate::engine::default::executor::tokio::TokioBackgroundExecutor;
 use crate::engine::default::filesystem::ObjectStoreFileSystemClient;
+use crate::engine::default::DefaultEngine;
 use crate::engine::sync::SyncEngine;
 use crate::log_segment::LogSegment;
 use crate::path::ParsedLogPath;
@@ -16,10 +19,8 @@ use crate::scan::test_utils::{
     add_batch_simple, add_batch_with_remove, sidecar_batch_with_given_paths,
 };
 use crate::snapshot::CheckpointMetadata;
-use crate::utils::test_utils::{
-    assert_batch_matches, assert_error_contains, Action, LocalMockTable,
-};
+use crate::utils::test_utils::{assert_batch_matches, Action};
-use crate::{DeltaResult, Engine, FileMeta, FileSystemClient, Table};
+use crate::{DeltaResult, Engine, EngineData, FileMeta, FileSystemClient, Table};
 use test_utils::delta_path_for_version;
@@ -117,6 +118,100 @@ fn build_log_with_paths_and_checkpoint(
     (Box::new(client), log_root)
 }
 
+// Create an in-memory store and return the store and the URL for the store's _delta_log directory.
+fn new_in_memory_store() -> (Arc<InMemory>, Url) {
+    (
+        Arc::new(InMemory::new()),
+        Url::parse("memory:///")
+            .unwrap()
+            .join("_delta_log/")
+            .unwrap(),
+    )
+}
+
+// Writes a record batch obtained from engine data to the in-memory store at a given path.
+fn write_record_batch_to_store(
+    store: &Arc<InMemory>,
+    path: String,
+    data: Box<dyn EngineData>,
+) -> DeltaResult<()> {
+    let batch = ArrowEngineData::try_from_engine_data(data)?;
+    let record_batch = batch.record_batch();
+
+    let mut buffer = vec![];
+    let mut writer = ArrowWriter::try_new(&mut buffer, record_batch.schema(), None)?;
+    writer.write(record_batch)?;
+    writer.close()?;
+    println!("Writing to path: {}", path);
+
+    tokio::runtime::Runtime::new()
+        .expect("create tokio runtime")
+        .block_on(async {
+            if let Err(e) = store.put(&Path::from(path), buffer.into()).await {
+                eprintln!("Error writing to store: {}", e);
+            }
+        });
+
+    Ok(())
+}
+
+/// Writes all actions to a _delta_log parquet checkpoint file in the store.
+/// This function formats the provided filename into the _delta_log directory.
+fn add_checkpoint_to_store(
+    store: &Arc<InMemory>,
+    data: Box<dyn EngineData>,
+    filename: &str,
+) -> DeltaResult<()> {
+    let path = format!("_delta_log/{}", filename);
+    write_record_batch_to_store(store, path, data)
+}
+
+/// Writes all actions to a _delta_log/_sidecars file in the store.
+/// This function formats the provided filename into the _sidecars subdirectory.
+fn add_sidecar_to_store(
+    store: &Arc<InMemory>,
+    data: Box<dyn EngineData>,
+    filename: &str,
+) -> DeltaResult<()> {
+    let path = format!("_delta_log/_sidecars/{}", filename);
+    write_record_batch_to_store(store, path, data)
+}
+
+/// Writes all actions to a _delta_log json checkpoint file in the store.
+/// This function formats the provided filename into the _delta_log directory.
+fn add_json_checkpoint_to_store(
+    store: &Arc<InMemory>,
+    actions: Vec<Action>,
+    filename: &str,
+) -> DeltaResult<()> {
+    let json_lines: Vec<String> = actions
+        .into_iter()
+        .map(|action| serde_json::to_string(&action).expect("action to string"))
+        .collect();
+    let content = json_lines.join("\n");
+    let checkpoint_path = format!("_delta_log/{}", filename);
+
+    tokio::runtime::Runtime::new()
+        .expect("create tokio runtime")
+        .block_on(async {
+            store
+                .put(&Path::from(checkpoint_path), content.into())
+                .await
+        })?;
+
+    Ok(())
+}
+
+fn create_log_path(path: &str) -> ParsedLogPath {
+    ParsedLogPath::try_from(FileMeta {
+        location: Url::parse(path).expect("Invalid file URL"),
+        last_modified: 0,
+        size: 0,
+    })
+    .unwrap()
+    .unwrap()
+}
+
 #[test]
 fn build_snapshot_with_unsupported_uuid_checkpoint() {
     let (client, log_root) = build_log_with_paths_and_checkpoint(
@@ -641,6 +641,10 @@ fn test_sidecar_to_filemeta_valid_paths() -> DeltaResult<()> {
             "file:///var/_delta_log/_sidecars/example.parquet",
         ),
+        (
+            "test/test/example.parquet",
+            "file:///var/_delta_log/_sidecars/test/test/example.parquet",
+        ),
     ];
 
@@ -647,12 +742,18 @@ fn test_sidecar_to_filemeta_valid_paths() -> DeltaResult<()> {
-    for (input_path, expected_url) in test_cases.iter() {
-        let sidecar = create_sidecar(*input_path);
+    for (input_path, expected_url) in test_cases.into_iter() {
+        let sidecar = Sidecar {
+            path: input_path.to_string(),
+            modification_time: 0,
+            size_in_bytes: 1000,
+            tags: None,
+        };
+
+        let filemeta = LogSegment::sidecar_to_filemeta(&sidecar, &log_root)?;
         assert_eq!(
             filemeta.location.as_str(),
-            *expected_url,
+            expected_url,
             "Mismatch for input path: {}",
             input_path
         );
     }
     Ok(())
 }
 
-#[test]
-fn test_sidecar_to_filemeta_invalid_path() -> DeltaResult<()> {
-    let log_root = Url::parse("file:///var/_delta_log/")?;
-    let bad_sidecar = create_sidecar("test/test/example.parquet");
-    let result = LogSegment::sidecar_to_filemeta(&bad_sidecar, &log_root);
-
-    assert_error_contains(
-        result,
-        "Sidecar path 'test/test/example.parquet' is invalid: sidecar files must be in the `_delta_log/_sidecars/` directory",
-    );
-    Ok(())
-}
-
 #[test]
 fn test_checkpoint_batch_with_no_sidecars_returns_none() -> DeltaResult<()> {
+    let (_, log_root) = new_in_memory_store();
     let engine = Arc::new(SyncEngine::new());
-    let log_root = Url::parse("s3://example-bucket/logs/")?;
     let checkpoint_batch = add_batch_simple(get_log_schema().clone());
 
     let mut iter = LogSegment::process_sidecars(
         engine.get_parquet_handler(),
         log_root,
         checkpoint_batch.as_ref(),
-        false,
     )?
.into_iter() .flatten(); @@ -729,8 +832,13 @@ async fn test_checkpoint_batch_with_sidecars_returns_sidecar_batches() -> DeltaR #[test] fn test_checkpoint_batch_with_sidecar_files_that_do_not_exist() -> DeltaResult<()> { - let mock_table = LocalMockTable::new(); - let engine = Arc::new(SyncEngine::new()); + let (store, log_root) = new_in_memory_store(); + let engine = DefaultEngine::new( + store.clone(), + Path::from("/"), + Arc::new(TokioBackgroundExecutor::new()), + ); + let checkpoint_batch = sidecar_batch_with_given_paths( vec!["sidecarfile1.parquet", "sidecarfile2.parquet"], get_log_schema().clone(), @@ -738,20 +846,15 @@ fn test_checkpoint_batch_with_sidecar_files_that_do_not_exist() -> DeltaResult<( let mut iter = LogSegment::process_sidecars( engine.get_parquet_handler(), - mock_table.log_root(), + log_root, checkpoint_batch.as_ref(), - false, )? .into_iter() .flatten(); // Assert that an error is returned when trying to read sidecar files that do not exist let err = iter.next().unwrap(); - if cfg!(windows) { - assert_error_contains(err, "The system cannot find the path specified"); - } else { - assert_error_contains(err, "No such file or directory"); - } + assert!(err.is_err()); Ok(()) } @@ -790,33 +893,30 @@ fn test_create_checkpoint_stream_errors_when_schema_has_add_but_no_sidecar_actio Url::parse("s3://example-bucket/logs/")?, ); - assert_error_contains( - result, - "If the checkpoint read schema contains file actions, it must contain the sidecar column", - ); + // Errors because the schema has an ADD action but no SIDECAR action. + assert!(result.is_err()); Ok(()) } -#[tokio::test] -async fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schema_has_no_file_actions( +#[test] +fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schema_has_no_file_actions( ) -> DeltaResult<()> { - let engine = SyncEngine::new(); + let (store, log_root) = new_in_memory_store(); + let engine = DefaultEngine::new( + store.clone(), + Path::from("/"), + Arc::new(TokioBackgroundExecutor::new()), + ); let v2_checkpoint_read_schema = get_log_schema().project(&[METADATA_NAME])?; - let mut mock_table = LocalMockTable::new(); - mock_table - .parquet_checkpoint( - // Create a checkpoint batch with sidecar actions to verify that the sidecar actions are not read. - sidecar_batch_with_given_paths( - vec!["sidecar1.parquet"], - v2_checkpoint_read_schema.clone(), - ), - "00000000000000000001.checkpoint.parquet", - ) - .await; + let _ = add_checkpoint_to_store( + &store, + // Create a checkpoint batch with sidecar actions to verify that the sidecar actions are not read. + sidecar_batch_with_given_paths(vec!["sidecar1.parquet"], v2_checkpoint_read_schema.clone()), + "00000000000000000001.checkpoint.parquet", + )?; - let checkpoint_one_file = mock_table - .log_root() + let checkpoint_one_file = log_root .join("00000000000000000001.checkpoint.parquet")? 
.to_string(); @@ -825,7 +925,7 @@ async fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schem v2_checkpoint_read_schema.clone(), None, vec![create_log_path(&checkpoint_one_file)], - mock_table.log_root(), + log_root, )?; // Assert that the first batch returned is from reading checkpoint file 1 @@ -840,44 +940,37 @@ async fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schem Ok(()) } -#[tokio::test] -async fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_is_multi_part( +#[test] +fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_is_multi_part( ) -> DeltaResult<()> { - let engine = SyncEngine::new(); + let (store, log_root) = new_in_memory_store(); + let engine = DefaultEngine::new( + store.clone(), + Path::from("/"), + Arc::new(TokioBackgroundExecutor::new()), + ); let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; - let mut mock_table = LocalMockTable::new(); // Multi-part checkpoints can never have sidecar actions. // We place batches with sidecar actions in multi-part checkpoints to verify we do not read the actions, as we // should instead short-circuit and return the checkpoint batches as-is when encountering multi-part checkpoints. - mock_table - .parquet_checkpoint( - sidecar_batch_with_given_paths( - vec!["sidecar1.parquet"], - v2_checkpoint_read_schema.clone(), - ), - "00000000000000000001.checkpoint.0000000001.0000000002.parquet", - ) - .await; - mock_table - .parquet_checkpoint( - sidecar_batch_with_given_paths( - vec!["sidecar2.parquet"], - v2_checkpoint_read_schema.clone(), - ), - "00000000000000000001.checkpoint.0000000002.0000000002.parquet", - ) - .await; + let checkpoint_part_1 = "00000000000000000001.checkpoint.0000000001.0000000002.parquet"; + let checkpoint_part_2 = "00000000000000000001.checkpoint.0000000002.0000000002.parquet"; - let checkpoint_one_file = mock_table - .log_root() - .join("00000000000000000001.checkpoint.0000000001.0000000002.parquet")? - .to_string(); + let _ = add_checkpoint_to_store( + &store, + sidecar_batch_with_given_paths(vec!["sidecar1.parquet"], v2_checkpoint_read_schema.clone()), + checkpoint_part_1, + )?; + let _ = add_checkpoint_to_store( + &store, + sidecar_batch_with_given_paths(vec!["sidecar2.parquet"], v2_checkpoint_read_schema.clone()), + checkpoint_part_2, + )?; - let checkpoint_two_file = mock_table - .log_root() - .join("00000000000000000001.checkpoint.0000000002.0000000002.parquet")? - .to_string(); + let checkpoint_one_file = log_root.join(checkpoint_part_1)?.to_string(); + + let checkpoint_two_file = log_root.join(checkpoint_part_2)?.to_string(); let mut iter = LogSegment::create_checkpoint_stream( &engine, @@ -887,7 +980,7 @@ async fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_ create_log_path(&checkpoint_one_file), create_log_path(&checkpoint_two_file), ], - mock_table.log_root(), + log_root, )?; // Assert the correctness of batches returned @@ -907,26 +1000,27 @@ async fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_ Ok(()) } -// Test showcases weird behavior where reading a batch with a missing column causes the reader to -// insert the empty-string in string fields. This is seen in this test where we find two instances of Sidecars with -// empty-string path fields after visiting the batch with the SidecarVisitor due to the batch being read with -// the schema which includes the Sidecar column. 
-#[tokio::test] -async fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch_without_sidecars( -) -> DeltaResult<()> { - let engine = SyncEngine::new(); +// TODO: arrow v53.3 introduces bug surrounding nullabilty of fields in schema projection. +// Once addressed, pass the `v2_checkpoint_read_schema` to `add_batch_simple` instead of `get_log_add_schema()` +// to ensure the issue is fixed +#[test] +fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch_without_sidecars() -> DeltaResult<()> +{ + let (store, log_root) = new_in_memory_store(); + let engine = DefaultEngine::new( + store.clone(), + Path::from("/"), + Arc::new(TokioBackgroundExecutor::new()), + ); let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; - let mut mock_table = LocalMockTable::new(); - mock_table - .parquet_checkpoint( - add_batch_simple(get_log_add_schema().clone()), - "00000000000000000001.checkpoint.parquet", - ) - .await; + let _ = add_checkpoint_to_store( + &store, + add_batch_simple(get_log_add_schema().clone()), + "00000000000000000001.checkpoint.parquet", + )?; - let checkpoint_one_file = mock_table - .log_root() + let checkpoint_one_file = log_root .join("00000000000000000001.checkpoint.parquet")? .to_string(); @@ -935,7 +1029,7 @@ async fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch_without_si v2_checkpoint_read_schema.clone(), None, vec![create_log_path(&checkpoint_one_file)], - mock_table.log_root(), + log_root, )?; // Assert that the first batch returned is from reading checkpoint file 1 @@ -950,26 +1044,27 @@ async fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch_without_si Ok(()) } -#[tokio::test] -async fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidecars( -) -> DeltaResult<()> { - let engine = SyncEngine::new(); +#[test] +fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidecars() -> DeltaResult<()> { + let (store, log_root) = new_in_memory_store(); + let engine = DefaultEngine::new( + store.clone(), + Path::from("/"), + Arc::new(TokioBackgroundExecutor::new()), + ); let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; - let mut mock_table = LocalMockTable::new(); - - mock_table - .json_checkpoint( - [Action::Add(Add { - path: "fake_path_1".into(), - data_change: true, - ..Default::default() - })], - "00000000000000000001.checkpoint.json", - ) - .await; - - let checkpoint_one_file = mock_table - .log_root() + + let _ = add_json_checkpoint_to_store( + &store, + vec![Action::Add(Add { + path: "fake_path_1".into(), + data_change: true, + ..Default::default() + })], + "00000000000000000001.checkpoint.json", + ); + + let checkpoint_one_file = log_root .join("00000000000000000001.checkpoint.json")? .to_string(); @@ -978,7 +1073,7 @@ async fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidec v2_checkpoint_read_schema.clone(), None, vec![create_log_path(&checkpoint_one_file)], - mock_table.log_root(), + log_root, )?; // Assert that the first batch returned is from reading checkpoint file 1 @@ -987,7 +1082,7 @@ async fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidec // Although we do not assert the contents, we know the JSON checkpoint is read correctly as // a single batch is returned and no errors are thrown. 
- // TODO: Convert JSON checkpoint to RecordBatch and assert that it is as expected + // TODO: Convert JSON checkpoint to RecordBatch and assert that it is as expected for better testing assert!(iter.next().is_none()); Ok(()) @@ -996,38 +1091,39 @@ async fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidec // Encapsulates logic that has already been tested but tests the interaction between the functions, // such as performing a map operation on the returned sidecar batches from `process_sidecars` // to include the is_log_batch flag -#[tokio::test] -async fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batches( +#[test] +fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batches( ) -> DeltaResult<()> { - let engine = SyncEngine::new(); + let (store, log_root) = new_in_memory_store(); + let engine = DefaultEngine::new( + store.clone(), + Path::from("/"), + Arc::new(TokioBackgroundExecutor::new()), + ); let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; let sidecar_schema = get_log_add_schema(); - let mut mock_table = LocalMockTable::new(); - mock_table - .parquet_checkpoint( - sidecar_batch_with_given_paths( - vec!["sidecarfile1.parquet", "sidecarfile2.parquet"], - get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?, - ), - "00000000000000000001.checkpoint.parquet", - ) - .await; - mock_table - .sidecar( - add_batch_simple(sidecar_schema.clone()), - "sidecarfile1.parquet", - ) - .await; - mock_table - .sidecar( - add_batch_with_remove(sidecar_schema.clone()), - "sidecarfile2.parquet", - ) - .await; - - let checkpoint_file_path = mock_table - .log_root() + let _ = add_checkpoint_to_store( + &store, + sidecar_batch_with_given_paths( + vec!["sidecarfile1.parquet", "sidecarfile2.parquet"], + get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?, + ), + "00000000000000000001.checkpoint.parquet", + )?; + + let _ = add_sidecar_to_store( + &store, + add_batch_simple(sidecar_schema.clone()), + "sidecarfile1.parquet", + )?; + let _ = add_sidecar_to_store( + &store, + add_batch_with_remove(sidecar_schema.clone()), + "sidecarfile2.parquet", + )?; + + let checkpoint_file_path = log_root .join("00000000000000000001.checkpoint.parquet")? .to_string(); @@ -1036,7 +1132,7 @@ async fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar v2_checkpoint_read_schema.clone(), None, vec![create_log_path(&checkpoint_file_path)], - mock_table.log_root(), + log_root, )?; // Assert that the first batch returned is from reading checkpoint file 1 @@ -1064,23 +1160,3 @@ async fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar Ok(()) } - -/// Creates a Sidecar instance with default modification time, file size and no tags. 
-fn create_sidecar>(path: P) -> Sidecar { - Sidecar { - path: path.into(), - modification_time: 0, - size_in_bytes: 1000, - tags: None, - } -} - -fn create_log_path(path: &str) -> ParsedLogPath { - ParsedLogPath::try_from(FileMeta { - location: Url::parse(path).expect("Invalid file URL"), - last_modified: 0, - size: 0, - }) - .unwrap() - .unwrap() -} diff --git a/kernel/src/utils.rs b/kernel/src/utils.rs index d94029f97..f8d0bfe27 100644 --- a/kernel/src/utils.rs +++ b/kernel/src/utils.rs @@ -15,14 +15,12 @@ pub(crate) use require; pub(crate) mod test_utils { use arrow_array::RecordBatch; use itertools::Itertools; + use object_store::local::LocalFileSystem; use object_store::ObjectStore; - use object_store::{local::LocalFileSystem, path}; - use parquet::arrow::ArrowWriter; use serde::Serialize; use std::{path::Path, sync::Arc}; use tempfile::TempDir; use test_utils::delta_path_for_version; - use url::Url; use crate::{ actions::{Add, Cdc, CommitInfo, Metadata, Protocol, Remove}, @@ -81,76 +79,10 @@ pub(crate) mod test_utils { .expect("put log file in store"); } - /// Writes all `actions` to a new json checkpoint in the log - pub(crate) async fn json_checkpoint( - &mut self, - actions: impl IntoIterator, - filename: &str, - ) { - let data = actions - .into_iter() - .map(|action| serde_json::to_string(&action).unwrap()) - .join("\n"); - - let path = format!("_delta_log/{filename}"); - - self.store - .put(&path::Path::from(path), data.into()) - .await - .expect("put log file in store"); - } - - /// Writes all `actions` to a new parquet checkpoint in the log - pub(crate) async fn parquet_checkpoint( - &mut self, - data: Box, - filename: &str, - ) { - let batch: Box<_> = ArrowEngineData::try_from_engine_data(data).unwrap(); - let record_batch = batch.record_batch(); - - let mut buffer = vec![]; - let mut writer = - ArrowWriter::try_new(&mut buffer, record_batch.schema(), None).unwrap(); - writer.write(record_batch).unwrap(); - writer.close().unwrap(); // writer must be closed to write footer - - let path = format!("_delta_log/{filename}"); - - self.store - .put(&path::Path::from(path), buffer.into()) - .await - .expect("put sidecar file in store"); - } - - /// Writes all `actions` as EngineData to a new sidecar file in the `_delta_log/_sidecars` directory - pub(crate) async fn sidecar(&mut self, data: Box, filename: &str) { - let batch: Box<_> = ArrowEngineData::try_from_engine_data(data).unwrap(); - let record_batch = batch.record_batch(); - - let mut buffer = vec![]; - let mut writer = - ArrowWriter::try_new(&mut buffer, record_batch.schema(), None).unwrap(); - writer.write(record_batch).unwrap(); - writer.close().unwrap(); // writer must be closed to write footer - - let path = format!("_delta_log/_sidecars/{filename:020}"); - - self.store - .put(&path::Path::from(path), buffer.into()) - .await - .expect("put sidecar file in store"); - } - /// Get the path to the root of the table. pub(crate) fn table_root(&self) -> &Path { self.dir.path() } - - /// Get the path to the root of the log in the table. - pub(crate) fn log_root(&self) -> Url { - Url::from_directory_path(self.dir.path().join("_delta_log")).unwrap() - } } /// Try to convert an `EngineData` into a `RecordBatch`. Panics if not using `ArrowEngineData` from @@ -165,23 +97,4 @@ pub(crate) mod test_utils { pub(crate) fn assert_batch_matches(actual: Box, expected: Box) { assert_eq!(into_record_batch(actual), into_record_batch(expected)); } - - /// Checks that the error message from a Result contains the expected substring. 
- pub(crate) fn assert_error_contains(result: Result, expected: &str) { - match result { - Ok(_) => panic!( - "Expected an error containing \"{}\", but the result was Ok", - expected - ), - Err(e) => { - let err_str = e.to_string(); - if !err_str.contains(expected) { - panic!( - "Error \"{}\" did not contain expected substring \"{}\"", - err_str, expected - ); - } - } - } - } } From d49f8350e2c18517c7f0d320d1a9f40a43dfbac1 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 11 Feb 2025 12:15:43 -0800 Subject: [PATCH 22/37] clippy --- kernel/src/log_segment/tests.rs | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index b12d30618..a7a192708 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -792,12 +792,12 @@ fn test_checkpoint_batch_with_sidecars_returns_sidecar_batches() -> DeltaResult< let sidecar_schema = get_log_add_schema().clone(); - let _ = add_sidecar_to_store( + add_sidecar_to_store( &store, add_batch_simple(sidecar_schema.clone()), "sidecarfile1.parquet", )?; - let _ = add_sidecar_to_store( + add_sidecar_to_store( &store, add_batch_with_remove(sidecar_schema.clone()), "sidecarfile2.parquet", @@ -909,7 +909,7 @@ fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schema_has_ Arc::new(TokioBackgroundExecutor::new()), ); let v2_checkpoint_read_schema = get_log_schema().project(&[METADATA_NAME])?; - let _ = add_checkpoint_to_store( + add_checkpoint_to_store( &store, // Create a checkpoint batch with sidecar actions to verify that the sidecar actions are not read. sidecar_batch_with_given_paths(vec!["sidecar1.parquet"], v2_checkpoint_read_schema.clone()), @@ -957,12 +957,12 @@ fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_is_mul let checkpoint_part_1 = "00000000000000000001.checkpoint.0000000001.0000000002.parquet"; let checkpoint_part_2 = "00000000000000000001.checkpoint.0000000002.0000000002.parquet"; - let _ = add_checkpoint_to_store( + add_checkpoint_to_store( &store, sidecar_batch_with_given_paths(vec!["sidecar1.parquet"], v2_checkpoint_read_schema.clone()), checkpoint_part_1, )?; - let _ = add_checkpoint_to_store( + add_checkpoint_to_store( &store, sidecar_batch_with_given_paths(vec!["sidecar2.parquet"], v2_checkpoint_read_schema.clone()), checkpoint_part_2, @@ -1014,7 +1014,7 @@ fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch_without_sidecars ); let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; - let _ = add_checkpoint_to_store( + add_checkpoint_to_store( &store, add_batch_simple(get_log_add_schema().clone()), "00000000000000000001.checkpoint.parquet", @@ -1054,7 +1054,7 @@ fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidecars() ); let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; - let _ = add_json_checkpoint_to_store( + add_json_checkpoint_to_store( &store, vec![Action::Add(Add { path: "fake_path_1".into(), @@ -1062,7 +1062,7 @@ fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidecars() ..Default::default() })], "00000000000000000001.checkpoint.json", - ); + )?; let checkpoint_one_file = log_root .join("00000000000000000001.checkpoint.json")? 
@@ -1103,7 +1103,7 @@ fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batch let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; let sidecar_schema = get_log_add_schema(); - let _ = add_checkpoint_to_store( + add_checkpoint_to_store( &store, sidecar_batch_with_given_paths( vec!["sidecarfile1.parquet", "sidecarfile2.parquet"], @@ -1112,12 +1112,12 @@ fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batch "00000000000000000001.checkpoint.parquet", )?; - let _ = add_sidecar_to_store( + add_sidecar_to_store( &store, add_batch_simple(sidecar_schema.clone()), "sidecarfile1.parquet", )?; - let _ = add_sidecar_to_store( + add_sidecar_to_store( &store, add_batch_with_remove(sidecar_schema.clone()), "sidecarfile2.parquet", From 510dc35502ad5b2d64e5bcf58f7330386a956fe5 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 11 Feb 2025 13:53:03 -0800 Subject: [PATCH 23/37] link issue --- kernel/src/log_segment/tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index a7a192708..446f86e81 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -1000,7 +1000,7 @@ fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_is_mul Ok(()) } -// TODO: arrow v53.3 introduces bug surrounding nullabilty of fields in schema projection. +// TODO: Fix https://github.com/apache/arrow-rs/issues/7119 // Once addressed, pass the `v2_checkpoint_read_schema` to `add_batch_simple` instead of `get_log_add_schema()` // to ensure the issue is fixed #[test] From c49402cdd2e07b08adc7227aef7540941ab861b7 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 12 Feb 2025 11:02:52 -0800 Subject: [PATCH 24/37] nits --- kernel/src/log_segment.rs | 55 +++++++++++++++++++-------------------- kernel/src/scan/mod.rs | 3 ++- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs index 9ae5fbc78..ba3b241c9 100644 --- a/kernel/src/log_segment.rs +++ b/kernel/src/log_segment.rs @@ -277,37 +277,36 @@ impl LogSegment { }; let parquet_handler = engine.get_parquet_handler(); - Ok(actions - .map(move |batch_result| { - batch_result.and_then(|checkpoint_batch| { - // This closure maps the checkpoint batch to an iterator of batches - // by chaining the checkpoint batch with sidecar batches if they exist. - - // 1. In the case where the schema does not contain add/remove actions, we return the checkpoint - // batch directly as sidecar files only have to be read when the schema contains add/remove actions. - // 2. Multi-part checkpoint batches never have sidecar actions, so the batch is returned as-is. - let sidecar_content = if !need_add_actions || checkpoint_parts.len() > 1 { - None - } else { - Self::process_sidecars( - // cheap Arc clone - parquet_handler.clone(), - log_root.clone(), - checkpoint_batch.as_ref(), - )? - }; - - Ok(std::iter::once(Ok((checkpoint_batch, false))).chain( - sidecar_content - .into_iter() - .flatten() - .map_ok(|sidecar_batch| (sidecar_batch, false)), - )) - }) + let actions_iter = actions + .map(move |batch_result| -> DeltaResult<_> { + let checkpoint_batch = batch_result?; + // This closure maps the checkpoint batch to an iterator of batches + // by chaining the checkpoint batch with sidecar batches if they exist. + + // 1. 
In the case where the schema does not contain add/remove actions, we return the checkpoint + // batch directly as sidecar files only have to be read when the schema contains add/remove actions. + // 2. Multi-part checkpoint batches never have sidecar actions, so the batch is returned as-is. + let sidecar_content = if !need_add_actions || checkpoint_parts.len() > 1 { + None + } else { + Self::process_sidecars( + parquet_handler.clone(), // cheap Arc clone + log_root.clone(), + checkpoint_batch.as_ref(), + )? + }; + + let combined_batches = std::iter::once(Ok(checkpoint_batch)) + .chain(sidecar_content.into_iter().flatten()) + .map_ok(|sidecar_batch| (sidecar_batch, false)); + + Ok(combined_batches) }) .flatten_ok() // Map converts Result, _>,_> to Result, _> - .map(|result| result?)) + .map(|result| result?); + + Ok(actions_iter) } /// Processes sidecar files for the given checkpoint batch. diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 22f8c3a02..b4d6f51ef 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -667,6 +667,7 @@ pub(crate) mod test_utils { use arrow_array::{RecordBatch, StringArray}; use arrow_schema::{DataType, Field, Schema as ArrowSchema}; + use itertools::Itertools; use crate::{ actions::get_log_schema, @@ -704,7 +705,7 @@ pub(crate) mod test_utils { r#"{{"sidecar":{{"path":"{path}","sizeInBytes":9268,"modificationTime":1714496113961,"tags":{{"tag_foo":"tag_bar"}}}}}}"#, ) }) - .collect::>() + .collect_vec() .into(); let parsed = handler From 2fd8216e5d965c3cdd863394a56bed6fbd1dfed6 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 12 Feb 2025 11:15:25 -0800 Subject: [PATCH 25/37] nits --- kernel/src/log_segment/tests.rs | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index 446f86e81..99ac66244 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -5,6 +5,7 @@ use object_store::{memory::InMemory, path::Path, ObjectStore}; use parquet::arrow::ArrowWriter; use url::Url; +use crate::actions::visitors::AddVisitor; use crate::actions::{ get_log_add_schema, get_log_schema, Add, Sidecar, ADD_NAME, METADATA_NAME, SIDECAR_NAME, }; @@ -20,7 +21,7 @@ use crate::scan::test_utils::{ }; use crate::snapshot::CheckpointMetadata; use crate::utils::test_utils::{assert_batch_matches, Action}; -use crate::{DeltaResult, Engine, EngineData, FileMeta, FileSystemClient, Table}; +use crate::{DeltaResult, Engine, EngineData, FileMeta, FileSystemClient, RowVisitor, Table}; use test_utils::delta_path_for_version; // NOTE: In addition to testing the meta-predicate for metadata replay, this test also verifies @@ -130,7 +131,7 @@ fn new_in_memory_store() -> (Arc, Url) { } // Writes a record batch obtained from engine data to the in-memory store at a given path. -fn write_record_batch_to_store( +fn write_parquet_to_store( store: &Arc, path: String, data: Box, @@ -163,7 +164,7 @@ fn add_checkpoint_to_store( filename: &str, ) -> DeltaResult<()> { let path = format!("_delta_log/{}", filename); - write_record_batch_to_store(store, path, data) + write_parquet_to_store(store, path, data) } /// Writes all actions to a _delta_log/_sidecars file in the store. 
@@ -174,12 +175,12 @@ fn add_sidecar_to_store( filename: &str, ) -> DeltaResult<()> { let path = format!("_delta_log/_sidecars/{}", filename); - write_record_batch_to_store(store, path, data) + write_parquet_to_store(store, path, data) } /// Writes all actions to a _delta_log json checkpoint file in the store. /// This function formats the provided filename into the _delta_log directory. -fn add_json_checkpoint_to_store( +fn write_json_to_store( store: &Arc, actions: Vec, filename: &str, @@ -1054,7 +1055,7 @@ fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidecars() ); let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; - add_json_checkpoint_to_store( + write_json_to_store( &store, vec![Action::Add(Add { path: "fake_path_1".into(), @@ -1077,12 +1078,13 @@ fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidecars() )?; // Assert that the first batch returned is from reading checkpoint file 1 - let (_first_batch, is_log_batch) = iter.next().unwrap()?; + let (first_batch, is_log_batch) = iter.next().unwrap()?; assert!(!is_log_batch); - // Although we do not assert the contents, we know the JSON checkpoint is read correctly as - // a single batch is returned and no errors are thrown. + let mut visitor = AddVisitor::default(); + visitor.visit_rows_of(&*first_batch)?; + assert!(visitor.adds.len() == 1); + assert!(visitor.adds[0].path == "fake_path_1"); - // TODO: Convert JSON checkpoint to RecordBatch and assert that it is as expected for better testing assert!(iter.next().is_none()); Ok(()) From f8defbe5c08fdb391afe79825b78cc695bda0090 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 12 Feb 2025 14:17:38 -0800 Subject: [PATCH 26/37] test review --- kernel/src/actions/visitors.rs | 9 ++++++- kernel/src/log_segment/tests.rs | 43 ++++++++++++++++----------------- kernel/src/scan/mod.rs | 22 +++++++++++------ 3 files changed, 44 insertions(+), 30 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index d1af804ae..000fc743d 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -475,7 +475,14 @@ impl RowVisitor for SidecarVisitor { ); for i in 0..row_count { // Since path column is required, use it to detect presence of a sidecar action - if let Some(path) = getters[0].get_opt(i, "sidecar.path")? { + let opt_path: Option = getters[0].get_opt(i, "sidecar.path")?; + if let Some(path) = opt_path { + // TODO: Known issue here https://github.com/apache/arrow-rs/issues/7119 + // Once https://github.com/delta-io/delta-kernel-rs/pull/692 is merged, remove empty path check. + // This is a workaround to avoid constructing a sidecar action with an empty path for testing. 
+ if path.is_empty() { + continue; + } self.sidecars.push(Self::visit_sidecar(i, path, getters)?); } } diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index 99ac66244..677dc3aa7 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -791,16 +791,14 @@ fn test_checkpoint_batch_with_sidecars_returns_sidecar_batches() -> DeltaResult< Arc::new(TokioBackgroundExecutor::new()), ); - let sidecar_schema = get_log_add_schema().clone(); - add_sidecar_to_store( &store, - add_batch_simple(sidecar_schema.clone()), + add_batch_simple(get_log_schema().clone()), "sidecarfile1.parquet", )?; add_sidecar_to_store( &store, - add_batch_with_remove(sidecar_schema.clone()), + add_batch_with_remove(get_log_schema().clone()), "sidecarfile2.parquet", )?; @@ -818,6 +816,7 @@ fn test_checkpoint_batch_with_sidecars_returns_sidecar_batches() -> DeltaResult< .flatten(); // Assert the correctness of batches returned + let sidecar_schema = get_log_add_schema().clone(); assert_batch_matches( iter.next().unwrap()?, add_batch_simple(sidecar_schema.clone()), @@ -909,11 +908,10 @@ fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schema_has_ Path::from("/"), Arc::new(TokioBackgroundExecutor::new()), ); - let v2_checkpoint_read_schema = get_log_schema().project(&[METADATA_NAME])?; add_checkpoint_to_store( &store, // Create a checkpoint batch with sidecar actions to verify that the sidecar actions are not read. - sidecar_batch_with_given_paths(vec!["sidecar1.parquet"], v2_checkpoint_read_schema.clone()), + sidecar_batch_with_given_paths(vec!["sidecar1.parquet"], get_log_schema().clone()), "00000000000000000001.checkpoint.parquet", )?; @@ -921,6 +919,8 @@ fn test_create_checkpoint_stream_returns_checkpoint_batches_as_is_if_schema_has_ .join("00000000000000000001.checkpoint.parquet")? .to_string(); + let v2_checkpoint_read_schema = get_log_schema().project(&[METADATA_NAME])?; + let mut iter = LogSegment::create_checkpoint_stream( &engine, v2_checkpoint_read_schema.clone(), @@ -950,7 +950,6 @@ fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_is_mul Path::from("/"), Arc::new(TokioBackgroundExecutor::new()), ); - let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; // Multi-part checkpoints can never have sidecar actions. 
// We place batches with sidecar actions in multi-part checkpoints to verify we do not read the actions, as we @@ -960,19 +959,20 @@ fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_is_mul add_checkpoint_to_store( &store, - sidecar_batch_with_given_paths(vec!["sidecar1.parquet"], v2_checkpoint_read_schema.clone()), + sidecar_batch_with_given_paths(vec!["sidecar1.parquet"], get_log_schema().clone()), checkpoint_part_1, )?; add_checkpoint_to_store( &store, - sidecar_batch_with_given_paths(vec!["sidecar2.parquet"], v2_checkpoint_read_schema.clone()), + sidecar_batch_with_given_paths(vec!["sidecar2.parquet"], get_log_schema().clone()), checkpoint_part_2, )?; let checkpoint_one_file = log_root.join(checkpoint_part_1)?.to_string(); - let checkpoint_two_file = log_root.join(checkpoint_part_2)?.to_string(); + let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; + let mut iter = LogSegment::create_checkpoint_stream( &engine, v2_checkpoint_read_schema.clone(), @@ -1001,9 +1001,6 @@ fn test_create_checkpoint_stream_returns_checkpoint_batches_if_checkpoint_is_mul Ok(()) } -// TODO: Fix https://github.com/apache/arrow-rs/issues/7119 -// Once addressed, pass the `v2_checkpoint_read_schema` to `add_batch_simple` instead of `get_log_add_schema()` -// to ensure the issue is fixed #[test] fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch_without_sidecars() -> DeltaResult<()> { @@ -1013,11 +1010,10 @@ fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch_without_sidecars Path::from("/"), Arc::new(TokioBackgroundExecutor::new()), ); - let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; add_checkpoint_to_store( &store, - add_batch_simple(get_log_add_schema().clone()), + add_batch_simple(get_log_schema().clone()), "00000000000000000001.checkpoint.parquet", )?; @@ -1025,6 +1021,8 @@ fn test_create_checkpoint_stream_reads_parquet_checkpoint_batch_without_sidecars .join("00000000000000000001.checkpoint.parquet")? .to_string(); + let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; + let mut iter = LogSegment::create_checkpoint_stream( &engine, v2_checkpoint_read_schema.clone(), @@ -1053,7 +1051,6 @@ fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidecars() Path::from("/"), Arc::new(TokioBackgroundExecutor::new()), ); - let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; write_json_to_store( &store, @@ -1069,6 +1066,8 @@ fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidecars() .join("00000000000000000001.checkpoint.json")? 
.to_string(); + let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; + let mut iter = LogSegment::create_checkpoint_stream( &engine, v2_checkpoint_read_schema.clone(), @@ -1102,26 +1101,24 @@ fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batch Path::from("/"), Arc::new(TokioBackgroundExecutor::new()), ); - let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; - let sidecar_schema = get_log_add_schema(); add_checkpoint_to_store( &store, sidecar_batch_with_given_paths( vec!["sidecarfile1.parquet", "sidecarfile2.parquet"], - get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?, + get_log_schema().clone(), ), "00000000000000000001.checkpoint.parquet", )?; add_sidecar_to_store( &store, - add_batch_simple(sidecar_schema.clone()), + add_batch_simple(get_log_schema().clone()), "sidecarfile1.parquet", )?; add_sidecar_to_store( &store, - add_batch_with_remove(sidecar_schema.clone()), + add_batch_with_remove(get_log_schema().clone()), "sidecarfile2.parquet", )?; @@ -1129,6 +1126,8 @@ fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batch .join("00000000000000000001.checkpoint.parquet")? .to_string(); + let v2_checkpoint_read_schema = get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?; + let mut iter = LogSegment::create_checkpoint_stream( &engine, v2_checkpoint_read_schema.clone(), @@ -1147,8 +1146,8 @@ fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batch get_log_schema().project(&[ADD_NAME, SIDECAR_NAME])?, ), ); - // Assert that the second batch returned is from reading sidecarfile1 + let sidecar_schema = get_log_add_schema(); let (second_batch, is_log_batch) = iter.next().unwrap()?; assert!(!is_log_batch); assert_batch_matches(second_batch, add_batch_simple(sidecar_schema.clone())); diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index b4d6f51ef..9dd8095a9 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -691,31 +691,39 @@ pub(crate) mod test_utils { Box::new(ArrowEngineData::new(batch)) } - // simple sidecar + // Generates a batch of sidecar actions with the given paths. + // The schema is provided as null columns affect equality checks. 
pub(crate) fn sidecar_batch_with_given_paths( paths: Vec<&str>, output_schema: SchemaRef, ) -> Box { let handler = SyncJsonHandler {}; - let json_strings: StringArray = paths + let mut json_strings: Vec = paths .iter() .map(|path| { format!( - r#"{{"sidecar":{{"path":"{path}","sizeInBytes":9268,"modificationTime":1714496113961,"tags":{{"tag_foo":"tag_bar"}}}}}}"#, + r#"{{"sidecar":{{"path":"{path}","sizeInBytes":9268,"modificationTime":1714496113961,"tags":{{"tag_foo":"tag_bar"}}}}}}"# ) }) - .collect_vec() - .into(); + .collect(); + json_strings.push(r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}}"#.to_string()); + + let json_strings_array: StringArray = + json_strings.iter().map(|s| s.as_str()).collect_vec().into(); let parsed = handler - .parse_json(string_array_to_engine_data(json_strings), output_schema) + .parse_json( + string_array_to_engine_data(json_strings_array), + output_schema, + ) .unwrap(); ArrowEngineData::try_from_engine_data(parsed).unwrap() } - // A simple add batch parsed with the schema provided + // Generates a batch with an add action. + // The schema is provided as null columns affect equality checks. pub(crate) fn add_batch_simple(output_schema: SchemaRef) -> Box { let handler = SyncJsonHandler {}; let json_strings: StringArray = vec![ From 638387c846a9cc944e1a1da283c3d6a531bd1e5d Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 12 Feb 2025 15:24:29 -0800 Subject: [PATCH 27/37] nits --- kernel/src/log_segment.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs index ba3b241c9..2b2cfaf9d 100644 --- a/kernel/src/log_segment.rs +++ b/kernel/src/log_segment.rs @@ -249,7 +249,7 @@ impl LogSegment { let need_add_actions = checkpoint_read_schema.contains(ADD_NAME); require!( !need_add_actions || checkpoint_read_schema.contains(SIDECAR_NAME), - Error::generic( + Error::invalid_checkpoint( "If the checkpoint read schema contains file actions, it must contain the sidecar column" ) ); @@ -333,12 +333,10 @@ impl LogSegment { .map(|sidecar| Self::sidecar_to_filemeta(sidecar, &log_root)) .try_collect()?; - let sidecar_read_schema = get_log_add_schema().clone(); - // Read the sidecar files and return an iterator of sidecar file batches Ok(Some(parquet_handler.read_parquet_files( &sidecar_files, - sidecar_read_schema, + get_log_add_schema().clone(), None, )?)) } From 4984655d018c2290b3118d205dd8fc563ed64039 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Thu, 13 Feb 2025 14:28:36 -0800 Subject: [PATCH 28/37] remove debug statements --- kernel/src/log_segment/tests.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index 677dc3aa7..7aa2caed0 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -143,15 +143,10 @@ fn write_parquet_to_store( let mut writer = ArrowWriter::try_new(&mut buffer, record_batch.schema(), None)?; writer.write(record_batch)?; writer.close()?; - println!("Writing to path: {}", path); tokio::runtime::Runtime::new() .expect("create tokio runtime") - .block_on(async { - if let Err(e) = store.put(&Path::from(path), buffer.into()).await { - eprintln!("Error writing to 
store: {}", e); - } - }); + .block_on(async { store.put(&Path::from(path), buffer.into()).await })?; Ok(()) } From a77333cac4cf3b99d6025ab79f7955a6c6e5ba62 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 18 Feb 2025 19:47:17 -0800 Subject: [PATCH 29/37] review --- kernel/src/actions/visitors.rs | 13 +++---------- kernel/src/log_segment.rs | 18 +++++++++++------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index 000fc743d..bacc1ac57 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -352,7 +352,7 @@ impl RowVisitor for CdcVisitor { )) ); for i in 0..row_count { - // Since path column is required, use it to detect presence of an Add action + // Since path column is required, use it to detect presence of an Cdc action if let Some(path) = getters[0].get_opt(i, "cdc.path")? { self.cdcs.push(Self::visit_cdc(i, path, getters)?); } @@ -474,15 +474,8 @@ impl RowVisitor for SidecarVisitor { )) ); for i in 0..row_count { - // Since path column is required, use it to detect presence of a sidecar action - let opt_path: Option = getters[0].get_opt(i, "sidecar.path")?; - if let Some(path) = opt_path { - // TODO: Known issue here https://github.com/apache/arrow-rs/issues/7119 - // Once https://github.com/delta-io/delta-kernel-rs/pull/692 is merged, remove empty path check. - // This is a workaround to avoid constructing a sidecar action with an empty path for testing. - if path.is_empty() { - continue; - } + // Since path column is required, use it to detect presence of an Sidecar action + if let Some(path) = getters[0].get_opt(i, "sidecar.path")? { self.sidecars.push(Self::visit_sidecar(i, path, getters)?); } } diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs index 2b2cfaf9d..e44f591b4 100644 --- a/kernel/src/log_segment.rs +++ b/kernel/src/log_segment.rs @@ -259,6 +259,12 @@ impl LogSegment { .map(|f| f.location.clone()) .collect(); + let parquet_handler = engine.get_parquet_handler(); + + // Historically, we had a shared file reader trait for JSON and Parquet handlers, + // but it was removed to avoid unnecessary coupling. This is a concrete case + // where it *could* have been useful, but for now, we're keeping them separate. + // If similar patterns start appearing elsewhere, we should reconsider that decision. let actions = if checkpoint_parts .first() .is_some_and(|p| p.extension == "json") @@ -269,14 +275,13 @@ impl LogSegment { meta_predicate, )? } else { - engine.get_parquet_handler().read_parquet_files( + parquet_handler.read_parquet_files( &checkpoint_file_meta, checkpoint_read_schema, meta_predicate, )? }; - let parquet_handler = engine.get_parquet_handler(); let actions_iter = actions .map(move |batch_result| -> DeltaResult<_> { let checkpoint_batch = batch_result?; @@ -286,14 +291,14 @@ impl LogSegment { // 1. In the case where the schema does not contain add/remove actions, we return the checkpoint // batch directly as sidecar files only have to be read when the schema contains add/remove actions. // 2. Multi-part checkpoint batches never have sidecar actions, so the batch is returned as-is. - let sidecar_content = if !need_add_actions || checkpoint_parts.len() > 1 { - None - } else { + let sidecar_content = if need_add_actions && checkpoint_parts.len() == 1 { Self::process_sidecars( parquet_handler.clone(), // cheap Arc clone log_root.clone(), checkpoint_batch.as_ref(), )? 
+ } else { + None }; let combined_batches = std::iter::once(Ok(checkpoint_batch)) @@ -303,8 +308,7 @@ impl LogSegment { Ok(combined_batches) }) .flatten_ok() - // Map converts Result, _>,_> to Result, _> - .map(|result| result?); + .map(|result| result?); // result-result to result Ok(actions_iter) } From b25bb58216549ede645d1202c6e599b167652bb5 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 19 Feb 2025 10:11:16 -0800 Subject: [PATCH 30/37] comments & review --- kernel/src/log_segment.rs | 11 ++++++----- kernel/src/log_segment/tests.rs | 20 ++++++++++++-------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs index e44f591b4..33253327b 100644 --- a/kernel/src/log_segment.rs +++ b/kernel/src/log_segment.rs @@ -231,9 +231,10 @@ impl LogSegment { /// Returns an iterator over checkpoint data, processing sidecar files when necessary. /// - /// Checkpoint data is returned directly if: - /// - Processing a multi-part checkpoint - /// - Schema does not contain file actions + /// By default, `create_checkpoint_stream` checks for the presence of sidecar files, and + /// reads their contents if present. Checking for sidecar files is skipped if: + /// - The checkpoint is a multi-part checkpoint + /// - The checkpoint read schema does not contain the add action /// /// For single-part checkpoints, any referenced sidecar files are processed. These /// sidecar files contain the actual add actions that would otherwise be @@ -288,8 +289,8 @@ impl LogSegment { // This closure maps the checkpoint batch to an iterator of batches // by chaining the checkpoint batch with sidecar batches if they exist. - // 1. In the case where the schema does not contain add/remove actions, we return the checkpoint - // batch directly as sidecar files only have to be read when the schema contains add/remove actions. + // 1. In the case where the schema does not contain the add action, we return the checkpoint + // batch directly as sidecar files only have to be read when the schema contains the add action. // 2. Multi-part checkpoint batches never have sidecar actions, so the batch is returned as-is. 
let sidecar_content = if need_add_actions && checkpoint_parts.len() == 1 { Self::process_sidecars( diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index 7aa2caed0..53e284fe9 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -7,7 +7,8 @@ use url::Url; use crate::actions::visitors::AddVisitor; use crate::actions::{ - get_log_add_schema, get_log_schema, Add, Sidecar, ADD_NAME, METADATA_NAME, SIDECAR_NAME, + get_log_add_schema, get_log_schema, Add, Sidecar, ADD_NAME, METADATA_NAME, REMOVE_NAME, + SIDECAR_NAME, }; use crate::engine::arrow_data::ArrowEngineData; use crate::engine::default::executor::tokio::TokioBackgroundExecutor; @@ -788,12 +789,12 @@ fn test_checkpoint_batch_with_sidecars_returns_sidecar_batches() -> DeltaResult< add_sidecar_to_store( &store, - add_batch_simple(get_log_schema().clone()), + add_batch_simple(get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?), "sidecarfile1.parquet", )?; add_sidecar_to_store( &store, - add_batch_with_remove(get_log_schema().clone()), + add_batch_with_remove(get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?), "sidecarfile2.parquet", )?; @@ -1084,9 +1085,12 @@ fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidecars() Ok(()) } -// Encapsulates logic that has already been tested but tests the interaction between the functions, -// such as performing a map operation on the returned sidecar batches from `process_sidecars` -// to include the is_log_batch flag +// Tests the end-to-end process of creating a checkpoint stream. +// Verifies that: +// - The checkpoint file is read and produces batches containing references to sidecar files. +// - As sidecar references are present, the corresponding sidecar files are processed correctly. +// - Batches from both the checkpoint file and sidecar files are returned. +// - Each returned batch is correctly flagged with is_log_batch set to false #[test] fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batches( ) -> DeltaResult<()> { @@ -1108,12 +1112,12 @@ fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batch add_sidecar_to_store( &store, - add_batch_simple(get_log_schema().clone()), + add_batch_simple(get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?), "sidecarfile1.parquet", )?; add_sidecar_to_store( &store, - add_batch_with_remove(get_log_schema().clone()), + add_batch_with_remove(get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?), "sidecarfile2.parquet", )?; From 93971c5459aa3b95b4e750c985c2cbf5826cc307 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 19 Feb 2025 10:57:20 -0800 Subject: [PATCH 31/37] typo --- kernel/src/actions/visitors.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index bacc1ac57..de9d4f389 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -474,7 +474,7 @@ impl RowVisitor for SidecarVisitor { )) ); for i in 0..row_count { - // Since path column is required, use it to detect presence of an Sidecar action + // Since path column is required, use it to detect presence of a Sidecar action if let Some(path) = getters[0].get_opt(i, "sidecar.path")? 
{ self.sidecars.push(Self::visit_sidecar(i, path, getters)?); } From 501c675736dd102a691bc2132c6e81579cf4a1a6 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 19 Feb 2025 11:14:58 -0800 Subject: [PATCH 32/37] typo --- kernel/src/actions/visitors.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/actions/visitors.rs b/kernel/src/actions/visitors.rs index de9d4f389..3d623cdce 100644 --- a/kernel/src/actions/visitors.rs +++ b/kernel/src/actions/visitors.rs @@ -352,7 +352,7 @@ impl RowVisitor for CdcVisitor { )) ); for i in 0..row_count { - // Since path column is required, use it to detect presence of an Cdc action + // Since path column is required, use it to detect presence of a Cdc action if let Some(path) = getters[0].get_opt(i, "cdc.path")? { self.cdcs.push(Self::visit_cdc(i, path, getters)?); } From 0b9dc43afa784d16583177935020d2e0b69fdff7 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Fri, 7 Feb 2025 12:32:38 -0800 Subject: [PATCH 33/37] snapshot creation with v2checkpoints mvp This reverts commit 3b0b2a034263421c9da51e225200f00b0ff2cffa. --- kernel/src/actions/mod.rs | 8 ++-- kernel/src/log_segment/tests.rs | 79 +++++++++++++++++++++++++++++-- kernel/src/path.rs | 20 +++----- kernel/src/table_configuration.rs | 4 +- kernel/src/table_features/mod.rs | 7 +++ 5 files changed, 94 insertions(+), 24 deletions(-) diff --git a/kernel/src/actions/mod.rs b/kernel/src/actions/mod.rs index 02d30163a..108b501a3 100644 --- a/kernel/src/actions/mod.rs +++ b/kernel/src/actions/mod.rs @@ -782,7 +782,7 @@ mod tests { } #[test] - fn test_v2_checkpoint_unsupported() { + fn test_v2_checkpoint_supported() { let protocol = Protocol::try_new( 3, 7, @@ -790,7 +790,7 @@ mod tests { Some([ReaderFeatures::V2Checkpoint]), ) .unwrap(); - assert!(protocol.ensure_read_supported().is_err()); + assert!(protocol.ensure_read_supported().is_ok()); let protocol = Protocol::try_new( 4, @@ -820,7 +820,7 @@ mod tests { Some(&empty_features), ) .unwrap(); - assert!(protocol.ensure_read_supported().is_err()); + assert!(protocol.ensure_read_supported().is_ok()); let protocol = Protocol::try_new( 3, @@ -838,7 +838,7 @@ mod tests { Some([WriterFeatures::V2Checkpoint]), ) .unwrap(); - assert!(protocol.ensure_read_supported().is_err()); + assert!(protocol.ensure_read_supported().is_ok()); let protocol = Protocol { min_reader_version: 1, diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index 53e284fe9..09c8aebb0 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -210,7 +210,7 @@ fn create_log_path(path: &str) -> ParsedLogPath { } #[test] -fn build_snapshot_with_unsupported_uuid_checkpoint() { +fn build_snapshot_with_uuid_checkpoint_parquet() { let (client, log_root) = build_log_with_paths_and_checkpoint( &[ delta_path_for_version(0, "json"), @@ -225,18 +225,88 @@ fn build_snapshot_with_unsupported_uuid_checkpoint() { ], None, ); + let log_segment = LogSegment::for_snapshot(client.as_ref(), log_root, None, None).unwrap(); let commit_files = log_segment.ascending_commit_files; let checkpoint_parts = log_segment.checkpoint_parts; assert_eq!(checkpoint_parts.len(), 1); - assert_eq!(checkpoint_parts[0].version, 3); + assert_eq!(checkpoint_parts[0].version, 5); let versions = commit_files.into_iter().map(|x| x.version).collect_vec(); - let expected_versions = vec![4, 5, 6, 7]; + let expected_versions = vec![6, 7]; + assert_eq!(versions, expected_versions); +} + +#[test] +fn build_snapshot_with_uuid_checkpoint_json() { + let (client, 
log_root) = build_log_with_paths_and_checkpoint( + &[ + delta_path_for_version(0, "json"), + delta_path_for_version(1, "checkpoint.parquet"), + delta_path_for_version(2, "json"), + delta_path_for_version(3, "checkpoint.parquet"), + delta_path_for_version(4, "json"), + delta_path_for_version(5, "json"), + delta_path_for_version(5, "checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.json"), + delta_path_for_version(6, "json"), + delta_path_for_version(7, "json"), + ], + None, + ); + + let log_segment = LogSegment::for_snapshot(client.as_ref(), log_root, None, None).unwrap(); + let commit_files = log_segment.ascending_commit_files; + let checkpoint_parts = log_segment.checkpoint_parts; + + assert_eq!(checkpoint_parts.len(), 1); + assert_eq!(checkpoint_parts[0].version, 5); + + let versions = commit_files.into_iter().map(|x| x.version).collect_vec(); + let expected_versions = vec![6, 7]; assert_eq!(versions, expected_versions); } +#[test] +fn build_snapshot_with_correct_last_uuid_checkpoint() { + let checkpoint_metadata = CheckpointMetadata { + version: 5, + size: 10, + parts: Some(1), + size_in_bytes: None, + num_of_add_files: None, + checkpoint_schema: None, + checksum: None, + }; + + let (client, log_root) = build_log_with_paths_and_checkpoint( + &[ + delta_path_for_version(0, "json"), + delta_path_for_version(1, "checkpoint.parquet"), + delta_path_for_version(1, "json"), + delta_path_for_version(2, "json"), + delta_path_for_version(3, "checkpoint.parquet"), + delta_path_for_version(3, "json"), + delta_path_for_version(4, "json"), + delta_path_for_version(5, "checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.parquet"), + delta_path_for_version(5, "json"), + delta_path_for_version(6, "json"), + delta_path_for_version(7, "json"), + ], + Some(&checkpoint_metadata), + ); + + let log_segment = + LogSegment::for_snapshot(client.as_ref(), log_root, checkpoint_metadata, None).unwrap(); + let commit_files = log_segment.ascending_commit_files; + let checkpoint_parts = log_segment.checkpoint_parts; + + assert_eq!(checkpoint_parts.len(), 1); + assert_eq!(commit_files.len(), 2); + assert_eq!(checkpoint_parts[0].version, 5); + assert_eq!(commit_files[0].version, 6); + assert_eq!(commit_files[1].version, 7); +} #[test] fn build_snapshot_with_multiple_incomplete_multipart_checkpoints() { let (client, log_root) = build_log_with_paths_and_checkpoint( @@ -1121,7 +1191,8 @@ fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batch "sidecarfile2.parquet", )?; - let checkpoint_file_path = log_root + let checkpoint_one_file = mock_table + .log_root() .join("00000000000000000001.checkpoint.parquet")? .to_string(); diff --git a/kernel/src/path.rs b/kernel/src/path.rs index 23e7819de..200eb0d09 100644 --- a/kernel/src/path.rs +++ b/kernel/src/path.rs @@ -166,7 +166,9 @@ impl ParsedLogPath { // TODO: Include UuidCheckpoint once we actually support v2 checkpoints matches!( self.file_type, - LogPathFileType::SinglePartCheckpoint | LogPathFileType::MultiPartCheckpoint { .. } + LogPathFileType::SinglePartCheckpoint + | LogPathFileType::MultiPartCheckpoint { .. 
} + | LogPathFileType::UuidCheckpoint(_) ) } @@ -174,11 +176,7 @@ impl ParsedLogPath { #[cfg_attr(not(feature = "developer-visibility"), visibility::make(pub(crate)))] #[allow(dead_code)] // currently only used in tests, which don't "count" fn is_unknown(&self) -> bool { - // TODO: Stop treating UuidCheckpoint as unknown once we support v2 checkpoints - matches!( - self.file_type, - LogPathFileType::Unknown | LogPathFileType::UuidCheckpoint(_) - ) + matches!(self.file_type, LogPathFileType::Unknown) } } @@ -357,10 +355,7 @@ mod tests { LogPathFileType::UuidCheckpoint(ref u) if u == "3a0d65cd-4056-49b8-937b-95f9e3ee90e5", )); assert!(!log_path.is_commit()); - - // TODO: Support v2 checkpoints! Until then we can't treat these as checkpoint files. - assert!(!log_path.is_checkpoint()); - assert!(log_path.is_unknown()); + assert!(log_path.is_checkpoint()); let log_path = table_log_dir .join("00000000000000000002.checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.json") @@ -377,10 +372,7 @@ mod tests { LogPathFileType::UuidCheckpoint(ref u) if u == "3a0d65cd-4056-49b8-937b-95f9e3ee90e5", )); assert!(!log_path.is_commit()); - - // TODO: Support v2 checkpoints! Until then we can't treat these as checkpoint files. - assert!(!log_path.is_checkpoint()); - assert!(log_path.is_unknown()); + assert!(log_path.is_checkpoint()); let log_path = table_log_dir .join("00000000000000000002.checkpoint.3a0d65cd-4056-49b8-937b-95f9e3ee90e5.foo") diff --git a/kernel/src/table_configuration.rs b/kernel/src/table_configuration.rs index 565546d52..22447554b 100644 --- a/kernel/src/table_configuration.rs +++ b/kernel/src/table_configuration.rs @@ -262,8 +262,8 @@ mod test { let protocol = Protocol::try_new( 3, 7, - Some([ReaderFeatures::V2Checkpoint]), - Some([WriterFeatures::V2Checkpoint]), + Some([ReaderFeatures::UnsupportedFeature]), + Some([WriterFeatures::UnsupportedFeature]), ) .unwrap(); let table_root = Url::try_from("file:///").unwrap(); diff --git a/kernel/src/table_features/mod.rs b/kernel/src/table_features/mod.rs index ee27fc17e..bc5499501 100644 --- a/kernel/src/table_features/mod.rs +++ b/kernel/src/table_features/mod.rs @@ -48,6 +48,8 @@ pub enum ReaderFeatures { /// vacuumProtocolCheck ReaderWriter feature ensures consistent application of reader and writer /// protocol checks during VACUUM operations VacuumProtocolCheck, + /// A dummy variant used to represent an unsupported feature for testing purposes + UnsupportedFeature, } /// Similar to reader features, writer features communicate capabilities that must be implemented @@ -109,6 +111,8 @@ pub enum WriterFeatures { /// vacuumProtocolCheck ReaderWriter feature ensures consistent application of reader and writer /// protocol checks during VACUUM operations VacuumProtocolCheck, + /// A dummy variant used to represent an unsupported feature for testing purposes + UnsupportedFeature, } impl From for String { @@ -133,6 +137,7 @@ pub(crate) static SUPPORTED_READER_FEATURES: LazyLock> = ReaderFeatures::TypeWidening, ReaderFeatures::TypeWideningPreview, ReaderFeatures::VacuumProtocolCheck, + ReaderFeatures::V2Checkpoint, ]) }); @@ -154,6 +159,7 @@ mod tests { (ReaderFeatures::TypeWideningPreview, "typeWidening-preview"), (ReaderFeatures::V2Checkpoint, "v2Checkpoint"), (ReaderFeatures::VacuumProtocolCheck, "vacuumProtocolCheck"), + (ReaderFeatures::UnsupportedFeature, "unsupportedFeature"), ]; assert_eq!(ReaderFeatures::VARIANTS.len(), cases.len()); @@ -192,6 +198,7 @@ mod tests { (WriterFeatures::IcebergCompatV1, "icebergCompatV1"), 
(WriterFeatures::IcebergCompatV2, "icebergCompatV2"), (WriterFeatures::VacuumProtocolCheck, "vacuumProtocolCheck"), + (WriterFeatures::UnsupportedFeature, "unsupportedFeature"), ]; assert_eq!(WriterFeatures::VARIANTS.len(), cases.len()); From d92e141611db921dbdf47e102bbad717e8dd03e1 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Sun, 9 Feb 2025 22:07:39 -0800 Subject: [PATCH 34/37] file name change?? --- kernel/src/log_segment/tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index 09c8aebb0..827ba0101 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -1191,7 +1191,7 @@ fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batch "sidecarfile2.parquet", )?; - let checkpoint_one_file = mock_table + let checkpoint_file_path = mock_table .log_root() .join("00000000000000000001.checkpoint.parquet")? .to_string(); From e5748e1810cda50d47d53b2f8a919bed95d25d15 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Tue, 11 Feb 2025 19:54:21 -0800 Subject: [PATCH 35/37] fix merge conflict errors --- kernel/src/log_segment/tests.rs | 3 +-- kernel/tests/golden_tables.rs | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index 827ba0101..756780982 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -1191,8 +1191,7 @@ fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batch "sidecarfile2.parquet", )?; - let checkpoint_file_path = mock_table - .log_root() + let checkpoint_file_path = log_root .join("00000000000000000001.checkpoint.parquet")? .to_string(); diff --git a/kernel/tests/golden_tables.rs b/kernel/tests/golden_tables.rs index 120271ef2..e3394a38f 100644 --- a/kernel/tests/golden_tables.rs +++ b/kernel/tests/golden_tables.rs @@ -408,9 +408,8 @@ golden_test!("time-travel-schema-changes-b", latest_snapshot_test); golden_test!("time-travel-start", latest_snapshot_test); golden_test!("time-travel-start-start20", latest_snapshot_test); golden_test!("time-travel-start-start20-start40", latest_snapshot_test); - -skip_test!("v2-checkpoint-json": "v2 checkpoint not supported"); -skip_test!("v2-checkpoint-parquet": "v2 checkpoint not supported"); +golden_test!("v2-checkpoint-json", latest_snapshot_test); +golden_test!("v2-checkpoint-parquet", latest_snapshot_test); // BUG: // - AddFile: 'file:/some/unqualified/absolute/path' From a035f8c6b89403a11a8d52acded9485c2456fc32 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 19 Feb 2025 10:40:12 -0800 Subject: [PATCH 36/37] review & nits --- kernel/src/scan/mod.rs | 1 + kernel/src/table_configuration.rs | 8 ++++++-- kernel/src/table_features/mod.rs | 14 ++++++++------ kernel/src/utils.rs | 8 +++----- 4 files changed, 18 insertions(+), 13 deletions(-) diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 9dd8095a9..692267674 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ -699,6 +699,7 @@ pub(crate) mod test_utils { ) -> Box { let handler = SyncJsonHandler {}; + let json_strings = paths let mut json_strings: Vec = paths .iter() .map(|path| { diff --git a/kernel/src/table_configuration.rs b/kernel/src/table_configuration.rs index 22447554b..9521c5303 100644 --- a/kernel/src/table_configuration.rs +++ b/kernel/src/table_configuration.rs @@ -262,8 +262,12 @@ mod test { let protocol = Protocol::try_new( 3, 7, - 
Some([ReaderFeatures::UnsupportedFeature]), - Some([WriterFeatures::UnsupportedFeature]), + Some([ReaderFeatures::UnrecognizedReaderFeature( + "unrecognizedReaderFeature".to_string(), + )]), + Some([WriterFeatures::UnrecognizedWriterFeature( + "unrecognizedWriterFeature".to_string(), + )]), ) .unwrap(); let table_root = Url::try_from("file:///").unwrap(); diff --git a/kernel/src/table_features/mod.rs b/kernel/src/table_features/mod.rs index bc5499501..a9f6f7c76 100644 --- a/kernel/src/table_features/mod.rs +++ b/kernel/src/table_features/mod.rs @@ -49,7 +49,8 @@ pub enum ReaderFeatures { /// protocol checks during VACUUM operations VacuumProtocolCheck, /// A dummy variant used to represent an unsupported feature for testing purposes - UnsupportedFeature, + #[cfg(test)] + UnrecognizedReaderFeature(String), } /// Similar to reader features, writer features communicate capabilities that must be implemented @@ -112,7 +113,8 @@ pub enum WriterFeatures { /// protocol checks during VACUUM operations VacuumProtocolCheck, /// A dummy variant used to represent an unsupported feature for testing purposes - UnsupportedFeature, + #[cfg(test)] + UnrecognizedWriterFeature(String), } impl From for String { @@ -149,6 +151,7 @@ pub(crate) static SUPPORTED_WRITER_FEATURES: LazyLock> = mod tests { use super::*; + // TODO: Test the UnrecognizedReaderFeature variant #[test] fn test_roundtrip_reader_features() { let cases = [ @@ -159,10 +162,9 @@ mod tests { (ReaderFeatures::TypeWideningPreview, "typeWidening-preview"), (ReaderFeatures::V2Checkpoint, "v2Checkpoint"), (ReaderFeatures::VacuumProtocolCheck, "vacuumProtocolCheck"), - (ReaderFeatures::UnsupportedFeature, "unsupportedFeature"), ]; - assert_eq!(ReaderFeatures::VARIANTS.len(), cases.len()); + assert_eq!(ReaderFeatures::VARIANTS.len() - 1, cases.len()); for ((feature, expected), name) in cases.into_iter().zip(ReaderFeatures::VARIANTS) { assert_eq!(*name, expected); @@ -178,6 +180,7 @@ mod tests { } } + // TODO: Test the UnrecognizedWriterFeature variant #[test] fn test_roundtrip_writer_features() { let cases = [ @@ -198,10 +201,9 @@ mod tests { (WriterFeatures::IcebergCompatV1, "icebergCompatV1"), (WriterFeatures::IcebergCompatV2, "icebergCompatV2"), (WriterFeatures::VacuumProtocolCheck, "vacuumProtocolCheck"), - (WriterFeatures::UnsupportedFeature, "unsupportedFeature"), ]; - assert_eq!(WriterFeatures::VARIANTS.len(), cases.len()); + assert_eq!(WriterFeatures::VARIANTS.len() - 1, cases.len()); for ((feature, expected), name) in cases.into_iter().zip(WriterFeatures::VARIANTS) { assert_eq!(*name, expected); diff --git a/kernel/src/utils.rs b/kernel/src/utils.rs index f8d0bfe27..6ac4a6499 100644 --- a/kernel/src/utils.rs +++ b/kernel/src/utils.rs @@ -22,11 +22,9 @@ pub(crate) mod test_utils { use tempfile::TempDir; use test_utils::delta_path_for_version; - use crate::{ - actions::{Add, Cdc, CommitInfo, Metadata, Protocol, Remove}, - engine::arrow_data::ArrowEngineData, - EngineData, - }; + use crate::actions::{Add, Cdc, CommitInfo, Metadata, Protocol, Remove}; + use crate::engine::arrow_data::ArrowEngineData; + use crate::EngineData; #[derive(Serialize)] pub(crate) enum Action { From 3dcd0859be048dc05f3e98223d0950e460633b60 Mon Sep 17 00:00:00 2001 From: sebastian tia Date: Wed, 19 Feb 2025 10:51:36 -0800 Subject: [PATCH 37/37] merge error --- kernel/src/scan/mod.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/src/scan/mod.rs b/kernel/src/scan/mod.rs index 692267674..9dd8095a9 100644 --- a/kernel/src/scan/mod.rs +++ b/kernel/src/scan/mod.rs @@ 
-699,7 +699,6 @@ pub(crate) mod test_utils { ) -> Box { let handler = SyncJsonHandler {}; - let json_strings = paths let mut json_strings: Vec = paths .iter() .map(|path| {