diff --git a/kernel/src/log_segment.rs b/kernel/src/log_segment.rs index e44f591b4..33253327b 100644 --- a/kernel/src/log_segment.rs +++ b/kernel/src/log_segment.rs @@ -231,9 +231,10 @@ impl LogSegment { /// Returns an iterator over checkpoint data, processing sidecar files when necessary. /// - /// Checkpoint data is returned directly if: - /// - Processing a multi-part checkpoint - /// - Schema does not contain file actions + /// By default, `create_checkpoint_stream` checks for the presence of sidecar files, and + /// reads their contents if present. Checking for sidecar files is skipped if: + /// - The checkpoint is a multi-part checkpoint + /// - The checkpoint read schema does not contain the add action /// /// For single-part checkpoints, any referenced sidecar files are processed. These /// sidecar files contain the actual add actions that would otherwise be @@ -288,8 +289,8 @@ impl LogSegment { // This closure maps the checkpoint batch to an iterator of batches // by chaining the checkpoint batch with sidecar batches if they exist. - // 1. In the case where the schema does not contain add/remove actions, we return the checkpoint - // batch directly as sidecar files only have to be read when the schema contains add/remove actions. + // 1. In the case where the schema does not contain the add action, we return the checkpoint + // batch directly as sidecar files only have to be read when the schema contains the add action. // 2. Multi-part checkpoint batches never have sidecar actions, so the batch is returned as-is. let sidecar_content = if need_add_actions && checkpoint_parts.len() == 1 { Self::process_sidecars( diff --git a/kernel/src/log_segment/tests.rs b/kernel/src/log_segment/tests.rs index 7aa2caed0..53e284fe9 100644 --- a/kernel/src/log_segment/tests.rs +++ b/kernel/src/log_segment/tests.rs @@ -7,7 +7,8 @@ use url::Url; use crate::actions::visitors::AddVisitor; use crate::actions::{ - get_log_add_schema, get_log_schema, Add, Sidecar, ADD_NAME, METADATA_NAME, SIDECAR_NAME, + get_log_add_schema, get_log_schema, Add, Sidecar, ADD_NAME, METADATA_NAME, REMOVE_NAME, + SIDECAR_NAME, }; use crate::engine::arrow_data::ArrowEngineData; use crate::engine::default::executor::tokio::TokioBackgroundExecutor; @@ -788,12 +789,12 @@ fn test_checkpoint_batch_with_sidecars_returns_sidecar_batches() -> DeltaResult< add_sidecar_to_store( &store, - add_batch_simple(get_log_schema().clone()), + add_batch_simple(get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?), "sidecarfile1.parquet", )?; add_sidecar_to_store( &store, - add_batch_with_remove(get_log_schema().clone()), + add_batch_with_remove(get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?), "sidecarfile2.parquet", )?; @@ -1084,9 +1085,12 @@ fn test_create_checkpoint_stream_reads_json_checkpoint_batch_without_sidecars() Ok(()) } -// Encapsulates logic that has already been tested but tests the interaction between the functions, -// such as performing a map operation on the returned sidecar batches from `process_sidecars` -// to include the is_log_batch flag +// Tests the end-to-end process of creating a checkpoint stream. +// Verifies that: +// - The checkpoint file is read and produces batches containing references to sidecar files. +// - As sidecar references are present, the corresponding sidecar files are processed correctly. +// - Batches from both the checkpoint file and sidecar files are returned. +// - Each returned batch is correctly flagged with is_log_batch set to false #[test] fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batches( ) -> DeltaResult<()> { @@ -1108,12 +1112,12 @@ fn test_create_checkpoint_stream_reads_checkpoint_file_and_returns_sidecar_batch add_sidecar_to_store( &store, - add_batch_simple(get_log_schema().clone()), + add_batch_simple(get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?), "sidecarfile1.parquet", )?; add_sidecar_to_store( &store, - add_batch_with_remove(get_log_schema().clone()), + add_batch_with_remove(get_log_schema().project(&[ADD_NAME, REMOVE_NAME])?), "sidecarfile2.parquet", )?;