Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: make ExpressionHandler::get_evaluator fallible #577

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions kernel/src/scan/data_skipping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::borrow::Cow;
use std::cmp::Ordering;
use std::sync::{Arc, LazyLock};

use tracing::debug;
use tracing::{debug, warn};

use crate::actions::get_log_add_schema;
use crate::actions::visitors::SelectionVectorVisitor;
Expand Down Expand Up @@ -107,20 +107,32 @@ impl DataSkippingFilter {
STATS_EXPR.clone(),
DataType::STRING,
)
.map_err(|e| {
warn!("Failed to create stats selector evaluator: {}", e);
e
})
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
.map_err(|e| {
warn!("Failed to create stats selector evaluator: {}", e);
e
})
.inspect_err(|e| warn!("Failed to create stats selector evaluator: {e}"))

(more below)

.ok()?;

let skipping_evaluator = engine
.get_expression_handler()
.get_evaluator(
stats_schema.clone(),
Expr::struct_from([as_data_skipping_predicate(predicate, false)?]),
Expr::struct_from([as_data_skipping_predicate(&predicate, false)?]),
PREDICATE_SCHEMA.clone(),
)
.map_err(|e| {
warn!("Failed to create skipping evaluator: {}", e);
e
})
.ok()?;

let filter_evaluator = engine
.get_expression_handler()
.get_evaluator(stats_schema.clone(), FILTER_EXPR.clone(), DataType::BOOLEAN)
.map_err(|e| {
warn!("Failed to create filter evaluator: {}", e);
e
})
.ok()?;

Some(Self {
Expand Down
39 changes: 16 additions & 23 deletions kernel/src/scan/log_replay.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::clone::Clone;
use std::collections::HashSet;
use std::iter;
use std::sync::{Arc, LazyLock};

use tracing::debug;
Expand Down Expand Up @@ -239,29 +238,21 @@ impl LogReplayScanner {
pub fn scan_action_iter(
engine: &dyn Engine,
action_iter: impl Iterator<Item = DeltaResult<(Box<dyn EngineData>, bool)>> + 'static,
table_schema: &SchemaRef,
predicate: Option<ExpressionRef>,
) -> Box<dyn Iterator<Item = DeltaResult<ScanData>>> {
let mut log_scanner = LogReplayScanner::new(engine, table_schema, predicate);
match engine.get_expression_handler().get_evaluator(
physical_predicate: Option<(ExpressionRef, SchemaRef)>,
) -> DeltaResult<impl Iterator<Item = DeltaResult<ScanData>>> {
let mut log_scanner = LogReplayScanner::new(engine, physical_predicate);
let add_transform = engine.get_expression_handler().get_evaluator(
get_log_add_schema().clone(),
get_add_transform_expr(),
SCAN_ROW_DATATYPE.clone(),
) {
Ok(add_transform) => Box::new(
action_iter
.map(move |action_res| {
let (batch, is_log_batch) = action_res?;
log_scanner.process_scan_batch(
add_transform.as_ref(),
batch.as_ref(),
is_log_batch,
)
})
.filter(|res| res.as_ref().map_or(true, |(_, sv)| sv.contains(&true))),
),
Err(e) => Box::new(iter::once(Err(e))),
}
)?;

Ok(action_iter
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: this is very subjective, but in general I find this sort of multi-line, multi-operation `Ok(...)` (or `Some(...)`, etc.) unnecessarily hard to read because of all the nesting. Factoring the expression out as a named value is easier to read (and to change later):

let actions = action_iter
      ...
    .filter(...);
Ok(actions)

Wrapping a single function call or struct creation isn't so bad, because conceptually only one thing is happening, e.g. this code above is not particularly difficult to read:

Ok(Arc::new(DefaultExpressionEvaluator {
    ...
}))

rustfmt seems to take a similar stance on monadic chains -- it will almost always force newlines between chained function calls if more than one of the functions takes args, or once the chain gets long (well before the 100-char line limit).

.map(move |action_res| {
let (batch, is_log_batch) = action_res?;
log_scanner.process_scan_batch(add_transform.as_ref(), batch.as_ref(), is_log_batch)
})
.filter(|res| res.as_ref().map_or(true, |(_, sv)| sv.contains(&true))))
}

#[cfg(test)]
Expand Down Expand Up @@ -301,7 +292,8 @@ mod tests {
&[true, false],
(),
validate_simple,
);
)
.unwrap();
}

#[test]
Expand All @@ -311,6 +303,7 @@ mod tests {
&[false, false, true, false],
(),
validate_simple,
);
)
.unwrap();
}
}
18 changes: 10 additions & 8 deletions kernel/src/scan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -381,11 +381,13 @@ impl Scan {
PhysicalPredicate::Some(predicate, schema) => Some((predicate, schema)),
PhysicalPredicate::None => None,
};

let it = scan_action_iter(
engine,
self.replay_for_scan_data(engine)?,
physical_predicate,
);
)?;

Ok(Some(it).into_iter().flatten())
}

Expand Down Expand Up @@ -701,7 +703,7 @@ pub(crate) mod test_utils {
sync::{json::SyncJsonHandler, SyncEngine},
},
scan::log_replay::scan_action_iter,
EngineData, JsonHandler,
DeltaResult, EngineData, JsonHandler,
};

use super::state::ScanCallback;
Expand Down Expand Up @@ -753,26 +755,26 @@ pub(crate) mod test_utils {
expected_sel_vec: &[bool],
context: T,
validate_callback: ScanCallback<T>,
) {
) -> DeltaResult<()> {
let iter = scan_action_iter(
&SyncEngine::new(),
batch.into_iter().map(|batch| Ok((batch as _, true))),
None,
);
)?;
let mut batch_count = 0;
for res in iter {
let (batch, sel) = res.unwrap();
assert_eq!(sel, expected_sel_vec);
let (batch, sel) = res?;
assert_eq!(sel.as_slice(), expected_sel_vec);
crate::scan::state::visit_scan_files(
batch.as_ref(),
&sel,
context.clone(),
validate_callback,
)
.unwrap();
)?;
batch_count += 1;
}
assert_eq!(batch_count, 1);
Ok(())
}
}

Expand Down
3 changes: 2 additions & 1 deletion kernel/src/scan/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,7 @@ mod tests {
&[true, false],
context,
validate_visit,
);
)
.unwrap();
}
}
13 changes: 5 additions & 8 deletions kernel/src/table_changes/log_replay.rs
Original file line number Diff line number Diff line change
Expand Up @@ -235,14 +235,11 @@ impl LogReplayScanner {
.version
.try_into()
.map_err(|_| Error::generic("Failed to convert commit version to i64"))?;
let evaluator = engine
.get_expression_handler()
.get_evaluator(
get_log_add_schema().clone(),
cdf_scan_row_expression(timestamp, commit_version),
cdf_scan_row_schema().into(),
)?;

let evaluator = engine.get_expression_handler().get_evaluator(
get_log_add_schema().clone(),
cdf_scan_row_expression(timestamp, commit_version),
cdf_scan_row_schema().into(),
)?;
let result = action_iter.map(move |actions| -> DeltaResult<_> {
let actions = actions?;

Expand Down
2 changes: 1 addition & 1 deletion kernel/src/table_changes/scan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ fn read_scan_file(
physical_schema.clone(),
physical_to_logical_expr,
global_state.logical_schema.clone().into(),
);
)?;

let table_root = Url::parse(&global_state.table_root)?;
let location = table_root.join(&scan_file.path)?;
Expand Down
Loading