From cf11a700eb6a5385a6ebade2b92c684380940296 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Wed, 21 Feb 2024 10:53:02 -0500 Subject: [PATCH 01/45] Fix toml format script. (#9309) --- ci/scripts/rust_toml_fmt.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ci/scripts/rust_toml_fmt.sh b/ci/scripts/rust_toml_fmt.sh index 9204a562a3fe..393ad55f4168 100755 --- a/ci/scripts/rust_toml_fmt.sh +++ b/ci/scripts/rust_toml_fmt.sh @@ -21,5 +21,4 @@ # without overwritng the file. If any error occur, you may want to # rerun `taplo format` to fix the formatting automatically. set -ex -taplo format -done +taplo format --check From 91f3eb2e5430d23e2b551e66732bec1a3a575971 Mon Sep 17 00:00:00 2001 From: SteveLauC Date: Thu, 22 Feb 2024 10:53:36 +0800 Subject: [PATCH 02/45] docs: update contributor guide (format toml/inte test) (#9301) --- docs/source/contributor-guide/index.md | 27 +++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/docs/source/contributor-guide/index.md b/docs/source/contributor-guide/index.md index 3b0d4932c2cf..839055f6ed7a 100644 --- a/docs/source/contributor-guide/index.md +++ b/docs/source/contributor-guide/index.md @@ -202,7 +202,7 @@ There are several tests of the public interface of the DataFusion library in the You can run these tests individually using `cargo` as normal command such as ```shell -cargo test -p datafusion --test dataframe +cargo test -p datafusion --test parquet_exec ``` ## Benchmarks @@ -338,3 +338,28 @@ After you've confirmed your prettier version, you can format all the `.md` files ```bash prettier -w {datafusion,datafusion-cli,datafusion-examples,dev,docs}/**/*.md ``` + +## How to format `.toml` files + +We use `taplo` to format `.toml` files. + +For Rust developers, you can install it via: + +```sh +cargo install taplo-cli --locked +``` + +> Refer to the [Installation section][doc] on other ways to install it. 
+> +> [doc]: https://taplo.tamasfe.dev/cli/installation/binary.html + +```bash +$ taplo --version +taplo 0.9.0 +``` + +After you've confirmed your `taplo` version, you can format all the `.toml` files: + +```bash +taplo fmt +``` From 10000fb5bdb88a5e7a5cbfb81f3bd5d028bbeea7 Mon Sep 17 00:00:00 2001 From: metesynnada <100111937+metesynnada@users.noreply.github.com> Date: Thu, 22 Feb 2024 09:56:23 +0300 Subject: [PATCH 03/45] Delete docs.yaml --- .github/workflows/docs.yaml | 64 ------------------------------------- 1 file changed, 64 deletions(-) delete mode 100644 .github/workflows/docs.yaml diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml deleted file mode 100644 index ab6a615ab60b..000000000000 --- a/.github/workflows/docs.yaml +++ /dev/null @@ -1,64 +0,0 @@ -on: - push: - branches: - - main - paths: - - .asf.yaml - - .github/workflows/docs.yaml - - docs/** - -name: Deploy DataFusion site - -jobs: - build-docs: - name: Build docs - runs-on: ubuntu-latest - steps: - - name: Checkout docs sources - uses: actions/checkout@v4 - - - name: Checkout asf-site branch - uses: actions/checkout@v4 - with: - ref: asf-site - path: asf-site - - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - - name: Install dependencies - run: | - set -x - python3 -m venv venv - source venv/bin/activate - pip install -r docs/requirements.txt - - - name: Build docs - run: | - set -x - source venv/bin/activate - cd docs - ./build.sh - - - name: Copy & push the generated HTML - run: | - set -x - cd asf-site/ - rsync \ - -a \ - --delete \ - --exclude '/.git/' \ - ../docs/build/html/ \ - ./ - cp ../.asf.yaml . - touch .nojekyll - git status --porcelain - if [ "$(git status --porcelain)" != "" ]; then - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - git add --all - git commit -m 'Publish built docs triggered by ${{ github.sha }}' - git push || git push --force - fi From 02c948d91b82f3e8988fafd12a072a476500c13d Mon Sep 17 00:00:00 2001 From: Mustafa Akur <106137913+mustafasrepo@users.noreply.github.com> Date: Thu, 22 Feb 2024 20:05:13 +0300 Subject: [PATCH 04/45] [MINOR]: Limit stream replace with slice (#9303) * Initial commit * Minor changes --- datafusion/physical-plan/src/limit.rs | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index c31d5f62c726..417bc4cf977b 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -29,9 +29,8 @@ use crate::{ DisplayFormatType, Distribution, EquivalenceProperties, ExecutionPlan, Partitioning, }; -use arrow::array::ArrayRef; use arrow::datatypes::SchemaRef; -use arrow::record_batch::{RecordBatch, RecordBatchOptions}; +use arrow::record_batch::RecordBatch; use datafusion_common::stats::Precision; use datafusion_common::{internal_err, DataFusionError, Result}; use datafusion_execution::TaskContext; @@ -507,26 +506,15 @@ impl LimitStream { // self.fetch -= batch.num_rows(); Some(batch) - } else { + } else if batch.num_rows() >= self.fetch { let batch_rows = self.fetch; self.fetch = 0; self.input = None; // clear input so it can be dropped early - let limited_columns: Vec = batch - .columns() - .iter() - .map(|col| col.slice(0, col.len().min(batch_rows))) - .collect(); - let options = - RecordBatchOptions::new().with_row_count(Option::from(batch_rows)); - Some( - RecordBatch::try_new_with_options( - 
batch.schema(), - limited_columns, - &options, - ) - .unwrap(), - ) + // It is guaranteed that batch_rows is <= batch.num_rows + Some(batch.slice(0, batch_rows)) + } else { + unreachable!() } } } @@ -575,6 +563,7 @@ mod tests { use crate::{common, test}; use crate::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}; + use arrow_array::RecordBatchOptions; use arrow_schema::Schema; use datafusion_physical_expr::expressions::col; use datafusion_physical_expr::PhysicalExpr; From a851ecf1cc24a6b867d40087d8e890b9307137c1 Mon Sep 17 00:00:00 2001 From: comphead Date: Thu, 22 Feb 2024 19:16:30 -0800 Subject: [PATCH 05/45] Support IGNORE NULLS for LAG window function (#9221) * WIP lag/lead ignore nulls * Support IGNORE NULLS for LAG function * fmt * comments * remove comments * Add new tests, minor changes, trigger evalaute_all * Make algorithm pruning friendly --------- Co-authored-by: Mustafa Akur --- datafusion/core/src/dataframe/mod.rs | 1 + .../core/src/physical_optimizer/test_utils.rs | 1 + datafusion/core/src/physical_planner.rs | 6 + datafusion/core/tests/dataframe/mod.rs | 1 + .../core/tests/fuzz_cases/window_fuzz.rs | 3 + datafusion/expr/src/expr.rs | 18 +++ datafusion/expr/src/tree_node/expr.rs | 2 + datafusion/expr/src/udwf.rs | 1 + datafusion/expr/src/utils.rs | 10 ++ .../src/analyzer/count_wildcard_rule.rs | 3 + .../optimizer/src/analyzer/type_coercion.rs | 2 + .../optimizer/src/push_down_projection.rs | 2 + .../physical-expr/src/window/lead_lag.rs | 88 ++++++++++++-- datafusion/physical-plan/src/windows/mod.rs | 8 +- .../proto/src/logical_plan/from_proto.rs | 6 + datafusion/proto/src/logical_plan/to_proto.rs | 2 + .../proto/src/physical_plan/from_proto.rs | 1 + .../tests/cases/roundtrip_logical_plan.rs | 6 + datafusion/sql/src/expr/function.rs | 16 ++- datafusion/sqllogictest/test_files/window.slt | 107 ++++++++++++++++++ .../substrait/src/logical_plan/consumer.rs | 1 + .../substrait/src/logical_plan/producer.rs | 1 + 22 files changed, 272 insertions(+), 14 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 4ec16ac942b2..e407c477ae4c 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -1685,6 +1685,7 @@ mod tests { vec![col("aggregate_test_100.c2")], vec![], WindowFrame::new(None), + None, )); let t2 = t.select(vec![col("c1"), first_row])?; let plan = t2.plan.clone(); diff --git a/datafusion/core/src/physical_optimizer/test_utils.rs b/datafusion/core/src/physical_optimizer/test_utils.rs index ca7fb78d21b1..3898fb6345f0 100644 --- a/datafusion/core/src/physical_optimizer/test_utils.rs +++ b/datafusion/core/src/physical_optimizer/test_utils.rs @@ -245,6 +245,7 @@ pub fn bounded_window_exec( &sort_exprs, Arc::new(WindowFrame::new(Some(false))), schema.as_ref(), + false, ) .unwrap()], input.clone(), diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index dabf0a91b2d3..23ac7e08cad8 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -100,6 +100,7 @@ use futures::future::BoxFuture; use futures::{FutureExt, StreamExt, TryStreamExt}; use itertools::{multiunzip, Itertools}; use log::{debug, trace}; +use sqlparser::ast::NullTreatment; fn create_function_physical_name( fun: &str, @@ -1581,6 +1582,7 @@ pub fn create_window_expr_with_name( partition_by, order_by, window_frame, + null_treatment, }) => { let args = args .iter() @@ -1605,6 +1607,9 @@ pub fn create_window_expr_with_name( } let 
window_frame = Arc::new(window_frame.clone()); + let ignore_nulls = null_treatment + .unwrap_or(sqlparser::ast::NullTreatment::RespectNulls) + == NullTreatment::IgnoreNulls; windows::create_window_expr( fun, name, @@ -1613,6 +1618,7 @@ pub fn create_window_expr_with_name( &order_by, window_frame, physical_input_schema, + ignore_nulls, ) } other => plan_err!("Invalid window expression '{other:?}'"), diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index f650e9e39d88..b08b2b8fc7a2 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -182,6 +182,7 @@ async fn test_count_wildcard_on_window() -> Result<()> { WindowFrameBound::Preceding(ScalarValue::UInt32(Some(6))), WindowFrameBound::Following(ScalarValue::UInt32(Some(2))), ), + None, ))])? .explain(false, false)? .collect() diff --git a/datafusion/core/tests/fuzz_cases/window_fuzz.rs b/datafusion/core/tests/fuzz_cases/window_fuzz.rs index d22d0c0f2ee0..609d26c9c253 100644 --- a/datafusion/core/tests/fuzz_cases/window_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/window_fuzz.rs @@ -281,6 +281,7 @@ async fn bounded_window_causal_non_causal() -> Result<()> { &orderby_exprs, Arc::new(window_frame), schema.as_ref(), + false, )?; let running_window_exec = Arc::new(BoundedWindowAggExec::try_new( vec![window_expr], @@ -642,6 +643,7 @@ async fn run_window_test( &orderby_exprs, Arc::new(window_frame.clone()), schema.as_ref(), + false, ) .unwrap()], exec1, @@ -664,6 +666,7 @@ async fn run_window_test( &orderby_exprs, Arc::new(window_frame.clone()), schema.as_ref(), + false, ) .unwrap()], exec2, diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 09de4b708de9..f40ccb6cdb58 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -30,6 +30,7 @@ use arrow::datatypes::DataType; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{internal_err, DFSchema, OwnedTableReference}; use datafusion_common::{plan_err, Column, DataFusionError, Result, ScalarValue}; +use sqlparser::ast::NullTreatment; use std::collections::HashSet; use std::fmt; use std::fmt::{Display, Formatter, Write}; @@ -646,6 +647,7 @@ pub struct WindowFunction { pub order_by: Vec, /// Window frame pub window_frame: window_frame::WindowFrame, + pub null_treatment: Option, } impl WindowFunction { @@ -656,6 +658,7 @@ impl WindowFunction { partition_by: Vec, order_by: Vec, window_frame: window_frame::WindowFrame, + null_treatment: Option, ) -> Self { Self { fun, @@ -663,6 +666,7 @@ impl WindowFunction { partition_by, order_by, window_frame, + null_treatment, } } } @@ -1440,8 +1444,14 @@ impl fmt::Display for Expr { partition_by, order_by, window_frame, + null_treatment, }) => { fmt_function(f, &fun.to_string(), false, args, true)?; + + if let Some(nt) = null_treatment { + write!(f, "{}", nt)?; + } + if !partition_by.is_empty() { write!(f, " PARTITION BY [{}]", expr_vec_fmt!(partition_by))?; } @@ -1768,15 +1778,23 @@ fn create_name(e: &Expr) -> Result { window_frame, partition_by, order_by, + null_treatment, }) => { let mut parts: Vec = vec![create_function_name(&fun.to_string(), false, args)?]; + + if let Some(nt) = null_treatment { + parts.push(format!("{}", nt)); + } + if !partition_by.is_empty() { parts.push(format!("PARTITION BY [{}]", expr_vec_fmt!(partition_by))); } + if !order_by.is_empty() { parts.push(format!("ORDER BY [{}]", expr_vec_fmt!(order_by))); } + parts.push(format!("{window_frame}")); Ok(parts.join(" ")) } diff --git 
a/datafusion/expr/src/tree_node/expr.rs b/datafusion/expr/src/tree_node/expr.rs index add15b3d7ad7..def25ed9242f 100644 --- a/datafusion/expr/src/tree_node/expr.rs +++ b/datafusion/expr/src/tree_node/expr.rs @@ -283,12 +283,14 @@ impl TreeNode for Expr { partition_by, order_by, window_frame, + null_treatment, }) => Expr::WindowFunction(WindowFunction::new( fun, transform_vec(args, &mut transform)?, transform_vec(partition_by, &mut transform)?, transform_vec(order_by, &mut transform)?, window_frame, + null_treatment, )), Expr::AggregateFunction(AggregateFunction { args, diff --git a/datafusion/expr/src/udwf.rs b/datafusion/expr/src/udwf.rs index 953483408865..7e3eb6c001a1 100644 --- a/datafusion/expr/src/udwf.rs +++ b/datafusion/expr/src/udwf.rs @@ -130,6 +130,7 @@ impl WindowUDF { partition_by, order_by, window_frame, + null_treatment: None, }) } diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index e855554f3687..2fda81d8896f 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -1255,6 +1255,7 @@ mod tests { vec![], vec![], WindowFrame::new(None), + None, )); let max2 = Expr::WindowFunction(expr::WindowFunction::new( WindowFunctionDefinition::AggregateFunction(AggregateFunction::Max), @@ -1262,6 +1263,7 @@ mod tests { vec![], vec![], WindowFrame::new(None), + None, )); let min3 = Expr::WindowFunction(expr::WindowFunction::new( WindowFunctionDefinition::AggregateFunction(AggregateFunction::Min), @@ -1269,6 +1271,7 @@ mod tests { vec![], vec![], WindowFrame::new(None), + None, )); let sum4 = Expr::WindowFunction(expr::WindowFunction::new( WindowFunctionDefinition::AggregateFunction(AggregateFunction::Sum), @@ -1276,6 +1279,7 @@ mod tests { vec![], vec![], WindowFrame::new(None), + None, )); let exprs = &[max1.clone(), max2.clone(), min3.clone(), sum4.clone()]; let result = group_window_expr_by_sort_keys(exprs.to_vec())?; @@ -1298,6 +1302,7 @@ mod tests { vec![], vec![age_asc.clone(), name_desc.clone()], WindowFrame::new(Some(false)), + None, )); let max2 = Expr::WindowFunction(expr::WindowFunction::new( WindowFunctionDefinition::AggregateFunction(AggregateFunction::Max), @@ -1305,6 +1310,7 @@ mod tests { vec![], vec![], WindowFrame::new(None), + None, )); let min3 = Expr::WindowFunction(expr::WindowFunction::new( WindowFunctionDefinition::AggregateFunction(AggregateFunction::Min), @@ -1312,6 +1318,7 @@ mod tests { vec![], vec![age_asc.clone(), name_desc.clone()], WindowFrame::new(Some(false)), + None, )); let sum4 = Expr::WindowFunction(expr::WindowFunction::new( WindowFunctionDefinition::AggregateFunction(AggregateFunction::Sum), @@ -1319,6 +1326,7 @@ mod tests { vec![], vec![name_desc.clone(), age_asc.clone(), created_at_desc.clone()], WindowFrame::new(Some(false)), + None, )); // FIXME use as_ref let exprs = &[max1.clone(), max2.clone(), min3.clone(), sum4.clone()]; @@ -1353,6 +1361,7 @@ mod tests { Expr::Sort(expr::Sort::new(Box::new(col("name")), false, true)), ], WindowFrame::new(Some(false)), + None, )), Expr::WindowFunction(expr::WindowFunction::new( WindowFunctionDefinition::AggregateFunction(AggregateFunction::Sum), @@ -1364,6 +1373,7 @@ mod tests { Expr::Sort(expr::Sort::new(Box::new(col("created_at")), false, true)), ], WindowFrame::new(Some(false)), + None, )), ]; let expected = vec![ diff --git a/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs index 35a859783239..9242e68562c6 100644 --- a/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs +++ 
b/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs @@ -128,6 +128,7 @@ impl TreeNodeRewriter for CountWildcardRewriter { partition_by, order_by, window_frame, + null_treatment, }) if args.len() == 1 => match args[0] { Expr::Wildcard { qualifier: None } => { Expr::WindowFunction(expr::WindowFunction { @@ -138,6 +139,7 @@ impl TreeNodeRewriter for CountWildcardRewriter { partition_by, order_by, window_frame, + null_treatment, }) } @@ -351,6 +353,7 @@ mod tests { WindowFrameBound::Preceding(ScalarValue::UInt32(Some(6))), WindowFrameBound::Following(ScalarValue::UInt32(Some(2))), ), + None, ))])? .project(vec![count(wildcard())])? .build()?; diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index fba77047dd74..8cdb4d7dbdf6 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -392,6 +392,7 @@ impl TreeNodeRewriter for TypeCoercionRewriter { partition_by, order_by, window_frame, + null_treatment, }) => { let window_frame = coerce_window_frame(window_frame, &self.schema, &order_by)?; @@ -414,6 +415,7 @@ impl TreeNodeRewriter for TypeCoercionRewriter { partition_by, order_by, window_frame, + null_treatment, )); Ok(expr) } diff --git a/datafusion/optimizer/src/push_down_projection.rs b/datafusion/optimizer/src/push_down_projection.rs index 6a003ecb5fa8..8b7a9148b590 100644 --- a/datafusion/optimizer/src/push_down_projection.rs +++ b/datafusion/optimizer/src/push_down_projection.rs @@ -587,6 +587,7 @@ mod tests { vec![col("test.b")], vec![], WindowFrame::new(None), + None, )); let max2 = Expr::WindowFunction(expr::WindowFunction::new( @@ -595,6 +596,7 @@ mod tests { vec![], vec![], WindowFrame::new(None), + None, )); let col1 = col(max1.display_name()?); let col2 = col(max2.display_name()?); diff --git a/datafusion/physical-expr/src/window/lead_lag.rs b/datafusion/physical-expr/src/window/lead_lag.rs index 6a33f26ca126..6e1aad575f6a 100644 --- a/datafusion/physical-expr/src/window/lead_lag.rs +++ b/datafusion/physical-expr/src/window/lead_lag.rs @@ -23,10 +23,14 @@ use crate::PhysicalExpr; use arrow::array::ArrayRef; use arrow::compute::cast; use arrow::datatypes::{DataType, Field}; -use datafusion_common::{arrow_datafusion_err, DataFusionError, Result, ScalarValue}; +use arrow_array::Array; +use datafusion_common::{ + arrow_datafusion_err, exec_datafusion_err, DataFusionError, Result, ScalarValue, +}; use datafusion_expr::PartitionEvaluator; use std::any::Any; use std::cmp::min; +use std::collections::VecDeque; use std::ops::{Neg, Range}; use std::sync::Arc; @@ -39,6 +43,7 @@ pub struct WindowShift { shift_offset: i64, expr: Arc, default_value: Option, + ignore_nulls: bool, } impl WindowShift { @@ -60,6 +65,7 @@ pub fn lead( expr: Arc, shift_offset: Option, default_value: Option, + ignore_nulls: bool, ) -> WindowShift { WindowShift { name, @@ -67,6 +73,7 @@ pub fn lead( shift_offset: shift_offset.map(|v| v.neg()).unwrap_or(-1), expr, default_value, + ignore_nulls, } } @@ -77,6 +84,7 @@ pub fn lag( expr: Arc, shift_offset: Option, default_value: Option, + ignore_nulls: bool, ) -> WindowShift { WindowShift { name, @@ -84,6 +92,7 @@ pub fn lag( shift_offset: shift_offset.unwrap_or(1), expr, default_value, + ignore_nulls, } } @@ -110,6 +119,8 @@ impl BuiltInWindowFunctionExpr for WindowShift { Ok(Box::new(WindowShiftEvaluator { shift_offset: self.shift_offset, default_value: self.default_value.clone(), + ignore_nulls: self.ignore_nulls, + non_null_offsets: 
VecDeque::new(), })) } @@ -120,6 +131,7 @@ impl BuiltInWindowFunctionExpr for WindowShift { shift_offset: -self.shift_offset, expr: self.expr.clone(), default_value: self.default_value.clone(), + ignore_nulls: self.ignore_nulls, })) } } @@ -128,6 +140,16 @@ impl BuiltInWindowFunctionExpr for WindowShift { pub(crate) struct WindowShiftEvaluator { shift_offset: i64, default_value: Option, + ignore_nulls: bool, + // VecDeque contains offset values that between non-null entries + non_null_offsets: VecDeque, +} + +impl WindowShiftEvaluator { + fn is_lag(&self) -> bool { + // Mode is LAG, when shift_offset is positive + self.shift_offset > 0 + } } fn create_empty_array( @@ -182,9 +204,13 @@ fn shift_with_default_value( impl PartitionEvaluator for WindowShiftEvaluator { fn get_range(&self, idx: usize, n_rows: usize) -> Result> { - if self.shift_offset > 0 { - let offset = self.shift_offset as usize; - let start = idx.saturating_sub(offset); + if self.is_lag() { + let start = if self.non_null_offsets.len() == self.shift_offset as usize { + let offset: usize = self.non_null_offsets.iter().sum(); + idx.saturating_sub(offset + 1) + } else { + 0 + }; let end = idx + 1; Ok(Range { start, end }) } else { @@ -196,7 +222,7 @@ impl PartitionEvaluator for WindowShiftEvaluator { fn is_causal(&self) -> bool { // Lagging windows are causal by definition: - self.shift_offset > 0 + self.is_lag() } fn evaluate( @@ -204,17 +230,57 @@ impl PartitionEvaluator for WindowShiftEvaluator { values: &[ArrayRef], range: &Range, ) -> Result { + // TODO: try to get rid of i64 usize conversion + // TODO: do not recalculate default value every call + // TODO: support LEAD mode for IGNORE NULLS let array = &values[0]; let dtype = array.data_type(); + let len = array.len() as i64; // LAG mode - let idx = if self.shift_offset > 0 { + let mut idx = if self.is_lag() { range.end as i64 - self.shift_offset - 1 } else { // LEAD mode range.start as i64 - self.shift_offset }; - if idx < 0 || idx as usize >= array.len() { + // Support LAG only for now, as LEAD requires some brainstorm first + // LAG with IGNORE NULLS calculated as the current row index - offset, but only for non-NULL rows + // If current row index points to NULL value the row is NOT counted + if self.ignore_nulls && self.is_lag() { + // Find the nonNULL row index that shifted by offset comparing to current row index + idx = if self.non_null_offsets.len() == self.shift_offset as usize { + let total_offset: usize = self.non_null_offsets.iter().sum(); + (range.end - 1 - total_offset) as i64 + } else { + -1 + }; + + // Keep track of offset values between non-null entries + if array.is_valid(range.end - 1) { + // Non-null add new offset + self.non_null_offsets.push_back(1); + if self.non_null_offsets.len() > self.shift_offset as usize { + // WE do not need to keep track of more than `lag number of offset` values. + self.non_null_offsets.pop_front(); + } + } else if !self.non_null_offsets.is_empty() { + // Entry is null, increment offset value of the last entry. + let end_idx = self.non_null_offsets.len() - 1; + self.non_null_offsets[end_idx] += 1; + } + } else if self.ignore_nulls && !self.is_lag() { + // IGNORE NULLS and LEAD mode. 
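// LEAD with IGNORE NULLS would have to look *ahead* past NULL rows that
// this streaming evaluator has not received yet, so the incremental
// `non_null_offsets` bookkeeping used for LAG above does not carry over
// (see the TODO above); returning an execution error is safer than
// silently producing wrong results.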
+ return Err(exec_datafusion_err!( + "IGNORE NULLS mode for LEAD is not supported for BoundedWindowAggExec" + )); + } + + // Set the default value if + // - index is out of window bounds + // OR + // - ignore nulls mode and current value is null and is within window bounds + if idx < 0 || idx >= len || (self.ignore_nulls && array.is_null(idx as usize)) { get_default_value(self.default_value.as_ref(), dtype) } else { ScalarValue::try_from_array(array, idx as usize) @@ -226,6 +292,11 @@ impl PartitionEvaluator for WindowShiftEvaluator { values: &[ArrayRef], _num_rows: usize, ) -> Result { + if self.ignore_nulls { + return Err(exec_datafusion_err!( + "IGNORE NULLS mode for LAG and LEAD is not supported for WindowAggExec" + )); + } // LEAD, LAG window functions take single column, values will have size 1 let value = &values[0]; shift_with_default_value(value, self.shift_offset, self.default_value.as_ref()) @@ -279,6 +350,7 @@ mod tests { Arc::new(Column::new("c3", 0)), None, None, + false, ), [ Some(-2), @@ -301,6 +373,7 @@ mod tests { Arc::new(Column::new("c3", 0)), None, None, + false, ), [ None, @@ -323,6 +396,7 @@ mod tests { Arc::new(Column::new("c3", 0)), None, Some(ScalarValue::Int32(Some(100))), + false, ), [ Some(100), diff --git a/datafusion/physical-plan/src/windows/mod.rs b/datafusion/physical-plan/src/windows/mod.rs index 693d20e90a66..bf6ed925356c 100644 --- a/datafusion/physical-plan/src/windows/mod.rs +++ b/datafusion/physical-plan/src/windows/mod.rs @@ -55,6 +55,7 @@ pub use datafusion_physical_expr::window::{ }; /// Create a physical expression for window function +#[allow(clippy::too_many_arguments)] pub fn create_window_expr( fun: &WindowFunctionDefinition, name: String, @@ -63,6 +64,7 @@ pub fn create_window_expr( order_by: &[PhysicalSortExpr], window_frame: Arc, input_schema: &Schema, + ignore_nulls: bool, ) -> Result> { Ok(match fun { WindowFunctionDefinition::AggregateFunction(fun) => { @@ -83,7 +85,7 @@ pub fn create_window_expr( } WindowFunctionDefinition::BuiltInWindowFunction(fun) => { Arc::new(BuiltInWindowExpr::new( - create_built_in_window_expr(fun, args, input_schema, name)?, + create_built_in_window_expr(fun, args, input_schema, name, ignore_nulls)?, partition_by, order_by, window_frame, @@ -159,6 +161,7 @@ fn create_built_in_window_expr( args: &[Arc], input_schema: &Schema, name: String, + ignore_nulls: bool, ) -> Result> { // need to get the types into an owned vec for some reason let input_types: Vec<_> = args @@ -208,6 +211,7 @@ fn create_built_in_window_expr( arg, shift_offset, default_value, + ignore_nulls, )) } BuiltInWindowFunction::Lead => { @@ -222,6 +226,7 @@ fn create_built_in_window_expr( arg, shift_offset, default_value, + ignore_nulls, )) } BuiltInWindowFunction::NthValue => { @@ -671,6 +676,7 @@ mod tests { &[], Arc::new(WindowFrame::new(None)), schema.as_ref(), + false, )?], blocking_exec, vec![], diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index f1ee84a8221d..2554018a9273 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -1100,6 +1100,8 @@ pub fn parse_expr( "missing window frame during deserialization".to_string(), ) })?; + // TODO: support proto for null treatment + let null_treatment = None; regularize_window_order_by(&window_frame, &mut order_by)?; match window_function { @@ -1114,6 +1116,7 @@ pub fn parse_expr( partition_by, order_by, window_frame, + None ))) } 
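// `null_treatment` is currently hardcoded to `None` above because the
// protobuf definition does not carry it yet (see the TODO); only the
// built-in window function arm below threads the variable through, while
// the aggregate, UDAF and UDWF arms pass `None` explicitly.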
window_expr_node::WindowFunction::BuiltInFunction(i) => { @@ -1133,6 +1136,7 @@ pub fn parse_expr( partition_by, order_by, window_frame, + null_treatment ))) } window_expr_node::WindowFunction::Udaf(udaf_name) => { @@ -1148,6 +1152,7 @@ pub fn parse_expr( partition_by, order_by, window_frame, + None, ))) } window_expr_node::WindowFunction::Udwf(udwf_name) => { @@ -1163,6 +1168,7 @@ pub fn parse_expr( partition_by, order_by, window_frame, + None, ))) } } diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index a6348e909cb0..ccadbb217a58 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -606,6 +606,8 @@ impl TryFrom<&Expr> for protobuf::LogicalExprNode { ref partition_by, ref order_by, ref window_frame, + // TODO: support null treatment in proto + null_treatment: _, }) => { let window_function = match fun { WindowFunctionDefinition::AggregateFunction(fun) => { diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index 628ee5ad9b7a..af0aa485c348 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -176,6 +176,7 @@ pub fn parse_physical_window_expr( &order_by, Arc::new(window_frame), input_schema, + false, ) } diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 81f59975476f..6ca757908159 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -1718,6 +1718,7 @@ fn roundtrip_window() { vec![col("col1")], vec![col("col2")], WindowFrame::new(Some(false)), + None, )); // 2. with default window_frame @@ -1729,6 +1730,7 @@ fn roundtrip_window() { vec![col("col1")], vec![col("col2")], WindowFrame::new(Some(false)), + None, )); // 3. with window_frame with row numbers @@ -1746,6 +1748,7 @@ fn roundtrip_window() { vec![col("col1")], vec![col("col2")], range_number_frame, + None, )); // 4. test with AggregateFunction @@ -1761,6 +1764,7 @@ fn roundtrip_window() { vec![col("col1")], vec![col("col2")], row_number_frame.clone(), + None, )); // 5. test with AggregateUDF @@ -1812,6 +1816,7 @@ fn roundtrip_window() { vec![col("col1")], vec![col("col2")], row_number_frame.clone(), + None, )); ctx.register_udaf(dummy_agg); @@ -1887,6 +1892,7 @@ fn roundtrip_window() { vec![col("col1")], vec![col("col2")], row_number_frame, + None, )); ctx.register_udwf(dummy_window_udf); diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index 64b8d6957d2b..f56138066cb6 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -52,8 +52,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { order_by, } = function; - if let Some(null_treatment) = null_treatment { - return not_impl_err!("Null treatment in aggregate functions is not supported: {null_treatment}"); + // If function is a window function (it has an OVER clause), + // it shouldn't have ordering requirement as function argument + // required ordering should be defined in OVER clause. 
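+ // IGNORE NULLS / RESPECT NULLS is likewise only meaningful together with
+ // an OVER clause, so the match below rejects it for plain aggregate calls.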
+ let is_function_window = over.is_some(); + + match null_treatment { + Some(null_treatment) if !is_function_window => return not_impl_err!("Null treatment in aggregate functions is not supported: {null_treatment}"), + _ => {} } let name = if name.0.len() > 1 { @@ -120,10 +126,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { return Ok(Expr::ScalarFunction(ScalarFunction::new(fun, args))); }; - // If function is a window function (it has an OVER clause), - // it shouldn't have ordering requirement as function argument - // required ordering should be defined in OVER clause. - let is_function_window = over.is_some(); if !order_by.is_empty() && is_function_window { return plan_err!( "Aggregate ORDER BY is not implemented for window functions" @@ -198,6 +200,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { partition_by, order_by, window_frame, + null_treatment, )) } _ => Expr::WindowFunction(expr::WindowFunction::new( @@ -206,6 +209,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { partition_by, order_by, window_frame, + null_treatment, )), }; return Ok(expr); diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index 9276f6e1e325..8d6b314747bb 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -4102,3 +4102,110 @@ ProjectionExec: expr=[ROW_NUMBER() PARTITION BY [a.a] ROWS BETWEEN UNBOUNDED PRE ----------CoalesceBatchesExec: target_batch_size=4096 ------------FilterExec: a@0 = 1 --------------MemoryExec: partitions=1, partition_sizes=[1] + +# LAG window function IGNORE/RESPECT NULLS support with ascending order and default offset 1 +query TTTTTT +select lag(a) ignore nulls over (order by id) as x, + lag(a, 1, null) ignore nulls over (order by id) as x1, + lag(a, 1, 'def') ignore nulls over (order by id) as x2, + lag(a) respect nulls over (order by id) as x3, + lag(a, 1, null) respect nulls over (order by id) as x4, + lag(a, 1, 'def') respect nulls over (order by id) as x5 +from (select 2 id, 'b' a union all select 1 id, null a union all select 3 id, null union all select 4 id, 'x') +---- +NULL NULL def NULL NULL def +NULL NULL def NULL NULL NULL +b b b b b b +b b b NULL NULL NULL + +# LAG window function IGNORE/RESPECT NULLS support with descending order and default offset 1 +query TTTTTT +select lag(a) ignore nulls over (order by id desc) as x, + lag(a, 1, null) ignore nulls over (order by id desc) as x1, + lag(a, 1, 'def') ignore nulls over (order by id desc) as x2, + lag(a) respect nulls over (order by id desc) as x3, + lag(a, 1, null) respect nulls over (order by id desc) as x4, + lag(a, 1, 'def') respect nulls over (order by id desc) as x5 +from (select 2 id, 'b' a union all select 1 id, null a union all select 3 id, null union all select 4 id, 'x') +---- +NULL NULL def NULL NULL def +x x x x x x +x x x NULL NULL NULL +b b b b b b + +# LAG window function IGNORE/RESPECT NULLS support with ascending order and nondefault offset +query TTTT +select lag(a, 2, null) ignore nulls over (order by id) as x1, + lag(a, 2, 'def') ignore nulls over (order by id) as x2, + lag(a, 2, null) respect nulls over (order by id) as x4, + lag(a, 2, 'def') respect nulls over (order by id) as x5 +from (select 2 id, 'b' a union all select 1 id, null a union all select 3 id, null union all select 4 id, 'x') +---- +NULL def NULL def +NULL def NULL def +NULL def NULL NULL +NULL def b b + +# LAG window function IGNORE/RESPECT NULLS support with descending order and nondefault offset +query TTTT 
+select lag(a, 2, null) ignore nulls over (order by id desc) as x1,
+       lag(a, 2, 'def') ignore nulls over (order by id desc) as x2,
+       lag(a, 2, null) respect nulls over (order by id desc) as x4,
+       lag(a, 2, 'def') respect nulls over (order by id desc) as x5
+from (select 2 id, 'b' a union all select 1 id, null a union all select 3 id, null union all select 4 id, 'x')
+----
+NULL def NULL def
+NULL def NULL def
+NULL def x x
+x x NULL NULL
+
+# LAG window function IGNORE/RESPECT NULLS support with descending order and nondefault offset.
+# To trigger WindowAggExec, we add a SUM window function whose frame spans the whole range.
+statement error Execution error: IGNORE NULLS mode for LAG and LEAD is not supported for WindowAggExec
+select lag(a, 2, null) ignore nulls over (order by id desc) as x1,
+       lag(a, 2, 'def') ignore nulls over (order by id desc) as x2,
+       lag(a, 2, null) respect nulls over (order by id desc) as x4,
+       lag(a, 2, 'def') respect nulls over (order by id desc) as x5,
+       sum(id) over (order by id desc ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as sum_id
+from (select 2 id, 'b' a union all select 1 id, null a union all select 3 id, null union all select 4 id, 'x')
+
+# LEAD window function IGNORE/RESPECT NULLS support with descending order and nondefault offset
+statement error Execution error: IGNORE NULLS mode for LEAD is not supported for BoundedWindowAggExec
+select lead(a, 2, null) ignore nulls over (order by id desc) as x1,
+       lead(a, 2, 'def') ignore nulls over (order by id desc) as x2,
+       lead(a, 2, null) respect nulls over (order by id desc) as x4,
+       lead(a, 2, 'def') respect nulls over (order by id desc) as x5
+from (select 2 id, 'b' a union all select 1 id, null a union all select 3 id, null union all select 4 id, 'x')
+
+statement ok
+set datafusion.execution.batch_size = 1000;
+
+query I
+SELECT LAG(c1, 2) IGNORE NULLS OVER()
+FROM null_cases
+ORDER BY c2
+LIMIT 5;
+----
+78
+63
+3
+24
+14
+
+# The result should be the same as above when the lag algorithm works with pruned data:
+# decreasing the batch size causes data to be produced in smaller chunks at the source,
+# hence the sliding-window algorithm is used during the calculation.
+statement ok +set datafusion.execution.batch_size = 1; + +query I +SELECT LAG(c1, 2) IGNORE NULLS OVER() +FROM null_cases +ORDER BY c2 +LIMIT 5; +---- +78 +63 +3 +24 +14 diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index 58a741c63401..23a7ee05d73e 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -978,6 +978,7 @@ pub async fn from_substrait_rex( from_substrait_bound(&window.lower_bound, true)?, from_substrait_bound(&window.upper_bound, false)?, ), + null_treatment: None, }))) } Some(RexType::Subquery(subquery)) => match &subquery.as_ref().subquery_type { diff --git a/datafusion/substrait/src/logical_plan/producer.rs b/datafusion/substrait/src/logical_plan/producer.rs index fc9517c90a45..9b29c0c67765 100644 --- a/datafusion/substrait/src/logical_plan/producer.rs +++ b/datafusion/substrait/src/logical_plan/producer.rs @@ -1115,6 +1115,7 @@ pub fn to_substrait_rex( partition_by, order_by, window_frame, + null_treatment: _, }) => { // function reference let function_anchor = _register_function(fun.to_string(), extension_info); From ae4113dc4e15fc6aea5eb99c91c63c63b305eb1b Mon Sep 17 00:00:00 2001 From: Lordworms <48054792+Lordworms@users.noreply.github.com> Date: Thu, 22 Feb 2024 23:15:11 -0600 Subject: [PATCH 06/45] fix: issue #9213 substitute ArrayAgg to NthValue to optimize query plan (#9295) * fix: issue #9213 substitute ArrayAgg to NthValue to optimize query plan * fix format * adding type check * adding test --- datafusion/sql/src/expr/mod.rs | 46 +++++++- .../test_files/agg_func_substitute.slt | 100 ++++++++++++++++++ 2 files changed, 142 insertions(+), 4 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/agg_func_substitute.slt diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index ecf510da7bce..da6c3a6074d4 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -203,9 +203,44 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } SQLExpr::ArrayIndex { obj, indexes } => { + fn is_unsupported(expr: &SQLExpr) -> bool { + matches!(expr, SQLExpr::JsonAccess { .. 
}) + } + fn simplify_array_index_expr(expr: Expr, index: Expr) -> (Expr, bool) { + match &expr { + Expr::AggregateFunction(agg_func) if agg_func.func_def == datafusion_expr::expr::AggregateFunctionDefinition::BuiltIn(AggregateFunction::ArrayAgg) => { + let mut new_args = agg_func.args.clone(); + new_args.push(index.clone()); + (Expr::AggregateFunction(datafusion_expr::expr::AggregateFunction::new( + datafusion_expr::AggregateFunction::NthValue, + new_args, + agg_func.distinct, + agg_func.filter.clone(), + agg_func.order_by.clone(), + )), true) + }, + _ => (expr, false), + } + } let expr = self.sql_expr_to_logical_expr(*obj, schema, planner_context)?; - self.plan_indexed(expr, indexes, schema, planner_context) + if indexes.len() > 1 || is_unsupported(&indexes[0]) { + return self.plan_indexed(expr, indexes, schema, planner_context); + } + let (new_expr, changed) = simplify_array_index_expr( + expr, + self.sql_expr_to_logical_expr( + indexes[0].clone(), + schema, + planner_context, + )?, + ); + + if changed { + Ok(new_expr) + } else { + self.plan_indexed(new_expr, indexes, schema, planner_context) + } } SQLExpr::CompoundIdentifier(ids) => { @@ -557,7 +592,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { limit, within_group, } = array_agg; - let order_by = if let Some(order_by) = order_by { Some(self.order_by_to_sort_expr( &order_by, @@ -581,10 +615,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { vec![self.sql_expr_to_logical_expr(*expr, input_schema, planner_context)?]; // next, aggregate built-ins - let fun = AggregateFunction::ArrayAgg; Ok(Expr::AggregateFunction(expr::AggregateFunction::new( - fun, args, distinct, None, order_by, + AggregateFunction::ArrayAgg, + args, + distinct, + None, + order_by, ))) + // see if we can rewrite it into NTH-VALUE } fn sql_in_list_to_expr( diff --git a/datafusion/sqllogictest/test_files/agg_func_substitute.slt b/datafusion/sqllogictest/test_files/agg_func_substitute.slt new file mode 100644 index 000000000000..650ec1ad8e5c --- /dev/null +++ b/datafusion/sqllogictest/test_files/agg_func_substitute.slt @@ -0,0 +1,100 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
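+
+# These tests exercise the rewrite of ARRAY_AGG(expr ORDER BY ...)[n] into
+# NTH_VALUE(expr, n ORDER BY ...) (added in datafusion/sql/src/expr/mod.rs
+# above), so the plan can evaluate NTH_VALUE directly instead of
+# materializing the whole array first.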
+ +####### +# Setup test data table +####### +statement ok +CREATE EXTERNAL TABLE multiple_ordered_table ( + a0 INTEGER, + a INTEGER, + b INTEGER, + c INTEGER, + d INTEGER +) +STORED AS CSV +WITH HEADER ROW +WITH ORDER (a ASC, b ASC) +WITH ORDER (c ASC) +LOCATION '../../datafusion/core/tests/data/window_2.csv'; + + +query TT +EXPLAIN SELECT a, ARRAY_AGG(c ORDER BY c)[1] as result + FROM multiple_ordered_table + GROUP BY a; +---- +logical_plan +Projection: multiple_ordered_table.a, NTH_VALUE(multiple_ordered_table.c,Int64(1)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] AS result +--Aggregate: groupBy=[[multiple_ordered_table.a]], aggr=[[NTH_VALUE(multiple_ordered_table.c, Int64(1)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]]] +----TableScan: multiple_ordered_table projection=[a, c] +physical_plan +ProjectionExec: expr=[a@0 as a, NTH_VALUE(multiple_ordered_table.c,Int64(1)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]@1 as result] +--AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[NTH_VALUE(multiple_ordered_table.c,Int64(1))], ordering_mode=Sorted +----SortExec: expr=[a@0 ASC NULLS LAST] +------CoalesceBatchesExec: target_batch_size=8192 +--------RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 +----------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[NTH_VALUE(multiple_ordered_table.c,Int64(1))], ordering_mode=Sorted +------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c], output_orderings=[[a@0 ASC NULLS LAST], [c@1 ASC NULLS LAST]], has_header=true + + +query TT +EXPLAIN SELECT a, NTH_VALUE(c, 1 ORDER BY c) as result + FROM multiple_ordered_table + GROUP BY a; +---- +logical_plan +Projection: multiple_ordered_table.a, NTH_VALUE(multiple_ordered_table.c,Int64(1)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] AS result +--Aggregate: groupBy=[[multiple_ordered_table.a]], aggr=[[NTH_VALUE(multiple_ordered_table.c, Int64(1)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]]] +----TableScan: multiple_ordered_table projection=[a, c] +physical_plan +ProjectionExec: expr=[a@0 as a, NTH_VALUE(multiple_ordered_table.c,Int64(1)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]@1 as result] +--AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[NTH_VALUE(multiple_ordered_table.c,Int64(1))], ordering_mode=Sorted +----SortExec: expr=[a@0 ASC NULLS LAST] +------CoalesceBatchesExec: target_batch_size=8192 +--------RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 +----------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[NTH_VALUE(multiple_ordered_table.c,Int64(1))], ordering_mode=Sorted +------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c], output_orderings=[[a@0 ASC NULLS LAST], [c@1 ASC NULLS LAST]], has_header=true + +query TT +EXPLAIN SELECT a, ARRAY_AGG(c ORDER BY c)[1 + 100] as result + FROM multiple_ordered_table + GROUP BY a; +---- +logical_plan +Projection: multiple_ordered_table.a, NTH_VALUE(multiple_ordered_table.c,Int64(1) + Int64(100)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] AS result +--Aggregate: groupBy=[[multiple_ordered_table.a]], aggr=[[NTH_VALUE(multiple_ordered_table.c, Int64(101)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST] AS NTH_VALUE(multiple_ordered_table.c,Int64(1) + 
Int64(100)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]]] +----TableScan: multiple_ordered_table projection=[a, c] +physical_plan +ProjectionExec: expr=[a@0 as a, NTH_VALUE(multiple_ordered_table.c,Int64(1) + Int64(100)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]@1 as result] +--AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[NTH_VALUE(multiple_ordered_table.c,Int64(1) + Int64(100)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]], ordering_mode=Sorted +----SortExec: expr=[a@0 ASC NULLS LAST] +------CoalesceBatchesExec: target_batch_size=8192 +--------RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4 +----------AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[NTH_VALUE(multiple_ordered_table.c,Int64(1) + Int64(100)) ORDER BY [multiple_ordered_table.c ASC NULLS LAST]], ordering_mode=Sorted +------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +--------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, c], output_orderings=[[a@0 ASC NULLS LAST], [c@1 ASC NULLS LAST]], has_header=true + +query II +SELECT a, ARRAY_AGG(c ORDER BY c)[1] as result + FROM multiple_ordered_table + GROUP BY a; +---- +0 0 +1 50 From fad60615623079ee733c6e48f3ef4749d001ac19 Mon Sep 17 00:00:00 2001 From: comphead Date: Thu, 22 Feb 2024 22:38:00 -0800 Subject: [PATCH 07/45] Minor: Adding missing fields to debug for (#9325) --- datafusion/physical-expr/src/scalar_function.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datafusion/physical-expr/src/scalar_function.rs b/datafusion/physical-expr/src/scalar_function.rs index b73626aa4340..bfe0fdb279f5 100644 --- a/datafusion/physical-expr/src/scalar_function.rs +++ b/datafusion/physical-expr/src/scalar_function.rs @@ -69,6 +69,8 @@ impl Debug for ScalarFunctionExpr { .field("name", &self.name) .field("args", &self.args) .field("return_type", &self.return_type) + .field("monotonicity", &self.monotonicity) + .field("supports_zero_argument", &self.supports_zero_argument) .finish() } } From 3b355c798a3258f118016b33f26c5a55fed36220 Mon Sep 17 00:00:00 2001 From: SteveLauC Date: Fri, 23 Feb 2024 17:58:38 +0800 Subject: [PATCH 08/45] docs: document range() alias generate_series() (#9321) --- docs/source/user-guide/sql/scalar_functions.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index a6e872cab4c1..707e8c24b326 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -1935,6 +1935,7 @@ from_unixtime(expression) - [array_to_string](#array_to_string) - [cardinality](#cardinality) - [empty](#empty) +- [generate_series](#generate_series) - [list_append](#list_append) - [list_sort](#list_sort) - [list_cat](#list_cat) @@ -2881,6 +2882,10 @@ empty(array) +------------------+ ``` +### `generate_series` + +_Alias of [range](#range)._ + ### `list_append` _Alias of [array_append](#array_append)._ @@ -3090,6 +3095,10 @@ Step can not be 0 (then the range will be nonsense.). 
- **end**: end of the range (not included) - **step**: increase by step (can not be 0) +#### Aliases + +- generate_series + ## Struct Functions - [struct](#struct) From 73c08b5045fc627c3ed81d1bcc5234a7436a8e38 Mon Sep 17 00:00:00 2001 From: universalmind303 Date: Sat, 24 Feb 2024 10:25:04 -0600 Subject: [PATCH 09/45] chore: statically link xz2 (#9311) * chore: statically link xz2 * toml fmt --- datafusion/core/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 874e4551323d..506be6667ef2 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -116,7 +116,7 @@ tokio = { workspace = true } tokio-util = { version = "0.7.4", features = ["io"], optional = true } url = { workspace = true } uuid = { version = "1.0", features = ["v4"] } -xz2 = { version = "0.1", optional = true } +xz2 = { version = "0.1", optional = true, features = ["static"] } zstd = { version = "0.13", optional = true, default-features = false } [dev-dependencies] From ff36f6d99285404cf45f870ccc1c650411fdb335 Mon Sep 17 00:00:00 2001 From: Lordworms <48054792+Lordworms@users.noreply.github.com> Date: Sat, 24 Feb 2024 12:20:53 -0600 Subject: [PATCH 10/45] fix: throw error when incursion happen in dataframe api (#9330) * fix: issue #9327 throw error when incursion happen in dataframe api * fix --- datafusion/core/src/dataframe/mod.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index e407c477ae4c..3a60d57f6685 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -49,8 +49,8 @@ use datafusion_common::file_options::csv_writer::CsvWriterOptions; use datafusion_common::file_options::json_writer::JsonWriterOptions; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{ - Column, DFSchema, DataFusionError, FileType, FileTypeWriterOptions, ParamValues, - SchemaError, UnnestOptions, + plan_err, Column, DFSchema, DataFusionError, FileType, FileTypeWriterOptions, + ParamValues, SchemaError, UnnestOptions, }; use datafusion_expr::dml::CopyOptions; use datafusion_expr::{ @@ -1044,6 +1044,9 @@ impl DataFrame { /// # } /// ``` pub fn explain(self, verbose: bool, analyze: bool) -> Result { + if matches!(self.plan, LogicalPlan::Explain(_)) { + return plan_err!("Nested EXPLAINs are not supported"); + } let plan = LogicalPlanBuilder::from(self.plan) .explain(verbose, analyze)? 
.build()?; @@ -2975,4 +2978,15 @@ mod tests { Ok(()) } + #[tokio::test] + async fn nested_explain_should_fail() -> Result<()> { + let ctx = SessionContext::new(); + // must be error + let mut result = ctx.sql("explain select 1").await?.explain(false, false); + assert!(result.is_err()); + // must be error + result = ctx.sql("explain explain select 1").await; + assert!(result.is_err()); + Ok(()) + } } From 148b4d22f1d6de2bc6269ab96ba2e48e4735a9f9 Mon Sep 17 00:00:00 2001 From: Hoang Pham Date: Sun, 25 Feb 2024 20:36:56 +0700 Subject: [PATCH 11/45] Support CopyTo::partition_by in datafusion proto (#9306) * add support for CopyTo::partition_by in proto Signed-off-by: Hoang Pham * simplify partition_by logic Signed-off-by: Hoang Pham --------- Signed-off-by: Hoang Pham --- datafusion/expr/src/logical_plan/dml.rs | 2 +- datafusion/proto/proto/datafusion.proto | 1 + datafusion/proto/src/generated/pbjson.rs | 18 ++++++++++++++++++ datafusion/proto/src/generated/prost.rs | 2 ++ datafusion/proto/src/logical_plan/mod.rs | 6 ++++-- .../tests/cases/roundtrip_logical_plan.rs | 11 +++++++---- 6 files changed, 33 insertions(+), 7 deletions(-) diff --git a/datafusion/expr/src/logical_plan/dml.rs b/datafusion/expr/src/logical_plan/dml.rs index a55781eda643..7f04bd8973d6 100644 --- a/datafusion/expr/src/logical_plan/dml.rs +++ b/datafusion/expr/src/logical_plan/dml.rs @@ -36,7 +36,7 @@ pub struct CopyTo { pub output_url: String, /// The file format to output (explicitly defined or inferred from file extension) pub file_format: FileType, - /// Detmines which, if any, columns should be used for hive-style partitioned writes + /// Determines which, if any, columns should be used for hive-style partitioned writes pub partition_by: Vec, /// Arbitrary options as tuples pub copy_options: CopyOptions, diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index e779e29cb8da..7673ce86ae1d 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -327,6 +327,7 @@ message CopyToNode { FileTypeWriterOptions writer_options = 5; } string file_type = 6; + repeated string partition_by = 7; } message SQLOptions { diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index f5f15aa3e428..65483f9ac467 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -3795,6 +3795,9 @@ impl serde::Serialize for CopyToNode { if !self.file_type.is_empty() { len += 1; } + if !self.partition_by.is_empty() { + len += 1; + } if self.copy_options.is_some() { len += 1; } @@ -3808,6 +3811,9 @@ impl serde::Serialize for CopyToNode { if !self.file_type.is_empty() { struct_ser.serialize_field("fileType", &self.file_type)?; } + if !self.partition_by.is_empty() { + struct_ser.serialize_field("partitionBy", &self.partition_by)?; + } if let Some(v) = self.copy_options.as_ref() { match v { copy_to_node::CopyOptions::SqlOptions(v) => { @@ -3833,6 +3839,8 @@ impl<'de> serde::Deserialize<'de> for CopyToNode { "outputUrl", "file_type", "fileType", + "partition_by", + "partitionBy", "sql_options", "sqlOptions", "writer_options", @@ -3844,6 +3852,7 @@ impl<'de> serde::Deserialize<'de> for CopyToNode { Input, OutputUrl, FileType, + PartitionBy, SqlOptions, WriterOptions, } @@ -3870,6 +3879,7 @@ impl<'de> serde::Deserialize<'de> for CopyToNode { "input" => Ok(GeneratedField::Input), "outputUrl" | "output_url" => Ok(GeneratedField::OutputUrl), "fileType" | "file_type" => 
Ok(GeneratedField::FileType), + "partitionBy" | "partition_by" => Ok(GeneratedField::PartitionBy), "sqlOptions" | "sql_options" => Ok(GeneratedField::SqlOptions), "writerOptions" | "writer_options" => Ok(GeneratedField::WriterOptions), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), @@ -3894,6 +3904,7 @@ impl<'de> serde::Deserialize<'de> for CopyToNode { let mut input__ = None; let mut output_url__ = None; let mut file_type__ = None; + let mut partition_by__ = None; let mut copy_options__ = None; while let Some(k) = map_.next_key()? { match k { @@ -3915,6 +3926,12 @@ impl<'de> serde::Deserialize<'de> for CopyToNode { } file_type__ = Some(map_.next_value()?); } + GeneratedField::PartitionBy => { + if partition_by__.is_some() { + return Err(serde::de::Error::duplicate_field("partitionBy")); + } + partition_by__ = Some(map_.next_value()?); + } GeneratedField::SqlOptions => { if copy_options__.is_some() { return Err(serde::de::Error::duplicate_field("sqlOptions")); @@ -3935,6 +3952,7 @@ impl<'de> serde::Deserialize<'de> for CopyToNode { input: input__, output_url: output_url__.unwrap_or_default(), file_type: file_type__.unwrap_or_default(), + partition_by: partition_by__.unwrap_or_default(), copy_options: copy_options__, }) } diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 69d035239cb8..a567269e3356 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -511,6 +511,8 @@ pub struct CopyToNode { pub output_url: ::prost::alloc::string::String, #[prost(string, tag = "6")] pub file_type: ::prost::alloc::string::String, + #[prost(string, repeated, tag = "7")] + pub partition_by: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, #[prost(oneof = "copy_to_node::CopyOptions", tags = "4, 5")] pub copy_options: ::core::option::Option, } diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index aaaf165e3276..f107af757a71 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ b/datafusion/proto/src/logical_plan/mod.rs @@ -913,12 +913,13 @@ impl AsLogicalPlan for LogicalPlanNode { } None => return Err(proto_error("CopyTo missing CopyOptions")), }; + Ok(datafusion_expr::LogicalPlan::Copy( datafusion_expr::dml::CopyTo { input: Arc::new(input), output_url: copy.output_url.clone(), file_format: FileType::from_str(©.file_type)?, - partition_by: vec![], + partition_by: copy.partition_by.clone(), copy_options, }, )) @@ -1642,7 +1643,7 @@ impl AsLogicalPlan for LogicalPlanNode { output_url, file_format, copy_options, - partition_by: _, + partition_by, }) => { let input = protobuf::LogicalPlanNode::try_from_logical_plan( input, @@ -1726,6 +1727,7 @@ impl AsLogicalPlan for LogicalPlanNode { output_url: output_url.to_string(), file_type: file_format.to_string(), copy_options: copy_options_proto, + partition_by: partition_by.clone(), }, ))), }) diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 6ca757908159..e3bd2cb1dc47 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -324,7 +324,7 @@ async fn roundtrip_logical_plan_copy_to_sql_options() -> Result<()> { input: Arc::new(input), output_url: "test.csv".to_string(), file_format: FileType::CSV, - partition_by: vec![], + partition_by: vec!["a".to_string(), "b".to_string(), "c".to_string()], copy_options: 
CopyOptions::SQLOptions(StatementOptions::from(&options)), }); @@ -355,7 +355,7 @@ async fn roundtrip_logical_plan_copy_to_writer_options() -> Result<()> { input: Arc::new(input), output_url: "test.parquet".to_string(), file_format: FileType::PARQUET, - partition_by: vec![], + partition_by: vec!["a".to_string(), "b".to_string(), "c".to_string()], copy_options: CopyOptions::WriterOptions(Box::new( FileTypeWriterOptions::Parquet(ParquetWriterOptions::new(writer_properties)), )), @@ -369,6 +369,7 @@ async fn roundtrip_logical_plan_copy_to_writer_options() -> Result<()> { LogicalPlan::Copy(copy_to) => { assert_eq!("test.parquet", copy_to.output_url); assert_eq!(FileType::PARQUET, copy_to.file_format); + assert_eq!(vec!["a", "b", "c"], copy_to.partition_by); match ©_to.copy_options { CopyOptions::WriterOptions(y) => match y.as_ref() { FileTypeWriterOptions::Parquet(p) => { @@ -404,7 +405,7 @@ async fn roundtrip_logical_plan_copy_to_arrow() -> Result<()> { input: Arc::new(input), output_url: "test.arrow".to_string(), file_format: FileType::ARROW, - partition_by: vec![], + partition_by: vec!["a".to_string(), "b".to_string(), "c".to_string()], copy_options: CopyOptions::WriterOptions(Box::new(FileTypeWriterOptions::Arrow( ArrowWriterOptions::new(), ))), @@ -418,6 +419,7 @@ async fn roundtrip_logical_plan_copy_to_arrow() -> Result<()> { LogicalPlan::Copy(copy_to) => { assert_eq!("test.arrow", copy_to.output_url); assert_eq!(FileType::ARROW, copy_to.file_format); + assert_eq!(vec!["a", "b", "c"], copy_to.partition_by); match ©_to.copy_options { CopyOptions::WriterOptions(y) => match y.as_ref() { FileTypeWriterOptions::Arrow(_) => {} @@ -450,7 +452,7 @@ async fn roundtrip_logical_plan_copy_to_csv() -> Result<()> { input: Arc::new(input), output_url: "test.csv".to_string(), file_format: FileType::CSV, - partition_by: vec![], + partition_by: vec!["a".to_string(), "b".to_string(), "c".to_string()], copy_options: CopyOptions::WriterOptions(Box::new(FileTypeWriterOptions::CSV( CsvWriterOptions::new( writer_properties, @@ -467,6 +469,7 @@ async fn roundtrip_logical_plan_copy_to_csv() -> Result<()> { LogicalPlan::Copy(copy_to) => { assert_eq!("test.csv", copy_to.output_url); assert_eq!(FileType::CSV, copy_to.file_format); + assert_eq!(vec!["a", "b", "c"], copy_to.partition_by); match ©_to.copy_options { CopyOptions::WriterOptions(y) => match y.as_ref() { FileTypeWriterOptions::CSV(p) => { From 10cbb056a31f409fb1303df4601d777cedd3aaed Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 25 Feb 2024 10:18:35 -0700 Subject: [PATCH 12/45] Add test to prevent circular dependencies from being added (#9292) --- datafusion/core/Cargo.toml | 1 + datafusion/core/tests/depcheck.rs | 78 +++++ dev/release/crate-deps.dot | 101 ++++-- dev/release/crate-deps.svg | 524 +++++++++++++++++++++--------- 4 files changed, 524 insertions(+), 180 deletions(-) create mode 100644 datafusion/core/tests/depcheck.rs diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 506be6667ef2..c3bd89037cfe 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -122,6 +122,7 @@ zstd = { version = "0.13", optional = true, default-features = false } [dev-dependencies] async-trait = { workspace = true } bigdecimal = { workspace = true } +cargo = "0.77.0" criterion = { version = "0.5", features = ["async_tokio"] } csv = "1.1.6" ctor = { workspace = true } diff --git a/datafusion/core/tests/depcheck.rs b/datafusion/core/tests/depcheck.rs new file mode 100644 index 000000000000..94448818691e --- /dev/null +++ 
b/datafusion/core/tests/depcheck.rs @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +/// Check for circular dependencies between DataFusion crates +use std::collections::{HashMap, HashSet}; +use std::env; +use std::path::Path; + +use cargo::util::config::Config; +#[test] +fn test_deps() -> Result<(), Box> { + let config = Config::default()?; + let path = env::var("CARGO_MANIFEST_DIR").unwrap(); + let dir = Path::new(&path); + let root_cargo_toml = dir.join("Cargo.toml"); + let workspace = cargo::core::Workspace::new(&root_cargo_toml, &config)?; + let (_, resolve) = cargo::ops::resolve_ws(&workspace)?; + + let mut package_deps = HashMap::new(); + for package_id in resolve + .iter() + .filter(|id| id.name().starts_with("datafusion")) + { + let deps: Vec = resolve + .deps(package_id) + .filter(|(package_id, _)| package_id.name().starts_with("datafusion")) + .map(|(package_id, _)| package_id.name().to_string()) + .collect(); + package_deps.insert(package_id.name().to_string(), deps); + } + + // check for circular dependencies + for (root_package, deps) in &package_deps { + let mut seen = HashSet::new(); + for dep in deps { + check_circular_deps(root_package, dep, &package_deps, &mut seen); + } + } + + Ok(()) +} + +fn check_circular_deps( + root_package: &str, + current_dep: &str, + package_deps: &HashMap>, + seen: &mut HashSet, +) { + if root_package == current_dep { + panic!( + "circular dependency detected from {root_package} to self via one of {:?}", + seen + ); + } + if seen.contains(current_dep) { + return; + } + seen.insert(current_dep.to_string()); + if let Some(deps) = package_deps.get(current_dep) { + for dep in deps { + check_circular_deps(root_package, dep, package_deps, seen); + } + } +} diff --git a/dev/release/crate-deps.dot b/dev/release/crate-deps.dot index 618eb56afb75..69811c7d6109 100644 --- a/dev/release/crate-deps.dot +++ b/dev/release/crate-deps.dot @@ -16,39 +16,76 @@ // under the License. 
digraph G { - - datafusion_common - - datafusion_expr -> datafusion_common - - datafusion_sql -> datafusion_common - datafusion_sql -> datafusion_expr - - datafusion_optimizer -> datafusion_common - datafusion_optimizer -> datafusion_expr - - datafusion_physical_expr -> datafusion_common - datafusion_physical_expr -> datafusion_expr - - datafusion_execution -> datafusion_common - datafusion_execution -> datafusion_expr - + datafusion_examples + datafusion_examples -> datafusion + datafusion_examples -> datafusion_common + datafusion_examples -> datafusion_expr + datafusion_examples -> datafusion_optimizer + datafusion_examples -> datafusion_physical_expr + datafusion_examples -> datafusion_sql + datafusion_expr + datafusion_expr -> datafusion_common + datafusion_functions + datafusion_functions -> datafusion_common + datafusion_functions -> datafusion_execution + datafusion_functions -> datafusion_expr + datafusion_wasmtest + datafusion_wasmtest -> datafusion + datafusion_wasmtest -> datafusion_common + datafusion_wasmtest -> datafusion_execution + datafusion_wasmtest -> datafusion_expr + datafusion_wasmtest -> datafusion_optimizer + datafusion_wasmtest -> datafusion_physical_expr + datafusion_wasmtest -> datafusion_physical_plan + datafusion_wasmtest -> datafusion_sql + datafusion_common + datafusion_sql + datafusion_sql -> datafusion_common + datafusion_sql -> datafusion_expr + datafusion_physical_plan datafusion_physical_plan -> datafusion_common datafusion_physical_plan -> datafusion_execution datafusion_physical_plan -> datafusion_expr datafusion_physical_plan -> datafusion_physical_expr - - datafusion -> datafusion_common - datafusion -> datafusion_execution - datafusion -> datafusion_expr - datafusion -> datafusion_optimizer - datafusion -> datafusion_physical_expr - datafusion -> datafusion_physical_plan - datafusion -> datafusion_sql - - datafusion_proto -> datafusion - - datafusion_substrait -> datafusion - - datafusion_cli -> datafusion -} + datafusion_benchmarks + datafusion_benchmarks -> datafusion + datafusion_benchmarks -> datafusion_common + datafusion_benchmarks -> datafusion_proto + datafusion_docs_tests + datafusion_docs_tests -> datafusion + datafusion_optimizer + datafusion_optimizer -> datafusion_common + datafusion_optimizer -> datafusion_expr + datafusion_optimizer -> datafusion_physical_expr + datafusion_optimizer -> datafusion_sql + datafusion_proto + datafusion_proto -> datafusion + datafusion_proto -> datafusion_common + datafusion_proto -> datafusion_expr + datafusion_physical_expr + datafusion_physical_expr -> datafusion_common + datafusion_physical_expr -> datafusion_execution + datafusion_physical_expr -> datafusion_expr + datafusion_sqllogictest + datafusion_sqllogictest -> datafusion + datafusion_sqllogictest -> datafusion_common + datafusion + datafusion -> datafusion_common + datafusion -> datafusion_execution + datafusion -> datafusion_expr + datafusion -> datafusion_functions + datafusion -> datafusion_functions_array + datafusion -> datafusion_optimizer + datafusion -> datafusion_physical_expr + datafusion -> datafusion_physical_plan + datafusion -> datafusion_sql + datafusion_functions_array + datafusion_functions_array -> datafusion_common + datafusion_functions_array -> datafusion_execution + datafusion_functions_array -> datafusion_expr + datafusion_execution + datafusion_execution -> datafusion_common + datafusion_execution -> datafusion_expr + datafusion_substrait + datafusion_substrait -> datafusion +} \ No newline at end of file diff --git 
a/dev/release/crate-deps.svg b/dev/release/crate-deps.svg index a7c7b7fe4acd..cf60bf752642 100644 --- a/dev/release/crate-deps.svg +++ b/dev/release/crate-deps.svg @@ -1,217 +1,445 @@ - - - + + G - - + + -datafusion_common - -datafusion_common +datafusion_examples + +datafusion_examples - + -datafusion_expr - -datafusion_expr +datafusion + +datafusion - + -datafusion_expr->datafusion_common - - +datafusion_examples->datafusion + + - + -datafusion_sql - -datafusion_sql +datafusion_common + +datafusion_common - + -datafusion_sql->datafusion_common - - +datafusion_examples->datafusion_common + + - + + +datafusion_expr + +datafusion_expr + + -datafusion_sql->datafusion_expr - - +datafusion_examples->datafusion_expr + + - + datafusion_optimizer - -datafusion_optimizer + +datafusion_optimizer - + +datafusion_examples->datafusion_optimizer + + + + + +datafusion_physical_expr + +datafusion_physical_expr + + + +datafusion_examples->datafusion_physical_expr + + + + + +datafusion_sql + +datafusion_sql + + + +datafusion_examples->datafusion_sql + + + + + +datafusion->datafusion_common + + + + + +datafusion->datafusion_expr + + + + + +datafusion->datafusion_optimizer + + + + + +datafusion->datafusion_physical_expr + + + + + +datafusion->datafusion_sql + + + + + +datafusion_functions + +datafusion_functions + + + +datafusion->datafusion_functions + + + + + +datafusion_execution + +datafusion_execution + + + +datafusion->datafusion_execution + + + + + +datafusion_physical_plan + +datafusion_physical_plan + + + +datafusion->datafusion_physical_plan + + + + + +datafusion_functions_array + +datafusion_functions_array + + + +datafusion->datafusion_functions_array + + + + + +datafusion_expr->datafusion_common + + + + + datafusion_optimizer->datafusion_common - - + + - + datafusion_optimizer->datafusion_expr - - + + - - -datafusion_physical_expr - -datafusion_physical_expr + + +datafusion_optimizer->datafusion_physical_expr + + + + + +datafusion_optimizer->datafusion_sql + + - + datafusion_physical_expr->datafusion_common - - + + - + datafusion_physical_expr->datafusion_expr - - + + - - -datafusion_execution - -datafusion_execution + + +datafusion_physical_expr->datafusion_execution + + - - -datafusion_execution->datafusion_common - - + + +datafusion_sql->datafusion_common + + - - -datafusion_execution->datafusion_expr - - + + +datafusion_sql->datafusion_expr + + - - -datafusion_physical_plan - -datafusion_physical_plan + + +datafusion_functions->datafusion_common + + - + -datafusion_physical_plan->datafusion_common - - +datafusion_functions->datafusion_expr + + - - -datafusion_physical_plan->datafusion_expr - - + + +datafusion_functions->datafusion_execution + + - - -datafusion_physical_plan->datafusion_physical_expr - - + + +datafusion_execution->datafusion_common + + - + + +datafusion_execution->datafusion_expr + + + + + +datafusion_wasmtest + +datafusion_wasmtest + + -datafusion_physical_plan->datafusion_execution - - +datafusion_wasmtest->datafusion + + - - -datafusion - -datafusion + + +datafusion_wasmtest->datafusion_common + + - + -datafusion->datafusion_common - - +datafusion_wasmtest->datafusion_expr + + - + + +datafusion_wasmtest->datafusion_optimizer + + + + -datafusion->datafusion_expr - - +datafusion_wasmtest->datafusion_physical_expr + + - - -datafusion->datafusion_sql - - + + +datafusion_wasmtest->datafusion_sql + + - + + +datafusion_wasmtest->datafusion_execution + + + + -datafusion->datafusion_optimizer - - +datafusion_wasmtest->datafusion_physical_plan + + - - 
-datafusion->datafusion_physical_expr - - + + +datafusion_physical_plan->datafusion_common + + - - -datafusion->datafusion_execution - - + + +datafusion_physical_plan->datafusion_expr + + - - -datafusion->datafusion_physical_plan - - + + +datafusion_physical_plan->datafusion_physical_expr + + + + + +datafusion_physical_plan->datafusion_execution + + + + + +datafusion_benchmarks + +datafusion_benchmarks + + + +datafusion_benchmarks->datafusion + + + + + +datafusion_benchmarks->datafusion_common + + - + datafusion_proto - -datafusion_proto + +datafusion_proto + + + +datafusion_benchmarks->datafusion_proto + + - + datafusion_proto->datafusion - - + + + + + +datafusion_proto->datafusion_common + + + + + +datafusion_proto->datafusion_expr + + + + + +datafusion_docs_tests + +datafusion_docs_tests + + + +datafusion_docs_tests->datafusion + + + + + +datafusion_sqllogictest + +datafusion_sqllogictest + + + +datafusion_sqllogictest->datafusion + + + + + +datafusion_sqllogictest->datafusion_common + + + + + +datafusion_functions_array->datafusion_common + + + + + +datafusion_functions_array->datafusion_expr + + + + + +datafusion_functions_array->datafusion_execution + + - + datafusion_substrait - -datafusion_substrait + +datafusion_substrait - + datafusion_substrait->datafusion - - - - - -datafusion_cli - -datafusion_cli - - - -datafusion_cli->datafusion - - + + From 22585586bff554cbd0c08099d303dcc95ef61cfc Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Mon, 26 Feb 2024 07:30:24 +0800 Subject: [PATCH 13/45] Substrait: Support ScalarUDF (#9337) * support udf in substrait Signed-off-by: jayzhan211 * clippy Signed-off-by: jayzhan211 --------- Signed-off-by: jayzhan211 --- datafusion/functions/src/math/mod.rs | 2 +- .../substrait/src/logical_plan/consumer.rs | 69 +++++++++++++++---- .../tests/cases/roundtrip_logical_plan.rs | 5 ++ 3 files changed, 60 insertions(+), 16 deletions(-) diff --git a/datafusion/functions/src/math/mod.rs b/datafusion/functions/src/math/mod.rs index 67d2d957ea1f..873625948a35 100644 --- a/datafusion/functions/src/math/mod.rs +++ b/datafusion/functions/src/math/mod.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! "core" DataFusion functions +//! 
"math" DataFusion functions mod nans; diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index 23a7ee05d73e..095806c538d1 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -28,7 +28,7 @@ use datafusion::logical_expr::{ }; use datafusion::logical_expr::{ expr, Cast, Extension, GroupingSet, Like, LogicalPlanBuilder, Partitioning, - Repartition, Subquery, WindowFrameBound, WindowFrameUnits, + Repartition, ScalarUDF, Subquery, WindowFrameBound, WindowFrameUnits, }; use datafusion::prelude::JoinType; use datafusion::sql::TableReference; @@ -78,6 +78,7 @@ enum ScalarFunctionType { Builtin(BuiltinScalarFunction), Op(Operator), Expr(BuiltinExprBuilder), + Udf(Arc), } pub fn name_to_op(name: &str) -> Result { @@ -113,7 +114,15 @@ pub fn name_to_op(name: &str) -> Result { } } -fn scalar_function_type_from_str(name: &str) -> Result { +fn scalar_function_type_from_str( + ctx: &SessionContext, + name: &str, +) -> Result { + let s = ctx.state(); + if let Some(func) = s.scalar_functions().get(name) { + return Ok(ScalarFunctionType::Udf(func.to_owned())); + } + if let Ok(op) = name_to_op(name) { return Ok(ScalarFunctionType::Op(op)); } @@ -859,21 +868,51 @@ pub async fn from_substrait_rex( f.function_reference )) })?; - let fn_type = scalar_function_type_from_str(fn_name)?; + + // Convert function arguments from Substrait to DataFusion + async fn decode_arguments( + ctx: &SessionContext, + input_schema: &DFSchema, + extensions: &HashMap, + function_args: &[FunctionArgument], + ) -> Result> { + let mut args = Vec::with_capacity(function_args.len()); + for arg in function_args { + let arg_expr = match &arg.arg_type { + Some(ArgType::Value(e)) => { + from_substrait_rex(ctx, e, input_schema, extensions).await + } + _ => not_impl_err!( + "Aggregated function argument non-Value type not supported" + ), + }?; + args.push(arg_expr.as_ref().clone()); + } + Ok(args) + } + + let fn_type = scalar_function_type_from_str(ctx, fn_name)?; match fn_type { + ScalarFunctionType::Udf(fun) => { + let args = decode_arguments( + ctx, + input_schema, + extensions, + f.arguments.as_slice(), + ) + .await?; + Ok(Arc::new(Expr::ScalarFunction( + expr::ScalarFunction::new_udf(fun, args), + ))) + } ScalarFunctionType::Builtin(fun) => { - let mut args = Vec::with_capacity(f.arguments.len()); - for arg in &f.arguments { - let arg_expr = match &arg.arg_type { - Some(ArgType::Value(e)) => { - from_substrait_rex(ctx, e, input_schema, extensions).await - } - _ => not_impl_err!( - "Aggregated function argument non-Value type not supported" - ), - }; - args.push(arg_expr?.as_ref().clone()); - } + let args = decode_arguments( + ctx, + input_schema, + extensions, + f.arguments.as_slice(), + ) + .await?; Ok(Arc::new(Expr::ScalarFunction(expr::ScalarFunction::new( fun, args, )))) diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index 79cf76de5985..331d63cc22b2 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -309,6 +309,11 @@ async fn simple_scalar_function_abs() -> Result<()> { roundtrip("SELECT ABS(a) FROM data").await } +#[tokio::test] +async fn simple_scalar_function_isnan() -> Result<()> { + roundtrip("SELECT ISNAN(a) FROM data").await +} + #[tokio::test] async fn simple_scalar_function_pow() -> Result<()> { roundtrip("SELECT POW(a, 2) 
FROM data").await From 3050699580fcebaaf06d27960e59038a99be99d5 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sun, 25 Feb 2024 19:17:17 -0700 Subject: [PATCH 14/45] Make agg_func_substitute test deterministic (#9340) --- datafusion/sqllogictest/test_files/agg_func_substitute.slt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/agg_func_substitute.slt b/datafusion/sqllogictest/test_files/agg_func_substitute.slt index 650ec1ad8e5c..ff485e97ca0c 100644 --- a/datafusion/sqllogictest/test_files/agg_func_substitute.slt +++ b/datafusion/sqllogictest/test_files/agg_func_substitute.slt @@ -94,7 +94,8 @@ ProjectionExec: expr=[a@0 as a, NTH_VALUE(multiple_ordered_table.c,Int64(1) + In query II SELECT a, ARRAY_AGG(c ORDER BY c)[1] as result FROM multiple_ordered_table - GROUP BY a; + GROUP BY a + ORDER BY a; ---- 0 0 1 50 From ace9815ae5d8c99001c84fb71100536989364c34 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Thu, 22 Feb 2024 15:54:39 +0300 Subject: [PATCH 15/45] use create_cache_convention --- .../examples/custom_datasource.rs | 18 +-- .../datasource/physical_plan/arrow_file.rs | 34 ++--- .../core/src/datasource/physical_plan/avro.rs | 25 ++-- .../core/src/datasource/physical_plan/csv.rs | 35 ++--- .../core/src/datasource/physical_plan/json.rs | 35 ++--- .../datasource/physical_plan/parquet/mod.rs | 35 ++--- .../enforce_distribution.rs | 16 +-- .../physical_optimizer/output_requirements.rs | 16 +-- datafusion/core/src/physical_planner.rs | 19 +-- datafusion/core/src/test/mod.rs | 19 ++- datafusion/core/src/test_util/mod.rs | 27 ++-- datafusion/core/tests/custom_sources.rs | 20 +-- .../provider_filter_pushdown.rs | 21 ++- .../tests/custom_sources_cases/statistics.rs | 19 ++- .../tests/user_defined/user_defined_plan.rs | 20 +-- .../physical-plan/src/aggregates/mod.rs | 65 ++++----- datafusion/physical-plan/src/analyze.rs | 21 ++- .../physical-plan/src/coalesce_batches.rs | 17 +-- .../physical-plan/src/coalesce_partitions.rs | 15 +-- datafusion/physical-plan/src/empty.rs | 27 ++-- datafusion/physical-plan/src/explain.rs | 20 ++- datafusion/physical-plan/src/filter.rs | 112 ++++++++-------- datafusion/physical-plan/src/insert.rs | 26 ++-- .../physical-plan/src/joins/cross_join.rs | 36 ++--- .../physical-plan/src/joins/hash_join.rs | 42 +++--- .../src/joins/nested_loop_join.rs | 42 +++--- .../src/joins/sort_merge_join.rs | 42 +++--- .../src/joins/symmetric_hash_join.rs | 42 +++--- datafusion/physical-plan/src/lib.rs | 16 --- datafusion/physical-plan/src/limit.rs | 34 ++--- datafusion/physical-plan/src/memory.rs | 35 ++--- .../physical-plan/src/placeholder_row.rs | 29 ++-- datafusion/physical-plan/src/projection.rs | 31 ++--- .../physical-plan/src/recursive_query.rs | 21 +-- .../physical-plan/src/repartition/mod.rs | 60 ++++++--- .../physical-plan/src/sorts/partial_sort.rs | 48 ++++--- datafusion/physical-plan/src/sorts/sort.rs | 51 ++++--- .../src/sorts/sort_preserving_merge.rs | 17 +-- datafusion/physical-plan/src/streaming.rs | 34 +++-- datafusion/physical-plan/src/test/exec.rs | 124 ++++++++---------- datafusion/physical-plan/src/union.rs | 43 +++--- datafusion/physical-plan/src/unnest.rs | 24 ++-- datafusion/physical-plan/src/values.rs | 19 +-- .../src/windows/bounded_window_agg_exec.rs | 29 ++-- .../src/windows/window_agg_exec.rs | 23 ++-- datafusion/physical-plan/src/work_table.rs | 18 +-- 46 files changed, 787 insertions(+), 735 deletions(-) diff --git a/datafusion-examples/examples/custom_datasource.rs 
b/datafusion-examples/examples/custom_datasource.rs index 9516dc570d6d..d3cd66b2c9bc 100644 --- a/datafusion-examples/examples/custom_datasource.rs +++ b/datafusion-examples/examples/custom_datasource.rs @@ -35,6 +35,7 @@ use datafusion::physical_plan::{ }; use datafusion::prelude::*; use datafusion_expr::{Expr, LogicalPlanBuilder}; +use datafusion_physical_expr::EquivalenceProperties; use async_trait::async_trait; use tokio::time::timeout; @@ -199,22 +200,21 @@ impl CustomExec { db: CustomDataSource, ) -> Self { let projected_schema = project_schema(&schema, projections).unwrap(); - let cache = PlanPropertiesCache::new_default(projected_schema.clone()); + let cache = Self::create_cache(projected_schema.clone()); Self { db, projected_schema, cache, } - .with_cache() } - fn with_cache(mut self) -> Self { - self.cache = self - .cache - .with_partitioning(Partitioning::UnknownPartitioning(1)) - .with_exec_mode(ExecutionMode::Bounded); - - self + fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); + PlanPropertiesCache::new( + eq_properties, + Partitioning::UnknownPartitioning(1), + ExecutionMode::Bounded, + ) } } diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index 1a27f9315b34..24e825a6920b 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -61,7 +61,11 @@ impl ArrowExec { pub fn new(base_config: FileScanConfig) -> Self { let (projected_schema, projected_statistics, projected_output_ordering) = base_config.project(); - let cache = PlanPropertiesCache::new_default(projected_schema.clone()); + let cache = Self::create_cache( + projected_schema.clone(), + &projected_output_ordering, + &base_config, + ); Self { base_config, projected_schema, @@ -70,36 +74,36 @@ impl ArrowExec { metrics: ExecutionPlanMetricsSet::new(), cache, } - .with_cache() } /// Ref to the base configs pub fn base_config(&self) -> &FileScanConfig { &self.base_config } - fn output_partitioning_helper(&self) -> Partitioning { - Partitioning::UnknownPartitioning(self.base_config.file_groups.len()) + fn output_partitioning_helper(file_scan_config: &FileScanConfig) -> Partitioning { + Partitioning::UnknownPartitioning(file_scan_config.file_groups.len()) } - fn with_cache(mut self) -> Self { + fn create_cache( + schema: SchemaRef, + projected_output_ordering: &[LexOrdering], + file_scan_config: &FileScanConfig, + ) -> PlanPropertiesCache { // Equivalence Properties - let eq_properties = EquivalenceProperties::new_with_orderings( - self.schema(), - &self.projected_output_ordering, - ); + let eq_properties = + EquivalenceProperties::new_with_orderings(schema, projected_output_ordering); - self.cache = PlanPropertiesCache::new( + PlanPropertiesCache::new( eq_properties, - self.output_partitioning_helper(), // Output Partitioning - ExecutionMode::Bounded, // Execution Mode - ); - self + Self::output_partitioning_helper(file_scan_config), // Output Partitioning + ExecutionMode::Bounded, // Execution Mode + ) } fn with_file_groups(mut self, file_groups: Vec>) -> Self { self.base_config.file_groups = file_groups; // Changing file groups may invalidate output partitioning. 
Update it also - let output_partitioning = self.output_partitioning_helper(); + let output_partitioning = Self::output_partitioning_helper(&self.base_config); self.cache = self.cache.with_partitioning(output_partitioning); self } diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index fb2cd627a1da..6b6e7bce90c1 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -50,7 +50,11 @@ impl AvroExec { pub fn new(base_config: FileScanConfig) -> Self { let (projected_schema, projected_statistics, projected_output_ordering) = base_config.project(); - let cache = PlanPropertiesCache::new_default(projected_schema.clone()); + let cache = Self::create_cache( + projected_schema.clone(), + &projected_output_ordering, + &base_config, + ); Self { base_config, projected_schema, @@ -59,27 +63,26 @@ impl AvroExec { metrics: ExecutionPlanMetricsSet::new(), cache, } - .with_cache() } /// Ref to the base configs pub fn base_config(&self) -> &FileScanConfig { &self.base_config } - fn with_cache(mut self) -> Self { + fn create_cache( + schema: SchemaRef, + orderings: &[LexOrdering], + file_scan_config: &FileScanConfig, + ) -> PlanPropertiesCache { // Equivalence Properties - let eq_properties = EquivalenceProperties::new_with_orderings( - self.schema(), - &self.projected_output_ordering, - ); - let n_partitions = self.base_config.file_groups.len(); + let eq_properties = EquivalenceProperties::new_with_orderings(schema, orderings); + let n_partitions = file_scan_config.file_groups.len(); - self.cache = PlanPropertiesCache::new( + PlanPropertiesCache::new( eq_properties, Partitioning::UnknownPartitioning(n_partitions), // Output Partitioning ExecutionMode::Bounded, // Execution Mode - ); - self + ) } } diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index 55cf62507788..19281bc3c189 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -53,7 +53,6 @@ use tokio::task::JoinSet; pub struct CsvExec { base_config: FileScanConfig, projected_statistics: Statistics, - projected_output_ordering: Vec, has_header: bool, delimiter: u8, quote: u8, @@ -77,11 +76,14 @@ impl CsvExec { ) -> Self { let (projected_schema, projected_statistics, projected_output_ordering) = base_config.project(); - let cache = PlanPropertiesCache::new_default(projected_schema); + let cache = Self::create_cache( + projected_schema, + &projected_output_ordering, + &base_config, + ); Self { base_config, projected_statistics, - projected_output_ordering, has_header, delimiter, quote, @@ -90,7 +92,6 @@ impl CsvExec { file_compression_type, cache, } - .with_cache() } /// Ref to the base configs @@ -116,29 +117,29 @@ impl CsvExec { self.escape } - fn output_partitioning_helper(&self) -> Partitioning { - Partitioning::UnknownPartitioning(self.base_config.file_groups.len()) + fn output_partitioning_helper(file_scan_config: &FileScanConfig) -> Partitioning { + Partitioning::UnknownPartitioning(file_scan_config.file_groups.len()) } - fn with_cache(mut self) -> Self { + fn create_cache( + schema: SchemaRef, + orderings: &[LexOrdering], + file_scan_config: &FileScanConfig, + ) -> PlanPropertiesCache { // Equivalence Properties - let eq_properties = EquivalenceProperties::new_with_orderings( - self.schema(), - &self.projected_output_ordering, - ); + let eq_properties = 
EquivalenceProperties::new_with_orderings(schema, orderings); - self.cache = PlanPropertiesCache::new( + PlanPropertiesCache::new( eq_properties, - self.output_partitioning_helper(), // Output Partitioning - ExecutionMode::Bounded, // Execution Mode - ); - self + Self::output_partitioning_helper(file_scan_config), // Output Partitioning + ExecutionMode::Bounded, // Execution Mode + ) } fn with_file_groups(mut self, file_groups: Vec>) -> Self { self.base_config.file_groups = file_groups; // Changing file groups may invalidate output partitioning. Update it also - let output_partitioning = self.output_partitioning_helper(); + let output_partitioning = Self::output_partitioning_helper(&self.base_config); self.cache = self.cache.with_partitioning(output_partitioning); self } diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index 231b48f28d96..6e17e58d8444 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -52,7 +52,6 @@ use tokio::task::JoinSet; pub struct NdJsonExec { base_config: FileScanConfig, projected_statistics: Statistics, - projected_output_ordering: Vec, /// Execution metrics metrics: ExecutionPlanMetricsSet, file_compression_type: FileCompressionType, @@ -67,16 +66,18 @@ impl NdJsonExec { ) -> Self { let (projected_schema, projected_statistics, projected_output_ordering) = base_config.project(); - let cache = PlanPropertiesCache::new_default(projected_schema); + let cache = Self::create_cache( + projected_schema, + &projected_output_ordering, + &base_config, + ); Self { base_config, projected_statistics, - projected_output_ordering, metrics: ExecutionPlanMetricsSet::new(), file_compression_type, cache, } - .with_cache() } /// Ref to the base configs @@ -84,29 +85,29 @@ impl NdJsonExec { &self.base_config } - fn output_partitioning_helper(&self) -> Partitioning { - Partitioning::UnknownPartitioning(self.base_config.file_groups.len()) + fn output_partitioning_helper(file_scan_config: &FileScanConfig) -> Partitioning { + Partitioning::UnknownPartitioning(file_scan_config.file_groups.len()) } - fn with_cache(mut self) -> Self { + fn create_cache( + schema: SchemaRef, + orderings: &[LexOrdering], + file_scan_config: &FileScanConfig, + ) -> PlanPropertiesCache { // Equivalence Properties - let eq_properties = EquivalenceProperties::new_with_orderings( - self.schema(), - &self.projected_output_ordering, - ); + let eq_properties = EquivalenceProperties::new_with_orderings(schema, orderings); - self.cache = PlanPropertiesCache::new( + PlanPropertiesCache::new( eq_properties, - self.output_partitioning_helper(), // Output Partitioning - ExecutionMode::Bounded, // Execution Mode - ); - self + Self::output_partitioning_helper(file_scan_config), // Output Partitioning + ExecutionMode::Bounded, // Execution Mode + ) } fn with_file_groups(mut self, file_groups: Vec>) -> Self { self.base_config.file_groups = file_groups; // Changing file groups may invalidate output partitioning. 
Update it also - let output_partitioning = self.output_partitioning_helper(); + let output_partitioning = Self::output_partitioning_helper(&self.base_config); self.cache = self.cache.with_partitioning(output_partitioning); self } diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs index e47c8e516385..810a84646c86 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs @@ -88,7 +88,6 @@ pub struct ParquetExec { /// Base configuration for this scan base_config: FileScanConfig, projected_statistics: Statistics, - projected_output_ordering: Vec, /// Execution metrics metrics: ExecutionPlanMetricsSet, /// Optional predicate for row filtering during parquet scan @@ -149,7 +148,11 @@ impl ParquetExec { let (projected_schema, projected_statistics, projected_output_ordering) = base_config.project(); - let cache = PlanPropertiesCache::new_default(projected_schema); + let cache = Self::create_cache( + projected_schema, + &projected_output_ordering, + &base_config, + ); Self { pushdown_filters: None, reorder_filters: None, @@ -157,7 +160,6 @@ impl ParquetExec { enable_bloom_filter: None, base_config, projected_statistics, - projected_output_ordering, metrics, predicate, pruning_predicate, @@ -166,7 +168,6 @@ impl ParquetExec { parquet_file_reader_factory: None, cache, } - .with_cache() } /// Ref to the base configs @@ -261,29 +262,29 @@ impl ParquetExec { .unwrap_or(config_options.execution.parquet.bloom_filter_enabled) } - fn output_partitioning_helper(&self) -> Partitioning { - Partitioning::UnknownPartitioning(self.base_config.file_groups.len()) + fn output_partitioning_helper(file_config: &FileScanConfig) -> Partitioning { + Partitioning::UnknownPartitioning(file_config.file_groups.len()) } - fn with_cache(mut self) -> Self { + fn create_cache( + schema: SchemaRef, + orderings: &[LexOrdering], + file_config: &FileScanConfig, + ) -> PlanPropertiesCache { // Equivalence Properties - let eq_properties = EquivalenceProperties::new_with_orderings( - self.schema(), - &self.projected_output_ordering, - ); + let eq_properties = EquivalenceProperties::new_with_orderings(schema, orderings); - self.cache = PlanPropertiesCache::new( + PlanPropertiesCache::new( eq_properties, - self.output_partitioning_helper(), // Output Partitioning - ExecutionMode::Bounded, // Execution Mode - ); - self + Self::output_partitioning_helper(file_config), // Output Partitioning + ExecutionMode::Bounded, // Execution Mode + ) } fn with_file_groups(mut self, file_groups: Vec>) -> Self { self.base_config.file_groups = file_groups; // Changing file groups may invalidate output partitioning. 
Update it also - let output_partitioning = self.output_partitioning_helper(); + let output_partitioning = Self::output_partitioning_helper(&self.base_config); self.cache = self.cache.with_partitioning(output_partitioning); self } diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index c7bfe4742bdf..a5ad2d546d41 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -1352,22 +1352,20 @@ pub(crate) mod tests { input: Arc, requirement: Vec, ) -> Self { - let cache = PlanPropertiesCache::new_default(input.schema()); + let cache = Self::create_cache(&input); Self { input, expr: requirement, cache, } - .with_cache() } - fn with_cache(mut self) -> Self { - self.cache = PlanPropertiesCache::new( - self.input.equivalence_properties().clone(), // Equivalence Properties - self.input.output_partitioning().clone(), // Output Partitioning - self.input.execution_mode(), // Execution Mode - ); - self + fn create_cache(input: &Arc) -> PlanPropertiesCache { + PlanPropertiesCache::new( + input.equivalence_properties().clone(), // Equivalence Properties + input.output_partitioning().clone(), // Output Partitioning + input.execution_mode(), // Execution Mode + ) } } diff --git a/datafusion/core/src/physical_optimizer/output_requirements.rs b/datafusion/core/src/physical_optimizer/output_requirements.rs index 5ddba79515ee..a806580ce716 100644 --- a/datafusion/core/src/physical_optimizer/output_requirements.rs +++ b/datafusion/core/src/physical_optimizer/output_requirements.rs @@ -99,27 +99,25 @@ impl OutputRequirementExec { requirements: Option, dist_requirement: Distribution, ) -> Self { - let cache = PlanPropertiesCache::new_default(input.schema()); + let cache = Self::create_cache(&input); Self { input, order_requirement: requirements, dist_requirement, cache, } - .with_cache() } pub(crate) fn input(&self) -> Arc { self.input.clone() } - fn with_cache(mut self) -> Self { - self.cache = PlanPropertiesCache::new( - self.input.equivalence_properties().clone(), // Equivalence Properties - self.input.output_partitioning().clone(), // Output Partitioning - self.input.execution_mode(), // Execution Mode - ); - self + fn create_cache(input: &Arc) -> PlanPropertiesCache { + PlanPropertiesCache::new( + input.equivalence_properties().clone(), // Equivalence Properties + input.output_partitioning().clone(), // Output Partitioning + input.execution_mode(), // Execution Mode + ) } } diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 1a334678b6d7..e571bc76f4d5 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -2013,6 +2013,7 @@ mod tests { col, lit, sum, Extension, GroupingSet, LogicalPlanBuilder, UserDefinedLogicalNodeCore, }; + use datafusion_physical_expr::EquivalenceProperties; fn make_session_state() -> SessionState { let runtime = Arc::new(RuntimeEnv::default()); @@ -2579,19 +2580,19 @@ mod tests { impl NoOpExecutionPlan { fn new(schema: SchemaRef) -> Self { - let cache = PlanPropertiesCache::new_default(schema.clone()); - Self { cache }.with_cache() + let cache = Self::create_cache(schema.clone()); + Self { cache } } - fn with_cache(mut self) -> Self { - self.cache = self - .cache + fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); + PlanPropertiesCache::new( + 
eq_properties, // Output Partitioning - .with_partitioning(Partitioning::UnknownPartitioning(1)) + Partitioning::UnknownPartitioning(1), // Execution Mode - .with_exec_mode(ExecutionMode::Bounded); - - self + ExecutionMode::Bounded, + ) } } diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index 277901ff9915..f8eb67cfdaf5 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -42,7 +42,7 @@ use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion_common::{DataFusionError, FileType, Statistics}; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; -use datafusion_physical_expr::{Partitioning, PhysicalSortExpr}; +use datafusion_physical_expr::{EquivalenceProperties, Partitioning, PhysicalSortExpr}; use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec}; use datafusion_physical_plan::{ DisplayAs, DisplayFormatType, ExecutionMode, PlanPropertiesCache, @@ -376,24 +376,23 @@ impl StatisticsExec { stats.column_statistics.len(), schema.fields().len(), "if defined, the column statistics vector length should be the number of fields" ); - let cache = PlanPropertiesCache::new_default(Arc::new(schema.clone())); + let cache = Self::create_cache(Arc::new(schema.clone())); Self { stats, schema: Arc::new(schema), cache, } - .with_cache() } - fn with_cache(mut self) -> Self { - self.cache = self - .cache + fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); + PlanPropertiesCache::new( + eq_properties, // Output Partitioning - .with_partitioning(Partitioning::UnknownPartitioning(2)) + Partitioning::UnknownPartitioning(2), // Execution Mode - .with_exec_mode(ExecutionMode::Bounded); - - self + ExecutionMode::Bounded, + ) } } diff --git a/datafusion/core/src/test_util/mod.rs b/datafusion/core/src/test_util/mod.rs index bb016f93c351..dda6d730ce84 100644 --- a/datafusion/core/src/test_util/mod.rs +++ b/datafusion/core/src/test_util/mod.rs @@ -55,6 +55,7 @@ use tempfile::TempDir; #[cfg(feature = "parquet")] pub use datafusion_common::test_util::parquet_test_data; pub use datafusion_common::test_util::{arrow_test_data, get_data_dir}; +use datafusion_physical_expr::EquivalenceProperties; /// Scan an empty data source, mainly used in tests pub fn scan_empty( @@ -226,7 +227,6 @@ impl TableProvider for TestTableProvider { pub struct UnboundedExec { batch_produce: Option, batch: RecordBatch, - partitions: usize, cache: PlanPropertiesCache, } impl UnboundedExec { @@ -238,29 +238,30 @@ impl UnboundedExec { batch: RecordBatch, partitions: usize, ) -> Self { - let cache = PlanPropertiesCache::new_default(batch.schema()); + let cache = Self::create_cache(batch.schema(), batch_produce, partitions); Self { batch_produce, batch, - partitions, cache, } - .with_cache() } - fn with_cache(mut self) -> Self { - let mode = if self.batch_produce.is_none() { + fn create_cache( + schema: SchemaRef, + batch_produce: Option, + n_partitions: usize, + ) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); + let mode = if batch_produce.is_none() { ExecutionMode::Unbounded } else { ExecutionMode::Bounded }; - self.cache = self - .cache - // Output Partitioning - .with_partitioning(Partitioning::UnknownPartitioning(self.partitions)) - // Execution Mode - .with_exec_mode(mode); - self + PlanPropertiesCache::new( + eq_properties, + Partitioning::UnknownPartitioning(n_partitions), + mode, + ) } } 
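Every hunk in this patch applies the same refactor: the mutate-after-construction `with_cache()` builder is replaced by a static `create_cache()` helper that computes the `PlanPropertiesCache` up front, before `Self` is assembled, so `Self` is never constructed with a placeholder default cache. A minimal sketch of the convention for a hypothetical single-partition, bounded operator (`MyExec` and its fields are illustrative only, not part of the patch):

use arrow::datatypes::SchemaRef;
use datafusion_physical_expr::{EquivalenceProperties, Partitioning};
use datafusion_physical_plan::{ExecutionMode, PlanPropertiesCache};

struct MyExec {
    schema: SchemaRef,
    cache: PlanPropertiesCache,
}

impl MyExec {
    fn new(schema: SchemaRef) -> Self {
        // Plan properties are computed once, in the constructor, rather
        // than patched onto a default cache afterwards.
        let cache = Self::create_cache(schema.clone());
        Self { schema, cache }
    }

    fn create_cache(schema: SchemaRef) -> PlanPropertiesCache {
        let eq_properties = EquivalenceProperties::new(schema);
        PlanPropertiesCache::new(
            eq_properties,                        // Equivalence properties
            Partitioning::UnknownPartitioning(1), // Output partitioning
            ExecutionMode::Bounded,               // Execution mode
        )
    }
}

Operators whose partitioning depends on mutable state (e.g. the file-group setters above) recompute only the affected property afterwards via `cache.with_partitioning(...)` instead of rebuilding the whole cache.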
diff --git a/datafusion/core/tests/custom_sources.rs b/datafusion/core/tests/custom_sources.rs index a167258ee1d5..11f29192276c 100644 --- a/datafusion/core/tests/custom_sources.rs +++ b/datafusion/core/tests/custom_sources.rs @@ -38,6 +38,7 @@ use datafusion::scalar::ScalarValue; use datafusion_common::cast::as_primitive_array; use datafusion_common::project_schema; use datafusion_common::stats::Precision; +use datafusion_physical_expr::EquivalenceProperties; use datafusion_physical_plan::placeholder_row::PlaceholderRowExec; use datafusion_physical_plan::{ExecutionMode, PlanPropertiesCache}; @@ -81,19 +82,18 @@ impl CustomExecutionPlan { let schema = TEST_CUSTOM_SCHEMA_REF!(); let schema = project_schema(&schema, projection.as_ref()).expect("projected schema"); - let cache = PlanPropertiesCache::new_default(schema); - Self { projection, cache }.with_cache() + let cache = Self::create_cache(schema); + Self { projection, cache } } - fn with_cache(mut self) -> Self { - self.cache = self - .cache + fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); + PlanPropertiesCache::new( + eq_properties, // Output Partitioning - .with_partitioning(Partitioning::UnknownPartitioning(1)) - // Execution Mode - .with_exec_mode(ExecutionMode::Bounded); - - self + Partitioning::UnknownPartitioning(1), + ExecutionMode::Bounded, + ) } } diff --git a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs index 9423f0170c7e..da00effa00a8 100644 --- a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs +++ b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs @@ -35,6 +35,7 @@ use datafusion::scalar::ScalarValue; use datafusion_common::cast::as_primitive_array; use datafusion_common::{internal_err, not_impl_err, DataFusionError}; use datafusion_expr::expr::{BinaryExpr, Cast}; +use datafusion_physical_expr::EquivalenceProperties; use async_trait::async_trait; @@ -62,19 +63,17 @@ struct CustomPlan { impl CustomPlan { fn new(schema: SchemaRef, batches: Vec) -> Self { - let cache = PlanPropertiesCache::new_default(schema); - Self { batches, cache }.with_cache() + let cache = Self::create_cache(schema); + Self { batches, cache } } - fn with_cache(mut self) -> Self { - self.cache = self - .cache - // Output Partitioning - .with_partitioning(Partitioning::UnknownPartitioning(1)) - // Execution Mode - .with_exec_mode(ExecutionMode::Bounded); - - self + fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); + PlanPropertiesCache::new( + eq_properties, + Partitioning::UnknownPartitioning(1), + ExecutionMode::Bounded, + ) } } diff --git a/datafusion/core/tests/custom_sources_cases/statistics.rs b/datafusion/core/tests/custom_sources_cases/statistics.rs index 315c7cb6dd26..37854908f021 100644 --- a/datafusion/core/tests/custom_sources_cases/statistics.rs +++ b/datafusion/core/tests/custom_sources_cases/statistics.rs @@ -33,6 +33,7 @@ use datafusion::{ scalar::ScalarValue, }; use datafusion_common::{project_schema, stats::Precision}; +use datafusion_physical_expr::EquivalenceProperties; use async_trait::async_trait; @@ -52,24 +53,22 @@ impl StatisticsValidation { schema.fields().len(), "the column statistics vector length should be the number of fields" ); - let cache = PlanPropertiesCache::new_default(schema.clone()); + let cache = Self::create_cache(schema.clone()); Self { 
stats, schema, cache, } - .with_cache() } - fn with_cache(mut self) -> Self { - self.cache = self - .cache - // Output Partitioning - .with_partitioning(Partitioning::UnknownPartitioning(2)) - // Execution Mode - .with_exec_mode(ExecutionMode::Bounded); + fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); - self + PlanPropertiesCache::new( + eq_properties, + Partitioning::UnknownPartitioning(2), + ExecutionMode::Bounded, + ) } } diff --git a/datafusion/core/tests/user_defined/user_defined_plan.rs b/datafusion/core/tests/user_defined/user_defined_plan.rs index 947376dfb6a0..f2b6f6c93615 100644 --- a/datafusion/core/tests/user_defined/user_defined_plan.rs +++ b/datafusion/core/tests/user_defined/user_defined_plan.rs @@ -91,6 +91,7 @@ use datafusion::{ }; use async_trait::async_trait; +use datafusion_physical_expr::EquivalenceProperties; use futures::{Stream, StreamExt}; /// Execute the specified sql and return the resulting record batches @@ -416,19 +417,18 @@ struct TopKExec { impl TopKExec { fn new(input: Arc, k: usize) -> Self { - let cache = PlanPropertiesCache::new_default(input.schema()); - Self { input, k, cache }.with_cache() + let cache = Self::create_cache(input.schema()); + Self { input, k, cache } } - fn with_cache(mut self) -> Self { - self.cache = self - .cache - // Output Partitioning - .with_partitioning(Partitioning::UnknownPartitioning(1)) - // Execution Mode - .with_exec_mode(ExecutionMode::Bounded); + fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); - self + PlanPropertiesCache::new( + eq_properties, + Partitioning::UnknownPartitioning(1), + ExecutionMode::Bounded, + ) } } diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index ea3fc3a737b2..fa5b65e40123 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -260,9 +260,6 @@ pub struct AggregateExec { /// We need the input schema of partial aggregate to be able to deserialize aggregate /// expressions from protobuf for final aggregate. 
pub input_schema: SchemaRef, - /// The mapping used to normalize expressions like Partitioning and - /// PhysicalSortExpr that maps input to output - projection_mapping: ProjectionMapping, /// Execution metrics metrics: ExecutionPlanMetricsSet, required_input_ordering: Option, @@ -365,8 +362,14 @@ impl AggregateExec { let required_input_ordering = (!new_requirement.is_empty()).then_some(new_requirement); - let cache = PlanPropertiesCache::new_default(schema.clone()); - let aggregate = AggregateExec { + let cache = Self::create_cache( + &input, + schema.clone(), + &projection_mapping, + &mode, + &input_order_mode, + ); + Ok(AggregateExec { mode, group_by, aggr_expr, @@ -374,14 +377,12 @@ impl AggregateExec { input, schema, input_schema, - projection_mapping, metrics: ExecutionPlanMetricsSet::new(), required_input_ordering, limit: None, input_order_mode, cache, - }; - Ok(aggregate.with_cache()) + }) } /// Aggregation mode (full, partial) @@ -505,26 +506,31 @@ impl AggregateExec { true } - fn with_cache(mut self) -> Self { + fn create_cache( + input: &Arc, + schema: SchemaRef, + projection_mapping: &ProjectionMapping, + mode: &AggregateMode, + input_order_mode: &InputOrderMode, + ) -> PlanPropertiesCache { // Construct equivalence properties: - let eq_properties = self - .input + let eq_properties = input .equivalence_properties() - .project(&self.projection_mapping, self.schema()); + .project(projection_mapping, schema); // Get output partitioning: - let mut output_partitioning = self.input.output_partitioning().clone(); - if self.mode.is_first_stage() { + let mut output_partitioning = input.output_partitioning().clone(); + if mode.is_first_stage() { // First stage aggregation will not change the output partitioning, // but needs to respect aliases (e.g. mapping in the GROUP BY // expression). 
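// For instance, under a projection like `SELECT a AS a2 ... GROUP BY a`,
// a first-stage `Hash([a], n)` partitioning is rewritten to
// `Hash([a2], n)`; expressions with no projected counterpart fall back
// to `UnKnownColumn` placeholders. (Illustrative SQL, not from the patch.)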
- let input_eq_properties = self.input.equivalence_properties(); + let input_eq_properties = input.equivalence_properties(); if let Partitioning::Hash(exprs, part) = output_partitioning { let normalized_exprs = exprs .iter() .map(|expr| { input_eq_properties - .project_expr(expr, &self.projection_mapping) + .project_expr(expr, projection_mapping) .unwrap_or_else(|| { Arc::new(UnKnownColumn::new(&expr.to_string())) }) @@ -535,18 +541,15 @@ impl AggregateExec { } // Determine execution mode: - let mut exec_mode = self.input.execution_mode(); + let mut exec_mode = input.execution_mode(); if exec_mode == ExecutionMode::Unbounded - && self.input_order_mode == InputOrderMode::Linear + && *input_order_mode == InputOrderMode::Linear { // Cannot run without breaking the pipeline exec_mode = ExecutionMode::PipelineBreaking; } - self.cache = - PlanPropertiesCache::new(eq_properties, output_partitioning, exec_mode); - - self + PlanPropertiesCache::new(eq_properties, output_partitioning, exec_mode) } pub fn input_order_mode(&self) -> &InputOrderMode { @@ -1622,19 +1625,19 @@ mod tests { impl TestYieldingExec { fn new(yield_first: bool) -> Self { let schema = some_data().0; - let cache = PlanPropertiesCache::new_default(schema); - Self { yield_first, cache }.with_cache() + let cache = Self::create_cache(schema); + Self { yield_first, cache } } - fn with_cache(mut self) -> Self { - self.cache = self - .cache + fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); + PlanPropertiesCache::new( + eq_properties, // Output Partitioning - .with_partitioning(Partitioning::UnknownPartitioning(1)) + Partitioning::UnknownPartitioning(1), // Execution Mode - .with_exec_mode(ExecutionMode::Bounded); - - self + ExecutionMode::Bounded, + ) } } diff --git a/datafusion/physical-plan/src/analyze.rs b/datafusion/physical-plan/src/analyze.rs index 99b3add2acd6..731f3e3c7ebf 100644 --- a/datafusion/physical-plan/src/analyze.rs +++ b/datafusion/physical-plan/src/analyze.rs @@ -29,6 +29,7 @@ use crate::{DisplayFormatType, ExecutionPlan, Partitioning}; use arrow::{array::StringBuilder, datatypes::SchemaRef, record_batch::RecordBatch}; use datafusion_common::{internal_err, DataFusionError, Result}; use datafusion_execution::TaskContext; +use datafusion_physical_expr::EquivalenceProperties; use futures::StreamExt; @@ -55,7 +56,7 @@ impl AnalyzeExec { input: Arc, schema: SchemaRef, ) -> Self { - let cache = PlanPropertiesCache::new_default(schema.clone()); + let cache = Self::create_cache(&input, schema.clone()); AnalyzeExec { verbose, show_statistics, @@ -63,7 +64,6 @@ impl AnalyzeExec { schema, cache, } - .with_cache() } /// access to verbose @@ -81,15 +81,14 @@ impl AnalyzeExec { &self.input } - fn with_cache(mut self) -> Self { - self.cache = self - .cache - // Output Partitioning - .with_partitioning(Partitioning::UnknownPartitioning(1)) - // Execution Mode - .with_exec_mode(self.input.execution_mode()); - - self + fn create_cache( + input: &Arc, + schema: SchemaRef, + ) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); + let output_partitioning = Partitioning::UnknownPartitioning(1); + let exec_mode = input.execution_mode(); + PlanPropertiesCache::new(eq_properties, output_partitioning, exec_mode) } } diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs index e01060f3784d..e83bce0664a3 100644 --- a/datafusion/physical-plan/src/coalesce_batches.rs +++ 
b/datafusion/physical-plan/src/coalesce_batches.rs @@ -54,14 +54,13 @@ pub struct CoalesceBatchesExec { impl CoalesceBatchesExec { /// Create a new CoalesceBatchesExec pub fn new(input: Arc, target_batch_size: usize) -> Self { - let cache = PlanPropertiesCache::new_default(input.schema()); + let cache = Self::create_cache(&input); Self { input, target_batch_size, metrics: ExecutionPlanMetricsSet::new(), cache, } - .with_cache() } /// The input plan @@ -74,16 +73,14 @@ impl CoalesceBatchesExec { self.target_batch_size } - fn with_cache(mut self) -> Self { + fn create_cache(input: &Arc) -> PlanPropertiesCache { // The coalesce batches operator does not make any changes to the // partitioning of its input. - self.cache = PlanPropertiesCache::new( - self.input.equivalence_properties().clone(), // Equivalence Properties - self.input.output_partitioning().clone(), // Output Partitioning - self.input.execution_mode(), // Execution Mode - ); - - self + PlanPropertiesCache::new( + input.equivalence_properties().clone(), // Equivalence Properties + input.output_partitioning().clone(), // Output Partitioning + input.execution_mode(), // Execution Mode + ) } } diff --git a/datafusion/physical-plan/src/coalesce_partitions.rs b/datafusion/physical-plan/src/coalesce_partitions.rs index 255e996bd122..27f58c9bfd85 100644 --- a/datafusion/physical-plan/src/coalesce_partitions.rs +++ b/datafusion/physical-plan/src/coalesce_partitions.rs @@ -44,13 +44,12 @@ pub struct CoalescePartitionsExec { impl CoalescePartitionsExec { /// Create a new CoalescePartitionsExec pub fn new(input: Arc) -> Self { - let cache = PlanPropertiesCache::new_default(input.schema()); + let cache = Self::create_cache(&input); CoalescePartitionsExec { input, metrics: ExecutionPlanMetricsSet::new(), cache, } - .with_cache() } /// Input execution plan @@ -58,18 +57,16 @@ impl CoalescePartitionsExec { &self.input } - fn with_cache(mut self) -> Self { + fn create_cache(input: &Arc) -> PlanPropertiesCache { // Coalescing partitions loses existing orderings: - let mut eq_properties = self.input.equivalence_properties().clone(); + let mut eq_properties = input.equivalence_properties().clone(); eq_properties.clear_orderings(); - self.cache = PlanPropertiesCache::new( + PlanPropertiesCache::new( eq_properties, // Equivalence Properties Partitioning::UnknownPartitioning(1), // Output Partitioning - self.input.execution_mode(), // Execution Mode - ); - - self + input.execution_mode(), // Execution Mode + ) } } diff --git a/datafusion/physical-plan/src/empty.rs b/datafusion/physical-plan/src/empty.rs index d91395825135..942bee81f472 100644 --- a/datafusion/physical-plan/src/empty.rs +++ b/datafusion/physical-plan/src/empty.rs @@ -30,6 +30,7 @@ use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use datafusion_common::{internal_err, DataFusionError, Result}; use datafusion_execution::TaskContext; +use datafusion_physical_expr::EquivalenceProperties; use log::trace; @@ -46,20 +47,19 @@ pub struct EmptyExec { impl EmptyExec { /// Create a new EmptyExec pub fn new(schema: SchemaRef) -> Self { - let cache = PlanPropertiesCache::new_default(schema.clone()); + let cache = Self::create_cache(schema.clone(), 1); EmptyExec { schema, partitions: 1, cache, } - .with_cache() } /// Create a new EmptyExec with specified partition number pub fn with_partitions(mut self, partitions: usize) -> Self { self.partitions = partitions; // Changing partitions may invalidate output partitioning, so update it: - let output_partitioning = 
self.output_partitioning_helper(); + let output_partitioning = Self::output_partitioning_helper(self.partitions); self.cache = self.cache.with_partitioning(output_partitioning); self } @@ -68,21 +68,20 @@ impl EmptyExec { Ok(vec![]) } - fn output_partitioning_helper(&self) -> Partitioning { - Partitioning::UnknownPartitioning(self.partitions) + fn output_partitioning_helper(n_partitions: usize) -> Partitioning { + Partitioning::UnknownPartitioning(n_partitions) } - fn with_cache(mut self) -> Self { - let output_partitioning = self.output_partitioning_helper(); - - self.cache = self - .cache + fn create_cache(schema: SchemaRef, n_partitions: usize) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); + let output_partitioning = Self::output_partitioning_helper(n_partitions); + PlanPropertiesCache::new( + eq_properties, // Output Partitioning - .with_partitioning(output_partitioning) + output_partitioning, // Execution Mode - .with_exec_mode(ExecutionMode::Bounded); - - self + ExecutionMode::Bounded, + ) } } diff --git a/datafusion/physical-plan/src/explain.rs b/datafusion/physical-plan/src/explain.rs index 935b37c168da..689ef32aa1a9 100644 --- a/datafusion/physical-plan/src/explain.rs +++ b/datafusion/physical-plan/src/explain.rs @@ -28,6 +28,7 @@ use arrow::{array::StringBuilder, datatypes::SchemaRef, record_batch::RecordBatc use datafusion_common::display::StringifiedPlan; use datafusion_common::{internal_err, DataFusionError, Result}; use datafusion_execution::TaskContext; +use datafusion_physical_expr::EquivalenceProperties; use log::trace; @@ -52,14 +53,13 @@ impl ExplainExec { stringified_plans: Vec, verbose: bool, ) -> Self { - let cache = PlanPropertiesCache::new_default(schema.clone()); + let cache = Self::create_cache(schema.clone()); ExplainExec { schema, stringified_plans, verbose, cache, } - .with_cache() } /// The strings to be printed @@ -72,15 +72,13 @@ impl ExplainExec { self.verbose } - fn with_cache(mut self) -> Self { - self.cache = self - .cache - // Output Partitioning - .with_partitioning(Partitioning::UnknownPartitioning(1)) - // Execution Mode - .with_exec_mode(ExecutionMode::Bounded); - - self + fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); + PlanPropertiesCache::new( + eq_properties, + Partitioning::UnknownPartitioning(1), + ExecutionMode::Bounded, + ) } } diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index cc8fdcbcd0cd..d6942f0d5678 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -73,15 +73,15 @@ impl FilterExec { ) -> Result { match predicate.data_type(input.schema().as_ref())? 
{ DataType::Boolean => { - let cache = PlanPropertiesCache::new_default(input.schema()); + let default_selectivity = 20; + let cache = Self::create_cache(&input, &predicate, default_selectivity)?; Ok(Self { predicate, input: input.clone(), metrics: ExecutionPlanMetricsSet::new(), - default_selectivity: 20, + default_selectivity, cache, - } - .with_cache()) + }) } other => { plan_err!("Filter predicate must return boolean values, not {other:?}") @@ -115,12 +115,58 @@ impl FilterExec { self.default_selectivity } - fn with_cache(mut self) -> Self { + fn statistics_helper( + input: &Arc, + predicate: &Arc, + default_selectivity: u8, + ) -> Result { + let input_stats = input.statistics()?; + let schema = input.schema(); + if !check_support(predicate, &schema) { + let selectivity = default_selectivity as f64 / 100.0; + let mut stats = input_stats.into_inexact(); + stats.num_rows = stats.num_rows.with_estimated_selectivity(selectivity); + stats.total_byte_size = stats + .total_byte_size + .with_estimated_selectivity(selectivity); + return Ok(stats); + } + + let num_rows = input_stats.num_rows; + let total_byte_size = input_stats.total_byte_size; + let input_analysis_ctx = AnalysisContext::try_from_statistics( + &input.schema(), + &input_stats.column_statistics, + )?; + + let analysis_ctx = analyze(predicate, input_analysis_ctx, &schema)?; + + // Estimate (inexact) selectivity of predicate + let selectivity = analysis_ctx.selectivity.unwrap_or(1.0); + let num_rows = num_rows.with_estimated_selectivity(selectivity); + let total_byte_size = total_byte_size.with_estimated_selectivity(selectivity); + + let column_statistics = collect_new_statistics( + &input_stats.column_statistics, + analysis_ctx.boundaries, + ); + Ok(Statistics { + num_rows, + total_byte_size, + column_statistics, + }) + } + + fn create_cache( + input: &Arc, + predicate: &Arc, + default_selectivity: u8, + ) -> Result { // Combine the equal predicates with the input equivalence properties // to construct the equivalence properties: - let stats = self.statistics().unwrap(); - let mut eq_properties = self.input.equivalence_properties().clone(); - let (equal_pairs, _) = collect_columns_from_predicate(&self.predicate); + let stats = Self::statistics_helper(input, predicate, default_selectivity)?; + let mut eq_properties = input.equivalence_properties().clone(); + let (equal_pairs, _) = collect_columns_from_predicate(predicate); for (lhs, rhs) in equal_pairs { let lhs_expr = Arc::new(lhs.clone()) as _; let rhs_expr = Arc::new(rhs.clone()) as _; @@ -128,19 +174,17 @@ impl FilterExec { } // Add the columns that have only one viable value (singleton) after // filtering to constants. - let constants = collect_columns(self.predicate()) + let constants = collect_columns(predicate) .into_iter() .filter(|column| stats.column_statistics[column.index()].is_singleton()) .map(|column| Arc::new(column) as _); eq_properties = eq_properties.add_constants(constants); - self.cache = PlanPropertiesCache::new( + Ok(PlanPropertiesCache::new( eq_properties, - self.input.output_partitioning().clone(), // Output Partitioning - self.input.execution_mode(), // Execution Mode - ); - - self + input.output_partitioning().clone(), // Output Partitioning + input.execution_mode(), // Execution Mode + )) } } @@ -211,43 +255,7 @@ impl ExecutionPlan for FilterExec { /// The output statistics of a filtering operation can be estimated if the /// predicate's selectivity value can be determined for the incoming data. 
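    /// When the predicate is not supported by the analysis framework, the
    /// operator falls back to `default_selectivity` (20% unless overridden);
    /// e.g. an input reporting 1000 rows then yields an inexact estimate of
    /// 200 output rows, with `total_byte_size` scaled by the same factor.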
fn statistics(&self) -> Result { - let predicate = self.predicate(); - - let input_stats = self.input.statistics()?; - let schema = self.schema(); - if !check_support(predicate, &schema) { - let selectivity = self.default_selectivity as f64 / 100.0; - let mut stats = input_stats.into_inexact(); - stats.num_rows = stats.num_rows.with_estimated_selectivity(selectivity); - stats.total_byte_size = stats - .total_byte_size - .with_estimated_selectivity(selectivity); - return Ok(stats); - } - - let num_rows = input_stats.num_rows; - let total_byte_size = input_stats.total_byte_size; - let input_analysis_ctx = AnalysisContext::try_from_statistics( - &self.input.schema(), - &input_stats.column_statistics, - )?; - - let analysis_ctx = analyze(predicate, input_analysis_ctx, &self.schema())?; - - // Estimate (inexact) selectivity of predicate - let selectivity = analysis_ctx.selectivity.unwrap_or(1.0); - let num_rows = num_rows.with_estimated_selectivity(selectivity); - let total_byte_size = total_byte_size.with_estimated_selectivity(selectivity); - - let column_statistics = collect_new_statistics( - &input_stats.column_statistics, - analysis_ctx.boundaries, - ); - Ok(Statistics { - num_rows, - total_byte_size, - column_statistics, - }) + Self::statistics_helper(&self.input, self.predicate(), self.default_selectivity) } } diff --git a/datafusion/physical-plan/src/insert.rs b/datafusion/physical-plan/src/insert.rs index b20e8cac7926..472c65f25b30 100644 --- a/datafusion/physical-plan/src/insert.rs +++ b/datafusion/physical-plan/src/insert.rs @@ -35,7 +35,9 @@ use arrow_array::{ArrayRef, UInt64Array}; use arrow_schema::{DataType, Field, Schema}; use datafusion_common::{exec_err, internal_err, DataFusionError, Result}; use datafusion_execution::TaskContext; -use datafusion_physical_expr::{Distribution, PhysicalSortRequirement}; +use datafusion_physical_expr::{ + Distribution, EquivalenceProperties, PhysicalSortRequirement, +}; use async_trait::async_trait; use futures::StreamExt; @@ -104,7 +106,7 @@ impl FileSinkExec { sort_order: Option>, ) -> Self { let count_schema = make_count_schema(); - let cache = PlanPropertiesCache::new_default(count_schema); + let cache = Self::create_schema(&input, count_schema); Self { input, sink, @@ -113,7 +115,6 @@ impl FileSinkExec { sort_order, cache, } - .with_cache() } fn execute_input_stream( @@ -176,15 +177,16 @@ impl FileSinkExec { self.sink.metrics() } - fn with_cache(mut self) -> Self { - self.cache = self - .cache - // Output Partitioning - .with_partitioning(Partitioning::UnknownPartitioning(1)) - // Execution Mode - .with_exec_mode(self.input.execution_mode()); - - self + fn create_schema( + input: &Arc, + schema: SchemaRef, + ) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); + PlanPropertiesCache::new( + eq_properties, + Partitioning::UnknownPartitioning(1), + input.execution_mode(), + ) } } diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs index 8b12b02b4667..42758e635060 100644 --- a/datafusion/physical-plan/src/joins/cross_join.rs +++ b/datafusion/physical-plan/src/joins/cross_join.rs @@ -27,9 +27,9 @@ use crate::coalesce_batches::concat_batches; use crate::coalesce_partitions::CoalescePartitionsExec; use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet}; use crate::{ - ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, ExecutionMode, - ExecutionPlan, PlanPropertiesCache, RecordBatchStream, SendableRecordBatchStream, - Statistics, + 
exec_mode_flatten, ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, + ExecutionMode, ExecutionPlan, PlanPropertiesCache, RecordBatchStream, + SendableRecordBatchStream, Statistics, }; use arrow::datatypes::{Fields, Schema, SchemaRef}; @@ -77,7 +77,7 @@ impl CrossJoinExec { }; let schema = Arc::new(Schema::new(all_columns)); - let cache = PlanPropertiesCache::new_default(schema.clone()); + let cache = Self::create_cache(&left, &right, schema.clone()); CrossJoinExec { left, right, @@ -86,7 +86,6 @@ impl CrossJoinExec { metrics: ExecutionPlanMetricsSet::default(), cache, } - .with_cache() } /// left (build) side which gets loaded in memory @@ -99,15 +98,19 @@ impl CrossJoinExec { &self.right } - fn with_cache(mut self) -> Self { + fn create_cache( + left: &Arc, + right: &Arc, + schema: SchemaRef, + ) -> PlanPropertiesCache { // Calculate equivalence properties // TODO: Check equivalence properties of cross join, it may preserve // ordering in some cases. let eq_properties = join_equivalence_properties( - self.left.equivalence_properties().clone(), - self.right.equivalence_properties().clone(), + left.equivalence_properties().clone(), + right.equivalence_properties().clone(), &JoinType::Full, - self.schema(), + schema, &[false, false], None, &[], @@ -117,19 +120,18 @@ impl CrossJoinExec { // TODO: Optimize the cross join implementation to generate M * N // partitions. let output_partitioning = adjust_right_output_partitioning( - self.right.output_partitioning(), - self.left.schema().fields.len(), + right.output_partitioning(), + left.schema().fields.len(), ); // Determine the execution mode: - let mode = match (self.left.execution_mode(), self.right.execution_mode()) { - (ExecutionMode::Bounded, ExecutionMode::Bounded) => ExecutionMode::Bounded, + let mut mode = exec_mode_flatten([left, right]); + if mode.is_unbounded() { // If any of the inputs is unbounded, cross join breaks the pipeline. 
- (_, _) => ExecutionMode::PipelineBreaking, - }; + mode = ExecutionMode::PipelineBreaking; + } - self.cache = PlanPropertiesCache::new(eq_properties, output_partitioning, mode); - self + PlanPropertiesCache::new(eq_properties, output_partitioning, mode) } } diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs index 3aae053151cd..2b88ec449a04 100644 --- a/datafusion/physical-plan/src/joins/hash_join.rs +++ b/datafusion/physical-plan/src/joins/hash_join.rs @@ -65,6 +65,7 @@ use datafusion_execution::TaskContext; use datafusion_physical_expr::equivalence::join_equivalence_properties; use datafusion_physical_expr::PhysicalExprRef; +use crate::joins::utils::JoinOnRef; use ahash::RandomState; use futures::{ready, Stream, StreamExt, TryStreamExt}; @@ -327,7 +328,14 @@ impl HashJoinExec { let random_state = RandomState::with_seeds(0, 0, 0, 0); - let cache = PlanPropertiesCache::new_default(Arc::new(schema.clone())); + let cache = Self::create_cache( + &left, + &right, + Arc::new(schema.clone()), + *join_type, + &on, + partition_mode, + ); Ok(HashJoinExec { left, @@ -343,8 +351,7 @@ impl HashJoinExec { column_indices, null_equals_null, cache, - } - .with_cache()) + }) } /// left (build) side which gets hashed @@ -399,25 +406,29 @@ impl HashJoinExec { JoinSide::Right } - fn with_cache(mut self) -> Self { - let left = &self.left; - let right = &self.right; - let schema = self.schema(); + fn create_cache( + left: &Arc, + right: &Arc, + schema: SchemaRef, + join_type: JoinType, + on: JoinOnRef, + mode: PartitionMode, + ) -> PlanPropertiesCache { // Calculate equivalence properties: let eq_properties = join_equivalence_properties( left.equivalence_properties().clone(), right.equivalence_properties().clone(), - &self.join_type, + &join_type, schema, - &Self::maintains_input_order(self.join_type), + &Self::maintains_input_order(join_type), Some(Self::probe_side()), - &self.on, + on, ); // Get output partitioning: let left_columns_len = left.schema().fields.len(); - let output_partitioning = match self.mode { - PartitionMode::CollectLeft => match self.join_type { + let output_partitioning = match mode { + PartitionMode::CollectLeft => match join_type { JoinType::Inner | JoinType::Right => adjust_right_output_partitioning( right.output_partitioning(), left_columns_len, @@ -433,7 +444,7 @@ impl HashJoinExec { ), }, PartitionMode::Partitioned => partitioned_join_output_partitioning( - self.join_type, + join_type, left.output_partitioning(), right.output_partitioning(), left_columns_len, @@ -449,7 +460,7 @@ impl HashJoinExec { let pipeline_breaking = left.execution_mode().is_unbounded() || (right.execution_mode().is_unbounded() && matches!( - self.join_type, + join_type, JoinType::Left | JoinType::Full | JoinType::LeftAnti @@ -462,8 +473,7 @@ impl HashJoinExec { exec_mode_flatten([left, right]) }; - self.cache = PlanPropertiesCache::new(eq_properties, output_partitioning, mode); - self + PlanPropertiesCache::new(eq_properties, output_partitioning, mode) } } diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index 1978338d2b6a..89beac14816d 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -109,19 +109,19 @@ impl NestedLoopJoinExec { check_join_is_valid(&left_schema, &right_schema, &[])?; let (schema, column_indices) = build_join_schema(&left_schema, &right_schema, join_type); - let cache = 
PlanPropertiesCache::new_default(Arc::new(schema.clone())); + let schema = Arc::new(schema); + let cache = Self::create_cache(&left, &right, schema.clone(), *join_type); Ok(NestedLoopJoinExec { left, right, filter, join_type: *join_type, - schema: Arc::new(schema), + schema, inner_table: Default::default(), column_indices, metrics: Default::default(), cache, - } - .with_cache()) + }) } /// left side @@ -144,39 +144,43 @@ impl NestedLoopJoinExec { &self.join_type } - fn with_cache(mut self) -> Self { + fn create_cache( + left: &Arc, + right: &Arc, + schema: SchemaRef, + join_type: JoinType, + ) -> PlanPropertiesCache { // Calculate equivalence properties: let eq_properties = join_equivalence_properties( - self.left.equivalence_properties().clone(), - self.right.equivalence_properties().clone(), - &self.join_type, - self.schema(), - &self.maintains_input_order(), + left.equivalence_properties().clone(), + right.equivalence_properties().clone(), + &join_type, + schema, + &[false, false], None, // No on columns in nested loop join &[], ); // Get output partitioning, - let output_partitioning = if self.join_type == JoinType::Full { - self.left.output_partitioning().clone() + let output_partitioning = if join_type == JoinType::Full { + left.output_partitioning().clone() } else { partitioned_join_output_partitioning( - self.join_type, - self.left.output_partitioning(), - self.right.output_partitioning(), - self.left.schema().fields.len(), + join_type, + left.output_partitioning(), + right.output_partitioning(), + left.schema().fields.len(), ) }; // Determine execution mode: - let mut mode = exec_mode_flatten([&self.left, &self.right]); + let mut mode = exec_mode_flatten([left, right]); if mode.is_unbounded() { mode = ExecutionMode::PipelineBreaking; } - self.cache = PlanPropertiesCache::new(eq_properties, output_partitioning, mode); - self + PlanPropertiesCache::new(eq_properties, output_partitioning, mode) } } diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs index 0d7cd995a5a2..f7d754a99e0e 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs @@ -33,7 +33,7 @@ use std::task::{Context, Poll}; use crate::expressions::PhysicalSortExpr; use crate::joins::utils::{ build_join_schema, check_join_is_valid, estimate_join_statistics, - partitioned_join_output_partitioning, JoinFilter, JoinOn, + partitioned_join_output_partitioning, JoinFilter, JoinOn, JoinOnRef, }; use crate::metrics::{ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}; use crate::{ @@ -137,7 +137,7 @@ impl SortMergeJoinExec { let schema = Arc::new(build_join_schema(&left_schema, &right_schema, &join_type).0); - let cache = PlanPropertiesCache::new_default(schema.clone()); + let cache = Self::create_cache(&left, &right, schema.clone(), join_type, &on); Ok(Self { left, right, @@ -151,8 +151,7 @@ impl SortMergeJoinExec { sort_options, null_equals_null, cache, - } - .with_cache()) + }) } /// Get probe side (e.g streaming side) information for this sort merge join. 
@@ -201,32 +200,37 @@ impl SortMergeJoinExec { self.left.as_ref() } - fn with_cache(mut self) -> Self { + fn create_cache( + left: &Arc, + right: &Arc, + schema: SchemaRef, + join_type: JoinType, + join_on: JoinOnRef, + ) -> PlanPropertiesCache { // Calculate equivalence properties: let eq_properties = join_equivalence_properties( - self.left.equivalence_properties().clone(), - self.right.equivalence_properties().clone(), - &self.join_type, - self.schema(), - &self.maintains_input_order(), - Some(Self::probe_side(&self.join_type)), - self.on(), + left.equivalence_properties().clone(), + right.equivalence_properties().clone(), + &join_type, + schema, + &Self::maintains_input_order(join_type), + Some(Self::probe_side(&join_type)), + join_on, ); // Get output partitioning: - let left_columns_len = self.left.schema().fields.len(); + let left_columns_len = left.schema().fields.len(); let output_partitioning = partitioned_join_output_partitioning( - self.join_type, - self.left.output_partitioning(), - self.right.output_partitioning(), + join_type, + left.output_partitioning(), + right.output_partitioning(), left_columns_len, ); // Determine execution mode: - let mode = exec_mode_flatten([&self.left, &self.right]); + let mode = exec_mode_flatten([left, right]); - self.cache = PlanPropertiesCache::new(eq_properties, output_partitioning, mode); - self + PlanPropertiesCache::new(eq_properties, output_partitioning, mode) } } diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs index 5fd89e98a58f..4e07b10dd517 100644 --- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs +++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs @@ -43,7 +43,7 @@ use crate::joins::stream_join_utils::{ use crate::joins::utils::{ apply_join_filter_to_indices, build_batch_from_indices, build_join_schema, check_join_is_valid, partitioned_join_output_partitioning, ColumnIndex, JoinFilter, - JoinHashMapType, JoinOn, StatefulStreamResult, + JoinHashMapType, JoinOn, JoinOnRef, StatefulStreamResult, }; use crate::{ exec_mode_flatten, @@ -233,8 +233,8 @@ impl SymmetricHashJoinExec { // Initialize the random state for the join operation: let random_state = RandomState::with_seeds(0, 0, 0, 0); - - let cache = PlanPropertiesCache::new_default(Arc::new(schema)); + let schema = Arc::new(schema); + let cache = Self::create_cache(&left, &right, schema.clone(), *join_type, &on); Ok(SymmetricHashJoinExec { left, right, @@ -249,37 +249,41 @@ impl SymmetricHashJoinExec { right_sort_exprs, mode, cache, - } - .with_cache()) + }) } - fn with_cache(mut self) -> Self { + fn create_cache( + left: &Arc, + right: &Arc, + schema: SchemaRef, + join_type: JoinType, + join_on: JoinOnRef, + ) -> PlanPropertiesCache { // Calculate equivalence properties: let eq_properties = join_equivalence_properties( - self.left.equivalence_properties().clone(), - self.right.equivalence_properties().clone(), - &self.join_type, - self.schema(), - &self.maintains_input_order(), + left.equivalence_properties().clone(), + right.equivalence_properties().clone(), + &join_type, + schema, + &[false, false], // Has alternating probe side None, - self.on(), + join_on, ); // Get output partitioning: - let left_columns_len = self.left.schema().fields.len(); + let left_columns_len = left.schema().fields.len(); let output_partitioning = partitioned_join_output_partitioning( - self.join_type, - self.left.output_partitioning(), - self.right.output_partitioning(), + join_type, + 
left.output_partitioning(), + right.output_partitioning(), left_columns_len, ); // Determine execution mode: - let mode = exec_mode_flatten([&self.left, &self.right]); + let mode = exec_mode_flatten([left, right]); - self.cache = PlanPropertiesCache::new(eq_properties, output_partitioning, mode); - self + PlanPropertiesCache::new(eq_properties, output_partitioning, mode) } /// left stream diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 52bfb0063d40..f90bbf061d38 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -540,22 +540,6 @@ impl PlanPropertiesCache { } } - /// Construct a default `PlanPropertiesCache`, for a given schema. - pub fn new_default(schema: SchemaRef) -> PlanPropertiesCache { - // Default values are the most restrictive possible values. - let eq_properties = EquivalenceProperties::new(schema); - // Please note that this default is not safe, and should be overwritten. - let partitioning = Partitioning::UnknownPartitioning(0); - let exec_mode = ExecutionMode::PipelineBreaking; - let output_ordering = None; - Self { - eq_properties, - partitioning, - exec_mode, - output_ordering, - } - } - /// Overwrite output partitioning with its new value. pub fn with_partitioning(mut self, partitioning: Partitioning) -> Self { self.partitioning = partitioning; diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index 59559e84fd75..a4b924d71066 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -57,7 +57,7 @@ pub struct GlobalLimitExec { impl GlobalLimitExec { /// Create a new GlobalLimitExec pub fn new(input: Arc, skip: usize, fetch: Option) -> Self { - let cache = PlanPropertiesCache::new_default(input.schema()); + let cache = Self::create_cache(&input); GlobalLimitExec { input, skip, @@ -65,7 +65,6 @@ impl GlobalLimitExec { metrics: ExecutionPlanMetricsSet::new(), cache, } - .with_cache() } /// Input execution plan @@ -83,14 +82,12 @@ impl GlobalLimitExec { self.fetch } - fn with_cache(mut self) -> Self { - self.cache = PlanPropertiesCache::new( - self.input.equivalence_properties().clone(), // Equivalence Properties - Partitioning::UnknownPartitioning(1), // Output Partitioning - ExecutionMode::Bounded, // Execution Mode - ); - - self + fn create_cache(input: &Arc) -> PlanPropertiesCache { + PlanPropertiesCache::new( + input.equivalence_properties().clone(), // Equivalence Properties + Partitioning::UnknownPartitioning(1), // Output Partitioning + ExecutionMode::Bounded, // Execution Mode + ) } } @@ -276,14 +273,13 @@ pub struct LocalLimitExec { impl LocalLimitExec { /// Create a new LocalLimitExec partition pub fn new(input: Arc, fetch: usize) -> Self { - let cache = PlanPropertiesCache::new_default(input.schema()); + let cache = Self::create_cache(&input); Self { input, fetch, metrics: ExecutionPlanMetricsSet::new(), cache, } - .with_cache() } /// Input execution plan @@ -296,14 +292,12 @@ impl LocalLimitExec { self.fetch } - fn with_cache(mut self) -> Self { - self.cache = PlanPropertiesCache::new( - self.input.equivalence_properties().clone(), // Equivalence Properties - self.input.output_partitioning().clone(), // Output Partitioning - ExecutionMode::Bounded, // Execution Mode - ); - - self + fn create_cache(input: &Arc) -> PlanPropertiesCache { + PlanPropertiesCache::new( + input.equivalence_properties().clone(), // Equivalence Properties + input.output_partitioning().clone(), // Output Partitioning + 
ExecutionMode::Bounded, // Execution Mode
+ )
 }
 }

diff --git a/datafusion/physical-plan/src/memory.rs b/datafusion/physical-plan/src/memory.rs
index 206d22e72226..f6039ee8b3ed 100644
--- a/datafusion/physical-plan/src/memory.rs
+++ b/datafusion/physical-plan/src/memory.rs
@@ -153,7 +153,7 @@ impl MemoryExec {
 projection: Option>,
 ) -> Result {
 let projected_schema = project_schema(&schema, projection.as_ref())?;
- let cache = PlanPropertiesCache::new_default(projected_schema.clone());
+ let cache = Self::create_cache(projected_schema.clone(), &[], partitions);
 Ok(Self {
 partitions: partitions.to_vec(),
 schema,
@@ -161,8 +161,7 @@ impl MemoryExec {
 projection,
 sort_information: vec![],
 cache,
- }
- .with_cache())
+ })
 }

 pub fn partitions(&self) -> &[Vec] {
@@ -193,27 +192,29 @@ impl MemoryExec {
 self.sort_information = sort_information;

 // We need to update equivalence properties when updating sort information.
- let eq_properties = self.equivalent_properties_helper();
+ let eq_properties = EquivalenceProperties::new_with_orderings(
+ self.schema(),
+ &self.sort_information,
+ );
 self.cache = self.cache.with_eq_properties(eq_properties);
- self.with_cache()
+ self
 }

 pub fn original_schema(&self) -> SchemaRef {
 self.schema.clone()
 }

- fn equivalent_properties_helper(&self) -> EquivalenceProperties {
- EquivalenceProperties::new_with_orderings(self.schema(), &self.sort_information)
- }
-
- fn with_cache(mut self) -> Self {
- self.cache = PlanPropertiesCache::new(
- self.equivalent_properties_helper(), // Equivalence Properties
- Partitioning::UnknownPartitioning(self.partitions.len()), // Output Partitioning
- ExecutionMode::Bounded, // Execution Mode
- );
-
- self
+ fn create_cache(
+ schema: SchemaRef,
+ orderings: &[LexOrdering],
+ partitions: &[Vec],
+ ) -> PlanPropertiesCache {
+ let eq_properties = EquivalenceProperties::new_with_orderings(schema, orderings);
+ PlanPropertiesCache::new(
+ eq_properties, // Equivalence Properties
+ Partitioning::UnknownPartitioning(partitions.len()), // Output Partitioning
+ ExecutionMode::Bounded, // Execution Mode
+ )
 }
 }

diff --git a/datafusion/physical-plan/src/placeholder_row.rs b/datafusion/physical-plan/src/placeholder_row.rs
index a9b27cb13fe1..9d4204ddb589 100644
--- a/datafusion/physical-plan/src/placeholder_row.rs
+++ b/datafusion/physical-plan/src/placeholder_row.rs
@@ -32,6 +32,7 @@ use arrow::record_batch::RecordBatch;
 use arrow_array::RecordBatchOptions;
 use datafusion_common::{internal_err, DataFusionError, Result};
 use datafusion_execution::TaskContext;
+use datafusion_physical_expr::EquivalenceProperties;

 use log::trace;

@@ -48,20 +49,20 @@ pub struct PlaceholderRowExec {
 impl PlaceholderRowExec {
 /// Create a new PlaceholderRowExec
 pub fn new(schema: SchemaRef) -> Self {
- let cache = PlanPropertiesCache::new_default(schema.clone());
+ let partitions = 1;
+ let cache = Self::create_cache(schema.clone(), partitions);
 PlaceholderRowExec {
 schema,
- partitions: 1,
+ partitions,
 cache,
 }
- .with_cache()
 }

 /// Create a new PlaceholderRowExec with specified partition number
 pub fn with_partitions(mut self, partitions: usize) -> Self {
 self.partitions = partitions;
 // Update output partitioning when updating partitions:
- let output_partitioning = self.output_partitioning_helper();
+ let output_partitioning = Self::output_partitioning_helper(self.partitions);
 self.cache = self.cache.with_partitioning(output_partitioning);
 self
 }
@@ -89,20 +90,20 @@ impl PlaceholderRowExec {
 })
 }

- fn output_partitioning_helper(&self) ->
Partitioning { - Partitioning::UnknownPartitioning(self.partitions) + fn output_partitioning_helper(n_partitions: usize) -> Partitioning { + Partitioning::UnknownPartitioning(n_partitions) } - fn with_cache(mut self) -> Self { + fn create_cache(schema: SchemaRef, n_partitions: usize) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); // Get output partitioning: - let output_partitioning = self.output_partitioning_helper(); + let output_partitioning = Self::output_partitioning_helper(n_partitions); - self.cache = self - .cache - .with_partitioning(output_partitioning) - .with_exec_mode(ExecutionMode::Bounded); - - self + PlanPropertiesCache::new( + eq_properties, + output_partitioning, + ExecutionMode::Bounded, + ) } } diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs index 4c17aa3d834a..7420cf58b5ce 100644 --- a/datafusion/physical-plan/src/projection.rs +++ b/datafusion/physical-plan/src/projection.rs @@ -56,9 +56,6 @@ pub struct ProjectionExec { schema: SchemaRef, /// The input plan input: Arc, - /// The mapping used to normalize expressions like Partitioning and - /// PhysicalSortExpr that maps input to output - projection_mapping: ProjectionMapping, /// Execution metrics metrics: ExecutionPlanMetricsSet, /// Cache holding plan properties like equivalences, output partitioning etc. @@ -96,16 +93,14 @@ impl ProjectionExec { // construct a map from the input expressions to the output expression of the Projection let projection_mapping = ProjectionMapping::try_new(&expr, &input_schema)?; - let cache = PlanPropertiesCache::new_default(schema.clone()); - let projection = Self { + let cache = Self::create_cache(&input, &projection_mapping, schema.clone())?; + Ok(Self { expr, schema, input, - projection_mapping, metrics: ExecutionPlanMetricsSet::new(), cache, - }; - projection.with_cache() + }) } /// The projection expressions stored as tuples of (expression, output column name) @@ -118,13 +113,15 @@ impl ProjectionExec { &self.input } - fn with_cache(mut self) -> Result { - let input = &self.input; + fn create_cache( + input: &Arc, + projection_mapping: &ProjectionMapping, + schema: SchemaRef, + ) -> Result { // Calculate equivalence properties: let mut input_eq_properties = input.equivalence_properties().clone(); - input_eq_properties.substitute_oeq_class(&self.projection_mapping)?; - let eq_properties = - input_eq_properties.project(&self.projection_mapping, self.schema.clone()); + input_eq_properties.substitute_oeq_class(projection_mapping)?; + let eq_properties = input_eq_properties.project(projection_mapping, schema); // Calculate output partitioning, which needs to respect aliases: let input_partition = input.output_partitioning(); @@ -134,7 +131,7 @@ impl ProjectionExec { .iter() .map(|expr| { input_eq_properties - .project_expr(expr, &self.projection_mapping) + .project_expr(expr, projection_mapping) .unwrap_or_else(|| { Arc::new(UnKnownColumn::new(&expr.to_string())) }) @@ -145,13 +142,11 @@ impl ProjectionExec { input_partition.clone() }; - self.cache = PlanPropertiesCache::new( + Ok(PlanPropertiesCache::new( eq_properties, output_partitioning, input.execution_mode(), - ); - - Ok(self) + )) } } diff --git a/datafusion/physical-plan/src/recursive_query.rs b/datafusion/physical-plan/src/recursive_query.rs index 97a626c25116..adc675ba2730 100644 --- a/datafusion/physical-plan/src/recursive_query.rs +++ b/datafusion/physical-plan/src/recursive_query.rs @@ -33,7 +33,8 @@ use 
arrow::record_batch::RecordBatch; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{not_impl_err, DataFusionError, Result}; use datafusion_execution::TaskContext; -use datafusion_physical_expr::Partitioning; +use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; + use futures::{ready, Stream, StreamExt}; /// Recursive query execution plan. @@ -81,7 +82,7 @@ impl RecursiveQueryExec { let work_table = Arc::new(WorkTable::new()); // Use the same work table for both the WorkTableExec and the recursive term let recursive_term = assign_work_table(recursive_term, work_table.clone())?; - let cache = PlanPropertiesCache::new_default(static_term.schema()); + let cache = Self::create_cache(static_term.schema()); Ok(RecursiveQueryExec { name, static_term, @@ -90,17 +91,17 @@ impl RecursiveQueryExec { work_table, metrics: ExecutionPlanMetricsSet::new(), cache, - } - .with_cache()) + }) } - fn with_cache(mut self) -> Self { - self.cache = self - .cache - .with_partitioning(Partitioning::UnknownPartitioning(1)) - .with_exec_mode(ExecutionMode::Bounded); + fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); - self + PlanPropertiesCache::new( + eq_properties, + Partitioning::UnknownPartitioning(1), + ExecutionMode::Bounded, + ) } } diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 6d2835df05f0..dc1e88f52e56 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -44,7 +44,7 @@ use arrow::record_batch::RecordBatch; use datafusion_common::{arrow_datafusion_err, not_impl_err, DataFusionError, Result}; use datafusion_execution::memory_pool::MemoryConsumer; use datafusion_execution::TaskContext; -use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; +use datafusion_physical_expr::{EquivalenceProperties, PhysicalExpr, PhysicalSortExpr}; use futures::stream::Stream; use futures::{FutureExt, StreamExt}; @@ -436,12 +436,7 @@ impl ExecutionPlan for RepartitionExec { } fn maintains_input_order(&self) -> Vec { - if self.preserve_order { - vec![true] - } else { - // We preserve ordering when input partitioning is 1 - vec![self.input().output_partitioning().partition_count() <= 1] - } + Self::maintains_input_order_helper(self.input(), self.preserve_order) } fn execute( @@ -602,7 +597,8 @@ impl RepartitionExec { input: Arc, partitioning: Partitioning, ) -> Result { - let cache = PlanPropertiesCache::new_default(input.schema()); + let preserve_order = false; + let cache = Self::create_cache(&input, partitioning.clone(), preserve_order); Ok(RepartitionExec { input, partitioning, @@ -611,27 +607,49 @@ impl RepartitionExec { abort_helper: Arc::new(AbortOnDropMany::<()>(vec![])), })), metrics: ExecutionPlanMetricsSet::new(), - preserve_order: false, + preserve_order, cache, + }) + } + + fn maintains_input_order_helper( + input: &Arc, + preserve_order: bool, + ) -> Vec { + if preserve_order { + vec![true] + } else { + // We preserve ordering when input partitioning is 1 + vec![input.output_partitioning().partition_count() <= 1] } - .with_cache()) } - fn with_cache(mut self) -> Self { + fn eq_properties_helper( + input: &Arc, + preserve_order: bool, + ) -> EquivalenceProperties { // Equivalence Properties - let mut eq_properties = self.input.equivalence_properties().clone(); + let mut eq_properties = input.equivalence_properties().clone(); // If the ordering is lost, reset the 
ordering equivalence class:
- if !self.maintains_input_order()[0] {
+ if !Self::maintains_input_order_helper(input, preserve_order)[0] {
 eq_properties.clear_orderings();
 }
+ eq_properties
+ }

- self.cache = PlanPropertiesCache::new(
- eq_properties, // Equivalence Properties
- self.partitioning.clone(), // Output Partitioning
- self.input.execution_mode(), // Execution Mode
- );
+ fn create_cache(
+ input: &Arc,
+ partitioning: Partitioning,
+ preserve_order: bool,
+ ) -> PlanPropertiesCache {
+ // Equivalence Properties
+ let eq_properties = Self::eq_properties_helper(input, preserve_order);

- self
+ PlanPropertiesCache::new(
+ eq_properties, // Equivalence Properties
+ partitioning, // Output Partitioning
+ input.execution_mode(), // Execution Mode
+ )
 }

 /// Specify if this repartitioning operation should preserve the order of
@@ -648,7 +666,9 @@ impl RepartitionExec {
 // if there is only one input partition, merging is not required
 // to maintain order
 self.input.output_partitioning().partition_count() > 1;
- self.with_cache()
+ let eq_properties = Self::eq_properties_helper(&self.input, self.preserve_order);
+ self.cache = self.cache.with_eq_properties(eq_properties);
+ self
 }

 /// Return the sort expressions that are used to merge
diff --git a/datafusion/physical-plan/src/sorts/partial_sort.rs b/datafusion/physical-plan/src/sorts/partial_sort.rs
index 99ead9f904a1..16c4bc8601b6 100644
--- a/datafusion/physical-plan/src/sorts/partial_sort.rs
+++ b/datafusion/physical-plan/src/sorts/partial_sort.rs
@@ -71,6 +71,7 @@ use arrow::record_batch::RecordBatch;
 use datafusion_common::utils::evaluate_partition_ranges;
 use datafusion_common::Result;
 use datafusion_execution::{RecordBatchStream, TaskContext};
+use datafusion_physical_expr::LexOrdering;

 use futures::{ready, Stream, StreamExt};
 use log::trace;

@@ -104,17 +105,17 @@ impl PartialSortExec {
 common_prefix_length: usize,
 ) -> Self {
 assert!(common_prefix_length > 0);
- let cache = PlanPropertiesCache::new_default(input.schema());
+ let preserve_partitioning = false;
+ let cache = Self::create_cache(&input, expr.clone(), preserve_partitioning);
 Self {
 input,
 expr,
 common_prefix_length,
 metrics_set: ExecutionPlanMetricsSet::new(),
- preserve_partitioning: false,
+ preserve_partitioning,
 fetch: None,
 cache,
 }
- .with_cache()
 }

 /// Whether this `PartialSortExec` preserves partitioning of the children
@@ -131,6 +132,12 @@ impl PartialSortExec {
 /// input partitions producing a single, sorted partition.
 pub fn with_preserve_partitioning(mut self, preserve_partitioning: bool) -> Self {
 self.preserve_partitioning = preserve_partitioning;
+ self.cache = self
+ .cache
+ .with_partitioning(Self::output_partitioning_helper(
+ &self.input,
+ self.preserve_partitioning,
+ ));
 self
 }

@@ -161,27 +168,38 @@ impl PartialSortExec {
 self.fetch
 }

- fn with_cache(mut self) -> Self {
+ fn output_partitioning_helper(
+ input: &Arc,
+ preserve_partitioning: bool,
+ ) -> Partitioning {
+ // Get output partitioning:
+ if preserve_partitioning {
+ input.output_partitioning().clone()
+ } else {
+ Partitioning::UnknownPartitioning(1)
+ }
+ }
+
+ fn create_cache(
+ input: &Arc,
+ sort_exprs: LexOrdering,
+ preserve_partitioning: bool,
+ ) -> PlanPropertiesCache {
 // Calculate equivalence properties; i.e.
reset the ordering equivalence // class with the new ordering: - let eq_properties = self - .input + let eq_properties = input .equivalence_properties() .clone() - .with_reorder(self.expr.to_vec()); + .with_reorder(sort_exprs); // Get output partitioning: - let output_partitioning = if self.preserve_partitioning { - self.input.output_partitioning().clone() - } else { - Partitioning::UnknownPartitioning(1) - }; + let output_partitioning = + Self::output_partitioning_helper(input, preserve_partitioning); // Determine execution mode: - let mode = self.input.execution_mode(); + let mode = input.execution_mode(); - self.cache = PlanPropertiesCache::new(eq_properties, output_partitioning, mode); - self + PlanPropertiesCache::new(eq_properties, output_partitioning, mode) } } diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 55a3c9f068f1..a74705dd32ab 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -52,6 +52,7 @@ use datafusion_execution::memory_pool::{ }; use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_execution::TaskContext; +use datafusion_physical_expr::LexOrdering; use futures::{StreamExt, TryStreamExt}; use log::{debug, error, trace}; @@ -694,16 +695,16 @@ impl SortExec { /// Create a new sort execution plan that produces a single, /// sorted output partition. pub fn new(expr: Vec, input: Arc) -> Self { - let cache = PlanPropertiesCache::new_default(input.schema()); + let preserve_partitioning = false; + let cache = Self::create_cache(&input, expr.clone(), preserve_partitioning); Self { expr, input, metrics_set: ExecutionPlanMetricsSet::new(), - preserve_partitioning: false, + preserve_partitioning, fetch: None, cache, } - .with_cache() } /// Create a new sort execution plan with the option to preserve @@ -737,7 +738,13 @@ impl SortExec { /// input partitions producing a single, sorted partition. pub fn with_preserve_partitioning(mut self, preserve_partitioning: bool) -> Self { self.preserve_partitioning = preserve_partitioning; - self.with_cache() + self.cache = self + .cache + .with_partitioning(Self::output_partitioning_helper( + &self.input, + self.preserve_partitioning, + )); + self } /// Modify how many rows to include in the result @@ -767,33 +774,43 @@ impl SortExec { self.fetch } - fn with_cache(mut self) -> Self { + fn output_partitioning_helper( + input: &Arc, + preserve_partitioning: bool, + ) -> Partitioning { + // Get output partitioning: + if preserve_partitioning { + input.output_partitioning().clone() + } else { + Partitioning::UnknownPartitioning(1) + } + } + + fn create_cache( + input: &Arc, + sort_exprs: LexOrdering, + preserve_partitioning: bool, + ) -> PlanPropertiesCache { // Calculate equivalence properties; i.e. 
reset the ordering equivalence // class with the new ordering: - let eq_properties = self - .input + let eq_properties = input .equivalence_properties() .clone() - .with_reorder(self.expr.to_vec()); + .with_reorder(sort_exprs); // Get output partitioning: - let output_partitioning = if self.preserve_partitioning { - self.input.output_partitioning().clone() - } else { - Partitioning::UnknownPartitioning(1) - }; + let output_partitioning = + Self::output_partitioning_helper(input, preserve_partitioning); // Determine execution mode: - let mode = match self.input.execution_mode() { + let mode = match input.execution_mode() { ExecutionMode::Unbounded | ExecutionMode::PipelineBreaking => { ExecutionMode::PipelineBreaking } ExecutionMode::Bounded => ExecutionMode::Bounded, }; - self.cache = PlanPropertiesCache::new(eq_properties, output_partitioning, mode); - - self + PlanPropertiesCache::new(eq_properties, output_partitioning, mode) } } diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index eadd2d0711fe..c07ae72d5492 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -80,7 +80,7 @@ pub struct SortPreservingMergeExec { impl SortPreservingMergeExec { /// Create a new sort execution plan pub fn new(expr: Vec, input: Arc) -> Self { - let cache = PlanPropertiesCache::new_default(input.schema()); + let cache = Self::create_cache(&input); Self { input, expr, @@ -88,7 +88,6 @@ impl SortPreservingMergeExec { fetch: None, cache, } - .with_cache() } /// Sets the number of rows to fetch pub fn with_fetch(mut self, fetch: Option) -> Self { @@ -111,14 +110,12 @@ impl SortPreservingMergeExec { self.fetch } - fn with_cache(mut self) -> Self { - self.cache = PlanPropertiesCache::new( - self.input.equivalence_properties().clone(), // Equivalence Properties - Partitioning::UnknownPartitioning(1), // Output Partitioning - self.input.execution_mode(), // Execution Mode - ); - - self + fn create_cache(input: &Arc) -> PlanPropertiesCache { + PlanPropertiesCache::new( + input.equivalence_properties().clone(), // Equivalence Properties + Partitioning::UnknownPartitioning(1), // Output Partitioning + input.execution_mode(), // Execution Mode + ) } } diff --git a/datafusion/physical-plan/src/streaming.rs b/datafusion/physical-plan/src/streaming.rs index 60b372446805..e95fd37ab1b2 100644 --- a/datafusion/physical-plan/src/streaming.rs +++ b/datafusion/physical-plan/src/streaming.rs @@ -85,16 +85,22 @@ impl StreamingTableExec { Some(p) => Arc::new(schema.project(p)?), None => schema, }; - let cache = PlanPropertiesCache::new_default(projected_schema.clone()); + let projected_output_ordering = + projected_output_ordering.into_iter().collect::>(); + let cache = Self::create_cache( + projected_schema.clone(), + &projected_output_ordering, + &partitions, + infinite, + ); Ok(Self { partitions, projected_schema, projection: projection.cloned().map(Into::into), - projected_output_ordering: projected_output_ordering.into_iter().collect(), + projected_output_ordering, infinite, cache, - } - .with_cache()) + }) } pub fn partitions(&self) -> &Vec> { @@ -121,26 +127,26 @@ impl StreamingTableExec { self.infinite } - fn with_cache(mut self) -> Self { + fn create_cache( + schema: SchemaRef, + orderings: &[LexOrdering], + partitions: &[Arc], + is_infinite: bool, + ) -> PlanPropertiesCache { // Calculate equivalence properties: - let eq_properties = 
EquivalenceProperties::new_with_orderings( - self.schema(), - &self.projected_output_ordering, - ); + let eq_properties = EquivalenceProperties::new_with_orderings(schema, orderings); // Get output partitioning: - let output_partitioning = - Partitioning::UnknownPartitioning(self.partitions.len()); + let output_partitioning = Partitioning::UnknownPartitioning(partitions.len()); // Determine execution mode: - let mode = if self.infinite { + let mode = if is_infinite { ExecutionMode::Unbounded } else { ExecutionMode::Bounded }; - self.cache = PlanPropertiesCache::new(eq_properties, output_partitioning, mode); - self + PlanPropertiesCache::new(eq_properties, output_partitioning, mode) } } diff --git a/datafusion/physical-plan/src/test/exec.rs b/datafusion/physical-plan/src/test/exec.rs index 77ff8d27157a..a677907295a7 100644 --- a/datafusion/physical-plan/src/test/exec.rs +++ b/datafusion/physical-plan/src/test/exec.rs @@ -34,6 +34,7 @@ use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion_common::{internal_err, DataFusionError, Result}; use datafusion_execution::TaskContext; +use datafusion_physical_expr::EquivalenceProperties; use futures::Stream; use tokio::sync::Barrier; @@ -132,14 +133,13 @@ impl MockExec { /// ensure any poll loops are correct. This behavior can be /// changed with `with_use_task` pub fn new(data: Vec>, schema: SchemaRef) -> Self { - let cache = PlanPropertiesCache::new_default(schema.clone()); + let cache = Self::create_cache(schema.clone()); Self { data, schema, use_task: true, cache, } - .with_cache() } /// If `use_task` is true (the default) then the batches are sent @@ -150,15 +150,14 @@ impl MockExec { self } - fn with_cache(mut self) -> Self { - self.cache = self - .cache - // Output Partitioning - .with_partitioning(Partitioning::UnknownPartitioning(1)) - // Execution Mode - .with_exec_mode(ExecutionMode::Bounded); + fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); - self + PlanPropertiesCache::new( + eq_properties, + Partitioning::UnknownPartitioning(1), + ExecutionMode::Bounded, + ) } } @@ -290,14 +289,13 @@ impl BarrierExec { pub fn new(data: Vec>, schema: SchemaRef) -> Self { // wait for all streams and the input let barrier = Arc::new(Barrier::new(data.len() + 1)); - let cache = PlanPropertiesCache::new_default(schema.clone()); + let cache = Self::create_cache(schema.clone(), &data); Self { data, schema, barrier, cache, } - .with_cache() } /// wait until all the input streams and this function is ready @@ -307,15 +305,13 @@ impl BarrierExec { println!("BarrierExec::wait done waiting"); } - fn with_cache(mut self) -> Self { - self.cache = self - .cache - // Output Partitioning - .with_partitioning(Partitioning::UnknownPartitioning(self.data.len())) - // Execution Mode - .with_exec_mode(ExecutionMode::Bounded); - - self + fn create_cache(schema: SchemaRef, data: &[Vec]) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); + PlanPropertiesCache::new( + eq_properties, + Partitioning::UnknownPartitioning(data.len()), + ExecutionMode::Bounded, + ) } } @@ -412,19 +408,18 @@ impl ErrorExec { DataType::Int64, true, )])); - let cache = PlanPropertiesCache::new_default(schema.clone()); - Self { cache }.with_cache() + let cache = Self::create_cache(schema.clone()); + Self { cache } } - fn with_cache(mut self) -> Self { - self.cache = self - .cache - // Output Partitioning - 
.with_partitioning(Partitioning::UnknownPartitioning(1))
- // Execution Mode
- .with_exec_mode(ExecutionMode::Bounded);
+ fn create_cache(schema: SchemaRef) -> PlanPropertiesCache {
+ let eq_properties = EquivalenceProperties::new(schema);

- self
+ PlanPropertiesCache::new(
+ eq_properties,
+ Partitioning::UnknownPartitioning(1),
+ ExecutionMode::Bounded,
+ )
 }
 }

@@ -486,24 +481,22 @@ impl StatisticsExec {
 .column_statistics.len(), schema.fields().len(),
 "if defined, the column statistics vector length should be the number of fields"
 );
- let cache = PlanPropertiesCache::new_default(Arc::new(schema.clone()));
+ let cache = Self::create_cache(Arc::new(schema.clone()));
 Self {
 stats,
 schema: Arc::new(schema),
 cache,
 }
- .with_cache()
 }

- fn with_cache(mut self) -> Self {
- self.cache = self
- .cache
- // Output Partitioning
- .with_partitioning(Partitioning::UnknownPartitioning(2))
- // Execution Mode
- .with_exec_mode(ExecutionMode::Bounded);
+ fn create_cache(schema: SchemaRef) -> PlanPropertiesCache {
+ let eq_properties = EquivalenceProperties::new(schema);

- self
+ PlanPropertiesCache::new(
+ eq_properties,
+ Partitioning::UnknownPartitioning(2),
+ ExecutionMode::Bounded,
+ )
 }
 }

@@ -567,9 +560,6 @@ pub struct BlockingExec {
 /// Schema that is mocked by this plan.
 schema: SchemaRef,

- /// Number of output partitions.
- n_partitions: usize,
-
 /// Ref-counting helper to check if the plan and the produced stream are still in memory.
 refs: Arc<()>,
 cache: PlanPropertiesCache,
 }

impl BlockingExec {
 /// Create new [`BlockingExec`] with a given schema and number of partitions.
 pub fn new(schema: SchemaRef, n_partitions: usize) -> Self {
- let cache = PlanPropertiesCache::new_default(schema.clone());
+ let cache = Self::create_cache(schema.clone(), n_partitions);
 Self {
 schema,
- n_partitions,
 refs: Default::default(),
 cache,
 }
- .with_cache()
 }

 /// Weak pointer that can be used for ref-counting this execution plan and its streams.
@@ -597,15 +585,14 @@ impl BlockingExec {
 Arc::downgrade(&self.refs)
 }

- fn with_cache(mut self) -> Self {
- self.cache = self
- .cache
- // Output Partitioning
- .with_partitioning(Partitioning::UnknownPartitioning(self.n_partitions))
- // Execution Mode
- .with_exec_mode(ExecutionMode::Bounded);
+ fn create_cache(schema: SchemaRef, n_partitions: usize) -> PlanPropertiesCache {
+ let eq_properties = EquivalenceProperties::new(schema);

- self
+ PlanPropertiesCache::new(
+ eq_properties,
+ Partitioning::UnknownPartitioning(n_partitions),
+ ExecutionMode::Bounded,
+ )
 }
 }

@@ -719,13 +706,13 @@ impl PanicExec {
 /// Create new [`PanicExec`] with a given schema and number of
 /// partitions, which will each panic immediately.
pub fn new(schema: SchemaRef, n_partitions: usize) -> Self { - let cache = PlanPropertiesCache::new_default(schema.clone()); + let batches_until_panics = vec![0; n_partitions]; + let cache = Self::create_cache(schema.clone(), &batches_until_panics); Self { schema, - batches_until_panics: vec![0; n_partitions], + batches_until_panics, cache, } - .with_cache() } /// Set the number of batches prior to panic for a partition @@ -734,17 +721,18 @@ impl PanicExec { self } - fn with_cache(mut self) -> Self { - let num_partitions = self.batches_until_panics.len(); + fn create_cache( + schema: SchemaRef, + batches_until_panics: &[usize], + ) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); + let num_partitions = batches_until_panics.len(); - self.cache = self - .cache - // Output Partitioning - .with_partitioning(Partitioning::UnknownPartitioning(num_partitions)) - // Execution Mode - .with_exec_mode(ExecutionMode::Bounded); - - self + PlanPropertiesCache::new( + eq_properties, + Partitioning::UnknownPartitioning(num_partitions), + ExecutionMode::Bounded, + ) } } diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index 42e5ce58edb0..06a870123255 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -98,13 +98,12 @@ impl UnionExec { /// Create a new UnionExec pub fn new(inputs: Vec>) -> Self { let schema = union_schema(&inputs); - let cache = PlanPropertiesCache::new_default(schema); + let cache = Self::create_cache(&inputs, schema); UnionExec { inputs, metrics: ExecutionPlanMetricsSet::new(), cache, } - .with_cache() } /// Get inputs of the execution plan @@ -112,16 +111,18 @@ impl UnionExec { &self.inputs } - fn with_cache(mut self) -> Self { + fn create_cache( + inputs: &[Arc], + schema: SchemaRef, + ) -> PlanPropertiesCache { // Calculate equivalence properties: // TODO: In some cases, we should be able to preserve some equivalence // classes and constants. Add support for such cases. - let children_eqs = self - .inputs + let children_eqs = inputs .iter() .map(|child| child.equivalence_properties()) .collect::>(); - let mut eq_properties = EquivalenceProperties::new(self.schema()); + let mut eq_properties = EquivalenceProperties::new(schema); // Use the ordering equivalence class of the first child as the seed: let mut meets = children_eqs[0] .oeq_class() @@ -152,18 +153,16 @@ impl UnionExec { eq_properties.add_new_orderings(meets); // Calculate output partitioning; i.e. sum output partitions of the inputs. 
- let num_partitions = self - .inputs + let num_partitions = inputs .iter() .map(|plan| plan.output_partitioning().partition_count()) .sum(); let output_partitioning = Partitioning::UnknownPartitioning(num_partitions); // Determine execution mode: - let mode = exec_mode_flatten(self.inputs.iter()); + let mode = exec_mode_flatten(inputs.iter()); - self.cache = PlanPropertiesCache::new(eq_properties, output_partitioning, mode); - self + PlanPropertiesCache::new(eq_properties, output_partitioning, mode) } } @@ -323,20 +322,17 @@ pub struct InterleaveExec { impl InterleaveExec { /// Create a new InterleaveExec pub fn try_new(inputs: Vec>) -> Result { - let schema = union_schema(&inputs); - if !can_interleave(inputs.iter()) { return internal_err!( "Not all InterleaveExec children have a consistent hash partitioning" ); } - let cache = PlanPropertiesCache::new_default(schema); + let cache = Self::create_cache(&inputs); Ok(InterleaveExec { inputs, metrics: ExecutionPlanMetricsSet::new(), cache, - } - .with_cache()) + }) } /// Get inputs of the execution plan @@ -344,18 +340,15 @@ impl InterleaveExec { &self.inputs } - fn with_cache(mut self) -> Self { + fn create_cache(inputs: &[Arc]) -> PlanPropertiesCache { + let schema = union_schema(inputs); + let eq_properties = EquivalenceProperties::new(schema); // Get output partitioning: - let output_partitioning = self.inputs[0].output_partitioning().clone(); + let output_partitioning = inputs[0].output_partitioning().clone(); // Determine execution mode: - let mode = exec_mode_flatten(self.inputs.iter()); + let mode = exec_mode_flatten(inputs.iter()); - self.cache = self - .cache - .with_partitioning(output_partitioning) - .with_exec_mode(mode); - - self + PlanPropertiesCache::new(eq_properties, output_partitioning, mode) } } diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index b9df57d84f81..ba90e8b4f1fc 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -37,6 +37,7 @@ use arrow::datatypes::{ use arrow::record_batch::RecordBatch; use datafusion_common::{exec_err, DataFusionError, Result, UnnestOptions}; use datafusion_execution::TaskContext; +use datafusion_physical_expr::EquivalenceProperties; use async_trait::async_trait; use futures::{Stream, StreamExt}; @@ -70,7 +71,7 @@ impl UnnestExec { schema: SchemaRef, options: UnnestOptions, ) -> Self { - let cache = PlanPropertiesCache::new_default(schema.clone()); + let cache = Self::create_cache(&input, schema.clone()); UnnestExec { input, schema, @@ -79,18 +80,19 @@ impl UnnestExec { metrics: Default::default(), cache, } - .with_cache() } - fn with_cache(mut self) -> Self { - self.cache = self - .cache - // Output Partitioning - .with_partitioning(self.input.output_partitioning().clone()) - // Execution Mode - .with_exec_mode(self.input.execution_mode()); - - self + fn create_cache( + input: &Arc, + schema: SchemaRef, + ) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); + + PlanPropertiesCache::new( + eq_properties, + input.output_partitioning().clone(), + input.execution_mode(), + ) } } diff --git a/datafusion/physical-plan/src/values.rs b/datafusion/physical-plan/src/values.rs index 7fc242099379..20c8eddce6bd 100644 --- a/datafusion/physical-plan/src/values.rs +++ b/datafusion/physical-plan/src/values.rs @@ -33,6 +33,7 @@ use arrow::datatypes::{Schema, SchemaRef}; use arrow::record_batch::{RecordBatch, RecordBatchOptions}; use datafusion_common::{internal_err, plan_err, 
DataFusionError, Result, ScalarValue}; use datafusion_execution::TaskContext; +use datafusion_physical_expr::EquivalenceProperties; /// Execution plan for values list based relation (produces constant rows) #[derive(Debug)] @@ -113,13 +114,12 @@ impl ValuesExec { } } - let cache = PlanPropertiesCache::new_default(schema.clone()); + let cache = Self::create_cache(schema.clone()); Ok(ValuesExec { schema, data: batches, cache, - } - .with_cache()) + }) } /// provides the data @@ -127,13 +127,14 @@ impl ValuesExec { self.data.clone() } - fn with_cache(mut self) -> Self { - self.cache = self - .cache - .with_partitioning(Partitioning::UnknownPartitioning(1)) - .with_exec_mode(ExecutionMode::Bounded); + fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + let eq_properties = EquivalenceProperties::new(schema); - self + PlanPropertiesCache::new( + eq_properties, + Partitioning::UnknownPartitioning(1), + ExecutionMode::Bounded, + ) } } diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index cb512302cb6f..a9dfc9bfeedd 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -121,8 +121,8 @@ impl BoundedWindowAggExec { vec![] } }; - let cache = PlanPropertiesCache::new_default(schema.clone()); - let window = Self { + let cache = Self::create_cache(&input, &schema, &window_expr); + Ok(Self { input, window_expr, schema, @@ -131,8 +131,7 @@ impl BoundedWindowAggExec { input_order_mode, ordered_partition_by_indices, cache, - }; - Ok(window.with_cache()) + }) } /// Window expressions @@ -183,23 +182,25 @@ impl BoundedWindowAggExec { }) } - fn with_cache(mut self) -> Self { + fn create_cache( + input: &Arc, + schema: &SchemaRef, + window_expr: &[Arc], + ) -> PlanPropertiesCache { // Calculate equivalence properties: - let eq_properties = - window_equivalence_properties(&self.schema, &self.input, &self.window_expr); + let eq_properties = window_equivalence_properties(schema, input, window_expr); // As we can have repartitioning using the partition keys, this can // be either one or more than one, depending on the presence of // repartitioning. 
- let output_partitioning = self.input.output_partitioning().clone();
+ let output_partitioning = input.output_partitioning().clone();

 // Construct properties cache
- self.cache = PlanPropertiesCache::new(
- eq_properties, // Equivalence Properties
- output_partitioning, // Output Partitioning
- self.input.execution_mode(), // Execution Mode
- );
- self
+ PlanPropertiesCache::new(
+ eq_properties, // Equivalence Properties
+ output_partitioning, // Output Partitioning
+ input.execution_mode(), // Execution Mode
+ )
 }
 }

diff --git a/datafusion/physical-plan/src/windows/window_agg_exec.rs b/datafusion/physical-plan/src/windows/window_agg_exec.rs
index 247588c971a2..852698bafe3a 100644
--- a/datafusion/physical-plan/src/windows/window_agg_exec.rs
+++ b/datafusion/physical-plan/src/windows/window_agg_exec.rs
@@ -80,8 +80,8 @@ impl WindowAggExec {
 let ordered_partition_by_indices =
 get_ordered_partition_by_indices(window_expr[0].partition_by(), &input);
- let cache = PlanPropertiesCache::new_default(schema.clone());
- let window = Self {
+ let cache = Self::create_cache(schema.clone(), &input, &window_expr);
+ Ok(Self {
 input,
 window_expr,
 schema,
@@ -89,8 +89,7 @@ impl WindowAggExec {
 metrics: ExecutionPlanMetricsSet::new(),
 ordered_partition_by_indices,
 cache,
- };
- Ok(window.with_cache())
+ })
 }

 /// Window expressions
@@ -117,18 +116,21 @@ impl WindowAggExec {
 )
 }

- fn with_cache(mut self) -> Self {
+ fn create_cache(
+ schema: SchemaRef,
+ input: &Arc,
+ window_expr: &[Arc],
+ ) -> PlanPropertiesCache {
 // Calculate equivalence properties:
- let eq_properties =
- window_equivalence_properties(&self.schema, &self.input, &self.window_expr);
+ let eq_properties = window_equivalence_properties(&schema, input, window_expr);

 // Get output partitioning:
 // Because we can have repartitioning using the partition keys this
 // would be either 1 or more than 1 depending on the presence of repartitioning.
- let output_partitioning = self.input.output_partitioning().clone();
+ let output_partitioning = input.output_partitioning().clone();

 // Determine execution mode:
- let mode = match self.input.execution_mode() {
+ let mode = match input.execution_mode() {
 ExecutionMode::Bounded => ExecutionMode::Bounded,
 ExecutionMode::Unbounded | ExecutionMode::PipelineBreaking => {
 ExecutionMode::PipelineBreaking
 }
 };

 // Construct properties cache:
- self.cache = PlanPropertiesCache::new(eq_properties, output_partitioning, mode);
- self
+ PlanPropertiesCache::new(eq_properties, output_partitioning, mode)
 }
 }

diff --git a/datafusion/physical-plan/src/work_table.rs b/datafusion/physical-plan/src/work_table.rs
index 33c611dd30d8..9a0b5daf27e4 100644
--- a/datafusion/physical-plan/src/work_table.rs
+++ b/datafusion/physical-plan/src/work_table.rs
@@ -33,7 +33,7 @@ use arrow::datatypes::SchemaRef;
 use arrow::record_batch::RecordBatch;
 use datafusion_common::{internal_err, DataFusionError, Result};
 use datafusion_execution::TaskContext;
-use datafusion_physical_expr::Partitioning;
+use datafusion_physical_expr::{EquivalenceProperties, Partitioning};

 /// The name is from PostgreSQL's terminology.
 /// See
@@ -91,7 +91,7 @@ pub struct WorkTableExec {
 impl WorkTableExec {
 /// Create a new execution plan for a worktable exec.
    pub fn new(name: String, schema: SchemaRef) -> Self {
-        let cache = PlanPropertiesCache::new_default(schema.clone());
+        let cache = Self::create_cache(schema.clone());
         Self {
             name,
             schema,
@@ -99,7 +99,6 @@ impl WorkTableExec {
             work_table: Arc::new(WorkTable::new()),
             cache,
         }
-        .with_cache()
     }
 
     pub(super) fn with_work_table(&self, work_table: Arc<WorkTable>) -> Self {
@@ -112,13 +111,14 @@ impl WorkTableExec {
         }
     }
 
-    fn with_cache(mut self) -> Self {
-        self.cache = self
-            .cache
-            .with_partitioning(Partitioning::UnknownPartitioning(1))
-            .with_exec_mode(ExecutionMode::Bounded);
+    fn create_cache(schema: SchemaRef) -> PlanPropertiesCache {
+        let eq_properties = EquivalenceProperties::new(schema);
 
-        self
+        PlanPropertiesCache::new(
+            eq_properties,
+            Partitioning::UnknownPartitioning(1),
+            ExecutionMode::Bounded,
+        )
     }
 }
 

From b728232b91862826061dfa878cefe90d25576f78 Mon Sep 17 00:00:00 2001
From: Alex Huang
Date: Mon, 26 Feb 2024 21:10:13 +0800
Subject: [PATCH 16/45] feat: support `FixedSizeList` Type Coercion (#9108)

* support FixedSizeList Type Coercion

* add allow null type coercion parameter

* support null column in FixedSizeList

* Add test

* Add tests for cardinality with fixed size lists

* chore

* fix ci

* add comment

* Fix array_element function signature

* Remove unused imports and simplify code

* Fix array function signatures and behavior

* fix conflict

* fix conflict

* add tests for FixedSizeList

* remove unreachable null check

* simplify the code

* remove null checking

* reformat output

* simplify code

* add tests for array_dims

* Refactor type coercion functions in datafusion/expr module

---
 datafusion/expr/src/built_in_function.rs     |  23 +-
 datafusion/expr/src/signature.rs             |  17 +-
 .../expr/src/type_coercion/functions.rs      | 109 ++--
 datafusion/sqllogictest/test_files/array.slt | 562 +++++++++++++++++-
 4 files changed, 628 insertions(+), 83 deletions(-)

diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs
index f92ae87d6e6c..8b4e65121c79 100644
--- a/datafusion/expr/src/built_in_function.rs
+++ b/datafusion/expr/src/built_in_function.rs
@@ -31,7 +31,7 @@ use crate::{
 };
 
 use arrow::datatypes::{DataType, Field, Fields, IntervalUnit, TimeUnit};
-use datafusion_common::{internal_err, plan_err, DataFusionError, Result};
+use datafusion_common::{exec_err, plan_err, DataFusionError, Result};
 use strum::IntoEnumIterator;
 use strum_macros::EnumIter;
 
@@ -543,10 +543,11 @@ impl BuiltinScalarFunction {
             BuiltinScalarFunction::Flatten => {
                 fn get_base_type(data_type: &DataType) -> Result<DataType> {
                     match data_type {
-                        DataType::List(field) if matches!(field.data_type(), DataType::List(_)) => get_base_type(field.data_type()),
+                        DataType::List(field) | DataType::FixedSizeList(field, _) if matches!(field.data_type(), DataType::List(_)|DataType::FixedSizeList(_,_ )) => get_base_type(field.data_type()),
                         DataType::LargeList(field) if matches!(field.data_type(), DataType::LargeList(_)) => get_base_type(field.data_type()),
                         DataType::Null | DataType::List(_) | DataType::LargeList(_) => Ok(data_type.to_owned()),
-                        _ => internal_err!("Not reachable, data_type should be List or LargeList"),
+                        DataType::FixedSizeList(field,_ ) => Ok(DataType::List(field.clone())),
+                        _ => exec_err!("Not reachable, data_type should be List, LargeList or FixedSizeList"),
                     }
                 }
 
@@ -929,18 +930,18 @@ impl BuiltinScalarFunction {
                 // 0 or more arguments of arbitrary type
                 Signature::one_of(vec![VariadicEqual, Any(0)], self.volatility())
             }
-            BuiltinScalarFunction::ArrayPopFront => Signature::any(1,
self.volatility()),
-            BuiltinScalarFunction::ArrayPopBack => Signature::any(1, self.volatility()),
+            BuiltinScalarFunction::ArrayPopFront => Signature::array(self.volatility()),
+            BuiltinScalarFunction::ArrayPopBack => Signature::array(self.volatility()),
             BuiltinScalarFunction::ArrayConcat => {
                 Signature::variadic_any(self.volatility())
             }
-            BuiltinScalarFunction::ArrayDims => Signature::any(1, self.volatility()),
-            BuiltinScalarFunction::ArrayEmpty => Signature::any(1, self.volatility()),
+            BuiltinScalarFunction::ArrayDims => Signature::array(self.volatility()),
+            BuiltinScalarFunction::ArrayEmpty => Signature::array(self.volatility()),
             BuiltinScalarFunction::ArrayElement => {
                 Signature::array_and_index(self.volatility())
             }
             BuiltinScalarFunction::ArrayExcept => Signature::any(2, self.volatility()),
-            BuiltinScalarFunction::Flatten => Signature::any(1, self.volatility()),
+            BuiltinScalarFunction::Flatten => Signature::array(self.volatility()),
             BuiltinScalarFunction::ArrayHasAll | BuiltinScalarFunction::ArrayHasAny => {
                 Signature::any(2, self.volatility())
             }
@@ -950,8 +951,8 @@ impl BuiltinScalarFunction {
             BuiltinScalarFunction::ArrayLength => {
                 Signature::variadic_any(self.volatility())
             }
-            BuiltinScalarFunction::ArrayNdims => Signature::any(1, self.volatility()),
-            BuiltinScalarFunction::ArrayDistinct => Signature::any(1, self.volatility()),
+            BuiltinScalarFunction::ArrayNdims => Signature::array(self.volatility()),
+            BuiltinScalarFunction::ArrayDistinct => Signature::array(self.volatility()),
             BuiltinScalarFunction::ArrayPosition => {
                 Signature::array_and_element_and_optional_index(self.volatility())
             }
@@ -981,7 +982,7 @@ impl BuiltinScalarFunction {
 
             BuiltinScalarFunction::ArrayIntersect => Signature::any(2, self.volatility()),
             BuiltinScalarFunction::ArrayUnion => Signature::any(2, self.volatility()),
-            BuiltinScalarFunction::Cardinality => Signature::any(1, self.volatility()),
+            BuiltinScalarFunction::Cardinality => Signature::array(self.volatility()),
             BuiltinScalarFunction::ArrayResize => {
                 Signature::variadic_any(self.volatility())
             }
diff --git a/datafusion/expr/src/signature.rs b/datafusion/expr/src/signature.rs
index e8d9d8fb3966..663ecf7b1b8e 100644
--- a/datafusion/expr/src/signature.rs
+++ b/datafusion/expr/src/signature.rs
@@ -123,7 +123,7 @@ pub enum TypeSignature {
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub enum ArrayFunctionSignature {
     /// Specialized Signature for ArrayAppend and similar functions
-    /// The first argument should be List/LargeList, and the second argument should be non-list or list.
+    /// The first argument should be List/LargeList/FixedSizeList, and the second argument should be non-list or list.
     /// The second argument's list dimension should be one dimension less than the first argument's list dimension.
     /// List dimension of the List/LargeList is equivalent to the number of List.
     /// List dimension of the non-list is 0.
@@ -133,9 +133,14 @@ pub enum ArrayFunctionSignature {
     /// The first argument's list dimension should be one dimension less than the second argument's list dimension.
     ElementAndArray,
     /// Specialized Signature for Array functions of the form (List/LargeList, Index)
+    /// The first argument should be List/LargeList/FixedSizeList, and the second argument should be Int64.
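+    /// For example, `array_element(make_array(1, 2, 3), 2)` resolves through
+    /// this signature once its index argument is coerced to `Int64` (an
+    /// illustrative call, not a test added by this patch).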
    ArrayAndIndex,
     /// Specialized Signature for Array functions of the form (List/LargeList, Element, Optional Index)
     ArrayAndElementAndOptionalIndex,
+    /// Specialized Signature for ArrayEmpty and similar functions
+    /// The function takes a single argument that must be a List/LargeList/FixedSizeList
+    /// or something that can be coerced to one of those types.
+    Array,
 }
 
 impl std::fmt::Display for ArrayFunctionSignature {
@@ -153,6 +158,9 @@ impl std::fmt::Display for ArrayFunctionSignature {
             ArrayFunctionSignature::ArrayAndIndex => {
                 write!(f, "array, index")
             }
+            ArrayFunctionSignature::Array => {
+                write!(f, "array")
+            }
         }
     }
 }
@@ -325,6 +333,13 @@ impl Signature {
             volatility,
         }
     }
+    /// Specialized Signature for ArrayEmpty and similar functions
+    pub fn array(volatility: Volatility) -> Self {
+        Signature {
+            type_signature: TypeSignature::ArraySignature(ArrayFunctionSignature::Array),
+            volatility,
+        }
+    }
 }
 
 /// Monotonicity of the `ScalarFunctionExpr` with respect to its arguments.
diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs
index 9cab04bc7605..2022d67879f8 100644
--- a/datafusion/expr/src/type_coercion/functions.rs
+++ b/datafusion/expr/src/type_coercion/functions.rs
@@ -80,6 +80,36 @@ fn get_valid_types(
     signature: &TypeSignature,
     current_types: &[DataType],
 ) -> Result<Vec<Vec<DataType>>> {
+    fn array_element_and_optional_index(
+        current_types: &[DataType],
+    ) -> Result<Vec<Vec<DataType>>> {
+        // make sure there's 2 or 3 arguments
+        if !(current_types.len() == 2 || current_types.len() == 3) {
+            return Ok(vec![vec![]]);
+        }
+
+        let first_two_types = &current_types[0..2];
+        let mut valid_types = array_append_or_prepend_valid_types(first_two_types, true)?;
+
+        // Early return if there are only 2 arguments
+        if current_types.len() == 2 {
+            return Ok(valid_types);
+        }
+
+        let valid_types_with_index = valid_types
+            .iter()
+            .map(|t| {
+                let mut t = t.clone();
+                t.push(DataType::Int64);
+                t
+            })
+            .collect::<Vec<_>>();
+
+        valid_types.extend(valid_types_with_index);
+
+        Ok(valid_types)
+    }
+
     fn array_append_or_prepend_valid_types(
         current_types: &[DataType],
         is_append: bool,
@@ -111,71 +141,37 @@ fn get_valid_types(
             )
         })?;
 
-        let array_type = datafusion_common::utils::coerced_type_with_base_type_only(
+        let new_array_type = datafusion_common::utils::coerced_type_with_base_type_only(
             array_type,
             &new_base_type,
         );
 
-        match array_type {
+        match new_array_type {
             DataType::List(ref field)
             | DataType::LargeList(ref field)
             | DataType::FixedSizeList(ref field, _) => {
-                let elem_type = field.data_type();
+                let new_elem_type = field.data_type();
                 if is_append {
-                    Ok(vec![vec![array_type.clone(), elem_type.clone()]])
+                    Ok(vec![vec![new_array_type.clone(), new_elem_type.clone()]])
                 } else {
-                    Ok(vec![vec![elem_type.to_owned(), array_type.clone()]])
+                    Ok(vec![vec![new_elem_type.to_owned(), new_array_type.clone()]])
                 }
             }
             _ => Ok(vec![vec![]]),
         }
     }
 
-    fn array_element_and_optional_index(
-        current_types: &[DataType],
-    ) -> Result<Vec<Vec<DataType>>> {
-        // make sure there's 2 or 3 arguments
-        if !(current_types.len() == 2 || current_types.len() == 3) {
-            return Ok(vec![vec![]]);
-        }
-
-        let first_two_types = &current_types[0..2];
-        let mut valid_types = array_append_or_prepend_valid_types(first_two_types, true)?;
-
-        // Early return if there are only 2 arguments
-        if current_types.len() == 2 {
-            return Ok(valid_types);
-        }
-
-        let valid_types_with_index = valid_types
-            .iter()
-            .map(|t| {
-                let mut t = t.clone();
-                t.push(DataType::Int64);
-                t
-            })
-            .collect::<Vec<_>>();
-
-        valid_types.extend(valid_types_with_index);
-
-        Ok(valid_types)
-    }
-
-    fn array_and_index(current_types: &[DataType]) -> Result<Vec<Vec<DataType>>> {
-        if current_types.len() != 2 {
-            return Ok(vec![vec![]]);
-        }
-
-        let array_type = &current_types[0];
-
+    fn array(array_type: &DataType) -> Option<DataType> {
         match array_type {
             DataType::List(_)
             | DataType::LargeList(_)
             | DataType::FixedSizeList(_, _) => {
                 let array_type = coerced_fixed_size_list_to_list(array_type);
-                Ok(vec![vec![array_type, DataType::Int64]])
+                Some(array_type)
             }
-            _ => Ok(vec![vec![]]),
+            _ => None,
         }
     }
 
+
     let valid_types = match signature {
         TypeSignature::Variadic(valid_types) => valid_types
             .iter()
@@ -211,19 +207,32 @@ fn get_valid_types(
 
         TypeSignature::ArraySignature(ref function_signature) => match function_signature {
             ArrayFunctionSignature::ArrayAndElement => {
-                return array_append_or_prepend_valid_types(current_types, true)
+                array_append_or_prepend_valid_types(current_types, true)?
             }
-            ArrayFunctionSignature::ArrayAndElementAndOptionalIndex => {
-                return array_element_and_optional_index(current_types)
+            ArrayFunctionSignature::ElementAndArray => {
+                array_append_or_prepend_valid_types(current_types, false)?
             }
             ArrayFunctionSignature::ArrayAndIndex => {
-                return array_and_index(current_types)
+                if current_types.len() != 2 {
+                    return Ok(vec![vec![]]);
+                }
+                array(&current_types[0]).map_or_else(
+                    || vec![vec![]],
+                    |array_type| vec![vec![array_type, DataType::Int64]],
+                )
             }
-            ArrayFunctionSignature::ElementAndArray => {
-                return array_append_or_prepend_valid_types(current_types, false)
+            ArrayFunctionSignature::ArrayAndElementAndOptionalIndex => {
+                array_element_and_optional_index(current_types)?
             }
-        },
+            ArrayFunctionSignature::Array => {
+                if current_types.len() != 1 {
+                    return Ok(vec![vec![]]);
+                }
+                array(&current_types[0])
+                    .map_or_else(|| vec![vec![]], |array_type| vec![vec![array_type]])
+            }
+        },
         TypeSignature::Any(number) => {
             if current_types.len() != *number {
                 return plan_err!(
diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt
index 7f263d904819..da02a80a104f 100644
--- a/datafusion/sqllogictest/test_files/array.slt
+++ b/datafusion/sqllogictest/test_files/array.slt
@@ -123,6 +123,13 @@ AS VALUES
     (make_array(NULL, 10, 11, 12))
 ;
 
+statement ok
+CREATE TABLE large_arrayspop
+AS SELECT
+  arrow_cast(column1, 'LargeList(Int64)') AS column1
+FROM arrayspop
+;
+
 statement ok
 CREATE TABLE nested_arrays
 AS VALUES
@@ -172,6 +179,15 @@ AS SELECT
 FROM arrays_values
 ;
 
+statement ok
+CREATE TABLE fixed_arrays_values
+AS SELECT
+  arrow_cast(column1, 'FixedSizeList(10, Int64)') AS column1,
+  column2,
+  column3,
+  column4
+FROM arrays_values
+;
 
 statement ok
 CREATE TABLE arrays_values_v2
@@ -212,6 +228,22 @@ AS
 FROM flatten_table
 ;
 
+statement ok
+CREATE TABLE fixed_size_flatten_table
+AS VALUES
+  (arrow_cast(make_array([1], [2], [3]), 'FixedSizeList(3, List(Int64))'),
+   arrow_cast(make_array([[1, 2, 3]], [[4, 5]], [[6]]), 'FixedSizeList(3, List(List(Int64)))'),
+   arrow_cast(make_array([[[1]]], [[[2, 3]]]), 'FixedSizeList(2, List(List(List(Int64))))'),
+   arrow_cast(make_array([1.0], [2.1, 2.2], [3.2, 3.3, 3.4]), 'FixedSizeList(3, List(Float64))')
+  ),
+  (
+    arrow_cast(make_array([1, 2], [3, 4], [5, 6]), 'FixedSizeList(3, List(Int64))'),
+    arrow_cast(make_array([[8]], [[9, 10]], [[11, 12, 13]]), 'FixedSizeList(3, List(List(Int64)))'),
+    arrow_cast(make_array([[[1,2]]], [[[3]]]), 'FixedSizeList(2, List(List(List(Int64))))'),
+    arrow_cast(make_array([1.0, 2.0], [3.0, 4.0], [5.0, 6.0]), 'FixedSizeList(3, List(Float64))')
+  )
+;
+
 statement ok
 CREATE TABLE
array_has_table_1D AS VALUES @@ -346,10 +378,31 @@ AS VALUES statement ok CREATE TABLE array_distinct_table_1D_large +AS SELECT + arrow_cast(column1, 'LargeList(Int64)') AS column1 +FROM array_distinct_table_1D +; + +statement ok +CREATE TABLE array_distinct_table_1D_fixed +AS SELECT + arrow_cast(column1, 'FixedSizeList(5, Int64)') AS column1 +FROM array_distinct_table_1D +; + +statement ok +CREATE TABLE array_distinct_table_1D_UTF8_fixed +AS SELECT + arrow_cast(column1, 'FixedSizeList(5, Utf8)') AS column1 +FROM array_distinct_table_1D_UTF8 +; + +statement ok +CREATE TABLE array_distinct_table_2D_fixed AS VALUES - (arrow_cast(make_array(1, 1, 2, 2, 3), 'LargeList(Int64)')), - (arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)')), - (arrow_cast(make_array(3, 5, 3, 3, 3), 'LargeList(Int64)')) + (arrow_cast(make_array([1,2], [1,2], [3,4], [3,4], [5,6]), 'FixedSizeList(5, List(Int64))')), + (arrow_cast(make_array([1,2], [3,4], [5,6], [7,8], [9,10]), 'FixedSizeList(5, List(Int64))')), + (arrow_cast(make_array([5,6], [5,6], NULL, NULL, NULL), 'FixedSizeList(5, List(Int64))')) ; statement ok @@ -1103,7 +1156,7 @@ select array_element(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)'), NULL NULL query IT -select array_element(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)'), 7), array_element(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'LargeList(Utf8)'), 11); +select array_element(arrow_cast(make_array(1, 2, 3, 4, 5), 'FixedSizeList(5, Int64)'), 7), array_element(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'FixedSizeList(5, Utf8)'), 11); ---- NULL NULL @@ -1303,6 +1356,14 @@ NULL 43 ## array_pop_back (aliases: `list_pop_back`) +# array_pop_back scalar function with null +#TODO: https://github.com/apache/arrow-datafusion/issues/7142 +# follow clickhouse and duckdb +#query ? +#select array_pop_back(null); +#---- +#NULL + # array_pop_back scalar function #1 query ?? select array_pop_back(make_array(1, 2, 3, 4, 5)), array_pop_back(make_array('h', 'e', 'l', 'l', 'o')); @@ -1314,6 +1375,11 @@ select array_pop_back(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)')) ---- [1, 2, 3, 4] [h, e, l, l] +query ?? +select array_pop_back(arrow_cast(make_array(1, 2, 3, 4, 5), 'FixedSizeList(5, Int64)')), array_pop_back(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'FixedSizeList(5, Utf8)')); +---- +[1, 2, 3, 4] [h, e, l, l] + # array_pop_back scalar function #2 (after array_pop_back, array is empty) query ? select array_pop_back(make_array(1)); @@ -1325,6 +1391,11 @@ select array_pop_back(arrow_cast(make_array(1), 'LargeList(Int64)')); ---- [] +query ? +select array_pop_back(arrow_cast(make_array(1), 'FixedSizeList(1, Int64)')); +---- +[] + # array_pop_back scalar function #3 (array_pop_back the empty array) query ? select array_pop_back(array_pop_back(make_array(1))); @@ -1336,12 +1407,27 @@ select array_pop_back(array_pop_back(arrow_cast(make_array(1), 'LargeList(Int64) ---- [] +query ? +select array_pop_back(array_pop_back(arrow_cast(make_array(1), 'FixedSizeList(1, Int64)'))); +---- +[] + # array_pop_back scalar function #4 (array_pop_back the arrays which have NULL) query ?? select array_pop_back(make_array(1, 2, 3, 4, NULL)), array_pop_back(make_array(NULL, 'e', 'l', NULL, 'o')); ---- [1, 2, 3, 4] [, e, l, ] +query ?? +select array_pop_back(arrow_cast(make_array(1, 2, 3, 4, NULL), 'LargeList(Int64)')), array_pop_back(arrow_cast(make_array(NULL, 'e', 'l', NULL, 'o'), 'LargeList(Utf8)')); +---- +[1, 2, 3, 4] [, e, l, ] + +query ?? 
+select array_pop_back(arrow_cast(make_array(1, 2, 3, 4, NULL), 'FixedSizeList(5, Int64)')), array_pop_back(arrow_cast(make_array(NULL, 'e', 'l', NULL, 'o'), 'FixedSizeList(5, Utf8)')); +---- +[1, 2, 3, 4] [, e, l, ] + # array_pop_back scalar function #5 (array_pop_back the nested arrays) query ? select array_pop_back(make_array(make_array(1, 2, 3), make_array(2, 9, 1), make_array(7, 8, 9), make_array(1, 2, 3), make_array(1, 7, 4), make_array(4, 5, 6))); @@ -1353,6 +1439,11 @@ select array_pop_back(arrow_cast(make_array(make_array(1, 2, 3), make_array(2, 9 ---- [[1, 2, 3], [2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4]] +query ? +select array_pop_back(arrow_cast(make_array(make_array(1, 2, 3), make_array(2, 9, 1), make_array(7, 8, 9), make_array(1, 2, 3), make_array(1, 7, 4), make_array(4, 5, 6)), 'FixedSizeList(6, List(Int64))')); +---- +[[1, 2, 3], [2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4]] + # array_pop_back scalar function #6 (array_pop_back the nested arrays with NULL) query ? select array_pop_back(make_array(make_array(1, 2, 3), make_array(2, 9, 1), make_array(7, 8, 9), make_array(1, 2, 3), make_array(1, 7, 4), NULL)); @@ -1364,6 +1455,11 @@ select array_pop_back(arrow_cast(make_array(make_array(1, 2, 3), make_array(2, 9 ---- [[1, 2, 3], [2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4]] +query ? +select array_pop_back(arrow_cast(make_array(make_array(1, 2, 3), make_array(2, 9, 1), make_array(7, 8, 9), make_array(1, 2, 3), make_array(1, 7, 4), NULL), 'FixedSizeList(6, List(Int64))')); +---- +[[1, 2, 3], [2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4]] + # array_pop_back scalar function #7 (array_pop_back the nested arrays with NULL) query ? select array_pop_back(make_array(make_array(1, 2, 3), make_array(2, 9, 1), make_array(7, 8, 9), NULL, make_array(1, 7, 4))); @@ -1375,6 +1471,11 @@ select array_pop_back(arrow_cast(make_array(make_array(1, 2, 3), make_array(2, 9 ---- [[1, 2, 3], [2, 9, 1], [7, 8, 9], ] +query ? +select array_pop_back(arrow_cast(make_array(make_array(1, 2, 3), make_array(2, 9, 1), make_array(7, 8, 9), NULL, make_array(1, 7, 4)), 'FixedSizeList(5, List(Int64))')); +---- +[[1, 2, 3], [2, 9, 1], [7, 8, 9], ] + # array_pop_back scalar function #8 (after array_pop_back, nested array is empty) query ? select array_pop_back(make_array(make_array(1, 2, 3))); @@ -1386,6 +1487,11 @@ select array_pop_back(arrow_cast(make_array(make_array(1, 2, 3)), 'LargeList(Lis ---- [] +query ? +select array_pop_back(arrow_cast(make_array(make_array(1, 2, 3)), 'FixedSizeList(1, List(Int64))')); +---- +[] + # array_pop_back with columns query ? select array_pop_back(column1) from arrayspop; @@ -1407,8 +1513,36 @@ select array_pop_back(arrow_cast(column1, 'LargeList(Int64)')) from arrayspop; [] [, 10, 11] +query ? +select array_pop_back(column1) from large_arrayspop; +---- +[1, 2] +[3, 4, 5] +[6, 7, 8, ] +[, ] +[] +[, 10, 11] + +query ? +select array_pop_back(arrow_cast(column1, 'LargeList(Int64)')) from large_arrayspop; +---- +[1, 2] +[3, 4, 5] +[6, 7, 8, ] +[, ] +[] +[, 10, 11] + ## array_pop_front (aliases: `list_pop_front`) +#TODO:https://github.com/apache/arrow-datafusion/issues/7142 +# array_pop_front scalar function with null +# follow clickhouse and duckdb +#query ? +#select array_pop_front(null); +#---- +#NULL + # array_pop_front scalar function #1 query ?? 
select array_pop_front(make_array(1, 2, 3, 4, 5)), array_pop_front(make_array('h', 'e', 'l', 'l', 'o')); @@ -1420,6 +1554,11 @@ select array_pop_front(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)') ---- [2, 3, 4, 5] [e, l, l, o] +query ?? +select array_pop_front(arrow_cast(make_array(1, 2, 3, 4, 5), 'FixedSizeList(5, Int64)')), array_pop_front(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'FixedSizeList(5, Utf8)')); +---- +[2, 3, 4, 5] [e, l, l, o] + # array_pop_front scalar function #2 (after array_pop_front, array is empty) query ? select array_pop_front(make_array(1)); @@ -1431,6 +1570,11 @@ select array_pop_front(arrow_cast(make_array(1), 'LargeList(Int64)')); ---- [] +query ? +select array_pop_front(arrow_cast(make_array(1), 'FixedSizeList(1, Int64)')); +---- +[] + # array_pop_front scalar function #3 (array_pop_front the empty array) query ? select array_pop_front(array_pop_front(make_array(1))); @@ -1442,6 +1586,11 @@ select array_pop_front(array_pop_front(arrow_cast(make_array(1), 'LargeList(Int6 ---- [] +query ? +select array_pop_front(array_pop_front(arrow_cast(make_array(1), 'FixedSizeList(1, Int64)'))); +---- +[] + # array_pop_front scalar function #5 (array_pop_front the nested arrays) query ? select array_pop_front(make_array(make_array(1, 2, 3), make_array(2, 9, 1), make_array(7, 8, 9), make_array(1, 2, 3), make_array(1, 7, 4), make_array(4, 5, 6))); @@ -1453,6 +1602,11 @@ select array_pop_front(arrow_cast(make_array(make_array(1, 2, 3), make_array(2, ---- [[2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4], [4, 5, 6]] +query ? +select array_pop_front(arrow_cast(make_array(make_array(1, 2, 3), make_array(2, 9, 1), make_array(7, 8, 9), make_array(1, 2, 3), make_array(1, 7, 4), make_array(4, 5, 6)), 'FixedSizeList(6, List(Int64))')); +---- +[[2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4], [4, 5, 6]] + # array_pop_front scalar function #6 (array_pop_front the nested arrays with NULL) query ? select array_pop_front(make_array(NULL, make_array(1, 2, 3), make_array(2, 9, 1), make_array(7, 8, 9), make_array(1, 2, 3), make_array(1, 7, 4))); @@ -1464,6 +1618,11 @@ select array_pop_front(arrow_cast(make_array(NULL, make_array(1, 2, 3), make_arr ---- [[1, 2, 3], [2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4]] +query ? +select array_pop_front(arrow_cast(make_array(NULL, make_array(1, 2, 3), make_array(2, 9, 1), make_array(7, 8, 9), make_array(1, 2, 3), make_array(1, 7, 4)), 'FixedSizeList(6, List(Int64))')); +---- +[[1, 2, 3], [2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4]] + # array_pop_front scalar function #8 (after array_pop_front, nested array is empty) query ? select array_pop_front(make_array(make_array(1, 2, 3))); @@ -1475,6 +1634,11 @@ select array_pop_front(arrow_cast(make_array(make_array(1, 2, 3)), 'LargeList(Li ---- [] +query ? +select array_pop_front(arrow_cast(make_array(make_array(1, 2, 3)), 'FixedSizeList(1, List(Int64))')); +---- +[] + ## array_slice (aliases: list_slice) # array_slice scalar function #1 (with positive indexes) @@ -1874,6 +2038,14 @@ select ---- [4] [] [1, , 3, 4] [, , 1] +query ?? +select + array_append(arrow_cast(make_array(1, null, 3), 'FixedSizeList(3, Int64)'), 4), + array_append(arrow_cast(make_array(null, null), 'FixedSizeList(2, Int64)'), 1) +; +---- +[1, , 3, 4] [, , 1] + # test invalid (non-null) query error select array_append(1, 2); @@ -1898,6 +2070,13 @@ select ---- [[1, , 3], []] [[1, , 3], ] +query ?? 
+select + array_append(arrow_cast(make_array(make_array(1, null, 3)), 'FixedSizeList(1, List(Int64))'), [null]), + array_append(arrow_cast(make_array(make_array(1, null, 3)), 'FixedSizeList(1, List(Int64))'), null); +---- +[[1, , 3], []] [[1, , 3], ] + # array_append scalar function #3 query ??? select array_append(make_array(1, 2, 3), 4), array_append(make_array(1.0, 2.0, 3.0), 4.0), array_append(make_array('h', 'e', 'l', 'l'), 'o'); @@ -1905,7 +2084,12 @@ select array_append(make_array(1, 2, 3), 4), array_append(make_array(1.0, 2.0, 3 [1, 2, 3, 4] [1.0, 2.0, 3.0, 4.0] [h, e, l, l, o] query ??? -select array_append(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'), 4), array_append(arrow_cast(make_array(1.0, 2.0, 3.0), 'LargeList(Float64)'), 4.0), array_append(make_array('h', 'e', 'l', 'l'), 'o'); +select array_append(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)'), 4), array_append(arrow_cast(make_array(1.0, 2.0, 3.0), 'LargeList(Float64)'), 4.0), array_append(arrow_cast(make_array('h', 'e', 'l', 'l'), 'LargeList(Utf8)'), 'o'); +---- +[1, 2, 3, 4] [1.0, 2.0, 3.0, 4.0] [h, e, l, l, o] + +query ??? +select array_append(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)'), 4), array_append(arrow_cast(make_array(1.0, 2.0, 3.0), 'FixedSizeList(3, Float64)'), 4.0), array_append(arrow_cast(make_array('h', 'e', 'l', 'l'), 'FixedSizeList(4, Utf8)'), 'o'); ---- [1, 2, 3, 4] [1.0, 2.0, 3.0, 4.0] [h, e, l, l, o] @@ -1920,6 +2104,11 @@ select array_append(arrow_cast(make_array([1], [2], [3]), 'LargeList(LargeList(I ---- [[1], [2], [3], [4]] [[1.0], [2.0], [3.0], [4.0]] [[h], [e], [l], [l], [o]] +query ??? +select array_append(arrow_cast(make_array([1], [2], [3]), 'FixedSizeList(3, List(Int64))'), [4]), array_append(arrow_cast(make_array([1.0], [2.0], [3.0]), 'FixedSizeList(3, List(Float64))'), [4.0]), array_append(arrow_cast(make_array(['h'], ['e'], ['l'], ['l']), 'FixedSizeList(4, List(Utf8))'), ['o']); +---- +[[1], [2], [3], [4]] [[1.0], [2.0], [3.0], [4.0]] [[h], [e], [l], [l], [o]] + # list_append scalar function #5 (function alias `array_append`) query ??? select list_append(make_array(1, 2, 3), 4), list_append(make_array(1.0, 2.0, 3.0), 4.0), list_append(make_array('h', 'e', 'l', 'l'), 'o'); @@ -1978,6 +2167,18 @@ select array_append(column1, column2) from large_arrays_values; [51, 52, , 54, 55, 56, 57, 58, 59, 60, 55] [61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 66] +query ? +select array_append(column1, column2) from fixed_arrays_values; +---- +[, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1] +[11, 12, 13, 14, 15, 16, 17, 18, , 20, 12] +[21, 22, 23, , 25, 26, 27, 28, 29, 30, 23] +[31, 32, 33, 34, 35, , 37, 38, 39, 40, 34] +[, , , , , , , , , , 44] +[41, 42, 43, 44, 45, 46, 47, 48, 49, 50, ] +[51, 52, , 54, 55, 56, 57, 58, 59, 60, 55] +[61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 66] + # array_append with columns #2 (element is list) query ? select array_append(column1, column2) from nested_arrays; @@ -1991,6 +2192,12 @@ select array_append(column1, column2) from large_nested_arrays; [[1, 2, 3], [2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4], [4, 5, 6], [7, 8, 9]] [[4, 5, 6], [10, 11, 12], [4, 9, 8], [7, 8, 9], [10, 11, 12], [1, 8, 7], [10, 11, 12]] +query ? +select array_append(column1, column2) from fixed_size_nested_arrays; +---- +[[1, 2, 3], [2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4], [4, 5, 6], [7, 8, 9]] +[[4, 5, 6], [10, 11, 12], [4, 9, 8], [7, 8, 9], [10, 11, 12], [1, 8, 7], [10, 11, 12]] + # array_append with columns and scalars #1 query ?? 
select array_append(column2, 100.1), array_append(column3, '.') from arrays; @@ -2014,6 +2221,17 @@ select array_append(column2, 100.1), array_append(column3, '.') from large_array [100.1] [,, .] [16.6, 17.7, 18.8, 100.1] [.] +query ?? +select array_append(column2, 100.1), array_append(column3, '.') from fixed_size_arrays; +---- +[1.1, 2.2, 3.3, 100.1] [L, o, r, e, m, .] +[, 5.5, 6.6, 100.1] [i, p, , u, m, .] +[7.7, 8.8, 9.9, 100.1] [d, , l, o, r, .] +[10.1, , 12.2, 100.1] [s, i, t, a, b, .] +[13.3, 14.4, 15.5, 100.1] [a, m, e, t, x, .] +[, , , 100.1] [,, a, b, c, d, .] +[16.6, 17.7, 18.8, 100.1] [, , , , , .] + # array_append with columns and scalars #2 query ?? select array_append(column1, make_array(1, 11, 111)), array_append(make_array(make_array(1, 2, 3), make_array(11, 12, 13)), column2) from nested_arrays; @@ -2027,6 +2245,12 @@ select array_append(column1, arrow_cast(make_array(1, 11, 111), 'LargeList(Int64 [[1, 2, 3], [2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4], [4, 5, 6], [1, 11, 111]] [[1, 2, 3], [11, 12, 13], [7, 8, 9]] [[4, 5, 6], [10, 11, 12], [4, 9, 8], [7, 8, 9], [10, 11, 12], [1, 8, 7], [1, 11, 111]] [[1, 2, 3], [11, 12, 13], [10, 11, 12]] +query ?? +select array_append(column1, arrow_cast(make_array(1, 11, 111), 'FixedSizeList(3, Int64)')), array_append(arrow_cast(make_array(make_array(1, 2, 3), make_array(11, 12, 13)), 'FixedSizeList(2, List(Int64))'), column2) from fixed_size_nested_arrays; +---- +[[1, 2, 3], [2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4], [4, 5, 6], [1, 11, 111]] [[1, 2, 3], [11, 12, 13], [7, 8, 9]] +[[4, 5, 6], [10, 11, 12], [4, 9, 8], [7, 8, 9], [10, 11, 12], [1, 8, 7], [1, 11, 111]] [[1, 2, 3], [11, 12, 13], [10, 11, 12]] + ## array_prepend (aliases: `list_prepend`, `array_push_front`, `list_push_front`) # array_prepend with NULLs @@ -2093,6 +2317,11 @@ select array_prepend(1, arrow_cast(make_array(2, 3, 4), 'LargeList(Int64)')), ar ---- [1, 2, 3, 4] [1.0, 2.0, 3.0, 4.0] [h, e, l, l, o] +query ??? +select array_prepend(1, arrow_cast([2, 3, 4], 'FixedSizeList(3, Int64)')), array_prepend(1.0, arrow_cast([2.0, 3.0, 4.0], 'FixedSizeList(3, Float64)')), array_prepend('h', arrow_cast(['e', 'l', 'l', 'o'], 'FixedSizeList(4, Utf8)')); +---- +[1, 2, 3, 4] [1.0, 2.0, 3.0, 4.0] [h, e, l, l, o] + # array_prepend scalar function #4 (element is list) query ??? select array_prepend(make_array(1), make_array(make_array(2), make_array(3), make_array(4))), array_prepend(make_array(1.0), make_array([2.0], [3.0], [4.0])), array_prepend(make_array('h'), make_array(['e'], ['l'], ['l'], ['o'])); @@ -2106,6 +2335,13 @@ select array_prepend(arrow_cast(make_array(1), 'LargeList(Int64)'), arrow_cast(m ---- [[1], [2], [3], [4]] [[1.0], [2.0], [3.0], [4.0]] [[h], [e], [l], [l], [o]] +query ??? +select array_prepend(arrow_cast([1], 'FixedSizeList(1, Int64)'), arrow_cast([[1], [2], [3]], 'FixedSizeList(3, List(Int64))')), + array_prepend(arrow_cast([1.0], 'FixedSizeList(1, Float64)'), arrow_cast([[2.0], [3.0], [4.0]], 'FixedSizeList(3, List(Float64))')), + array_prepend(arrow_cast(['h'], 'FixedSizeList(1, Utf8)'), arrow_cast([['e'], ['l'], ['l'], ['o']], 'FixedSizeList(4, List(Utf8))')); +---- +[[1], [1], [2], [3]] [[1.0], [2.0], [3.0], [4.0]] [[h], [e], [l], [l], [o]] + # list_prepend scalar function #5 (function alias `array_prepend`) query ??? 
select list_prepend(1, make_array(2, 3, 4)), list_prepend(1.0, make_array(2.0, 3.0, 4.0)), list_prepend('h', make_array('e', 'l', 'l', 'o')); @@ -2164,6 +2400,18 @@ select array_prepend(column2, column1) from large_arrays_values; [55, 51, 52, , 54, 55, 56, 57, 58, 59, 60] [66, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] +query ? +select array_prepend(column2, column1) from fixed_arrays_values; +---- +[1, , 2, 3, 4, 5, 6, 7, 8, 9, 10] +[12, 11, 12, 13, 14, 15, 16, 17, 18, , 20] +[23, 21, 22, 23, , 25, 26, 27, 28, 29, 30] +[34, 31, 32, 33, 34, 35, , 37, 38, 39, 40] +[44, , , , , , , , , , ] +[, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50] +[55, 51, 52, , 54, 55, 56, 57, 58, 59, 60] +[66, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70] + # array_prepend with columns #2 (element is list) query ? select array_prepend(column2, column1) from nested_arrays; @@ -2177,6 +2425,12 @@ select array_prepend(column2, column1) from large_nested_arrays; [[7, 8, 9], [1, 2, 3], [2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4], [4, 5, 6]] [[10, 11, 12], [4, 5, 6], [10, 11, 12], [4, 9, 8], [7, 8, 9], [10, 11, 12], [1, 8, 7]] +query ? +select array_prepend(column2, column1) from fixed_size_nested_arrays; +---- +[[7, 8, 9], [1, 2, 3], [2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4], [4, 5, 6]] +[[10, 11, 12], [4, 5, 6], [10, 11, 12], [4, 9, 8], [7, 8, 9], [10, 11, 12], [1, 8, 7]] + # array_prepend with columns and scalars #1 query ?? select array_prepend(100.1, column2), array_prepend('.', column3) from arrays; @@ -2200,6 +2454,17 @@ select array_prepend(100.1, column2), array_prepend('.', column3) from large_arr [100.1] [., ,] [100.1, 16.6, 17.7, 18.8] [.] +query ?? +select array_prepend(100.1, column2), array_prepend('.', column3) from fixed_size_arrays; +---- +[100.1, 1.1, 2.2, 3.3] [., L, o, r, e, m] +[100.1, , 5.5, 6.6] [., i, p, , u, m] +[100.1, 7.7, 8.8, 9.9] [., d, , l, o, r] +[100.1, 10.1, , 12.2] [., s, i, t, a, b] +[100.1, 13.3, 14.4, 15.5] [., a, m, e, t, x] +[100.1, , , ] [., ,, a, b, c, d] +[100.1, 16.6, 17.7, 18.8] [., , , , , ] + # array_prepend with columns and scalars #2 (element is list) query ?? select array_prepend(make_array(1, 11, 111), column1), array_prepend(column2, make_array(make_array(1, 2, 3), make_array(11, 12, 13))) from nested_arrays; @@ -2213,6 +2478,12 @@ select array_prepend(arrow_cast(make_array(1, 11, 111), 'LargeList(Int64)'), col [[1, 11, 111], [1, 2, 3], [2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4], [4, 5, 6]] [[7, 8, 9], [1, 2, 3], [11, 12, 13]] [[1, 11, 111], [4, 5, 6], [10, 11, 12], [4, 9, 8], [7, 8, 9], [10, 11, 12], [1, 8, 7]] [[10, 11, 12], [1, 2, 3], [11, 12, 13]] +query ?? +select array_prepend(arrow_cast(make_array(1, 11, 111), 'FixedSizeList(3, Int64)'), column1), array_prepend(column2, arrow_cast(make_array(make_array(1, 2, 3), make_array(11, 12, 13)), 'FixedSizeList(2, List(Int64))')) from fixed_size_nested_arrays; +---- +[[1, 11, 111], [1, 2, 3], [2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4], [4, 5, 6]] [[7, 8, 9], [1, 2, 3], [11, 12, 13]] +[[1, 11, 111], [4, 5, 6], [10, 11, 12], [4, 9, 8], [7, 8, 9], [10, 11, 12], [1, 8, 7]] [[10, 11, 12], [1, 2, 3], [11, 12, 13]] + ## array_repeat (aliases: `list_repeat`) # array_repeat scalar function #1 @@ -2723,12 +2994,18 @@ NULL 1 NULL ## array_positions (aliases: `list_positions`) -# array_position with NULL (follow PostgreSQL) query ? select array_positions([1, 2, 3, 4, 5], null); ---- [] +#TODO: https://github.com/apache/arrow-datafusion/issues/7142 +# array_positions with NULL (follow PostgreSQL) +#query ? 
+#select array_positions(null, 1); +#---- +#NULL + # array_positions scalar function #1 query ??? select array_positions(['h', 'e', 'l', 'l', 'o'], 'l'), array_positions([1, 2, 3, 4, 5], 5), array_positions([1, 1, 1], 1); @@ -3748,6 +4025,11 @@ select cardinality(arrow_cast(make_array(1, 2, 3, 4, 5), 'LargeList(Int64)')), c ---- 5 3 5 +query III +select cardinality(arrow_cast([1, 2, 3, 4, 5], 'FixedSizeList(5, Int64)')), cardinality(arrow_cast([1, 3, 5], 'FixedSizeList(3, Int64)')), cardinality(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'FixedSizeList(5, Utf8)')); +---- +5 3 5 + # cardinality scalar function #2 query II select cardinality(make_array([1, 2], [3, 4], [5, 6])), cardinality(array_repeat(array_repeat(array_repeat(3, 3), 2), 3)); @@ -3759,6 +4041,11 @@ select cardinality(arrow_cast(make_array([1, 2], [3, 4], [5, 6]), 'LargeList(Lis ---- 6 +query I +select cardinality(arrow_cast([[1, 2], [3, 4], [5, 6]], 'FixedSizeList(3, List(Int64))')); +---- +6 + # cardinality scalar function #3 query II select cardinality(make_array()), cardinality(make_array(make_array())) @@ -3770,6 +4057,13 @@ select cardinality(arrow_cast(make_array(), 'LargeList(Null)')), cardinality(arr ---- NULL 0 +#TODO +#https://github.com/apache/arrow-datafusion/issues/9158 +#query II +#select cardinality(arrow_cast(make_array(), 'FixedSizeList(1, Null)')), cardinality(arrow_cast(make_array(make_array()), 'FixedSizeList(1, List(Null))')) +#---- +#NULL 0 + # cardinality with columns query III select cardinality(column1), cardinality(column2), cardinality(column3) from arrays; @@ -3793,6 +4087,17 @@ NULL 3 4 4 NULL 1 4 3 NULL +query III +select cardinality(column1), cardinality(column2), cardinality(column3) from fixed_size_arrays; +---- +4 3 5 +4 3 5 +4 3 5 +4 3 5 +NULL 3 5 +4 NULL 5 +4 3 NULL + ## array_remove (aliases: `list_remove`) # array_remove scalar function #1 @@ -3801,6 +4106,13 @@ select array_remove(make_array(1, 2, 2, 1, 1), 2), array_remove(make_array(1.0, ---- [1, 2, 1, 1] [2.0, 2.0, 1.0, 1.0] [h, e, l, o] +query ??? +select array_remove(arrow_cast(make_array(1, 2, 2, 1, 1), 'LargeList(Int64)'), 2), + array_remove(arrow_cast(make_array(1.0, 2.0, 2.0, 1.0, 1.0), 'LargeList(Float64)'), 1.0), + array_remove(arrow_cast(make_array('h', 'e', 'l', 'l', 'o'), 'LargeList(Utf8)'), 'l'); +---- +[1, 2, 1, 1] [2.0, 2.0, 1.0, 1.0] [h, e, l, o] + query ??? select array_remove(arrow_cast(make_array(1, 2, 2, 1, 1), 'FixedSizeList(5, Int64)'), 2), array_remove(arrow_cast(make_array(1.0, 2.0, 2.0, 1.0, 1.0), 'FixedSizeList(5, Float64)'), 1.0), @@ -3816,6 +4128,14 @@ select ---- [1, , 3] [, 2.2, 3.3] [, bc] +query ??? +select + array_remove(arrow_cast(make_array(1, null, 2, 3), 'LargeList(Int64)'), 2), + array_remove(arrow_cast(make_array(1.1, null, 2.2, 3.3), 'LargeList(Float64)'), 1.1), + array_remove(arrow_cast(make_array('a', null, 'bc'), 'LargeList(Utf8)'), 'a'); +---- +[1, , 3] [, 2.2, 3.3] [, bc] + query ??? select array_remove(arrow_cast(make_array(1, null, 2, 3), 'FixedSizeList(4, Int64)'), 2), @@ -3824,6 +4144,14 @@ select ---- [1, , 3] [, 2.2, 3.3] [, bc] +#TODO: https://github.com/apache/arrow-datafusion/issues/7142 +# follow PostgreSQL behavior +#query ? +#select +# array_remove(NULL, 1) +#---- +#NULL + query ?? select array_remove(make_array(1, null, 2), null), @@ -3831,12 +4159,32 @@ select ---- [1, 2] [1, 2, ] +query ?? 
+select + array_remove(arrow_cast(make_array(1, null, 2), 'LargeList(Int64)'), null), + array_remove(arrow_cast(make_array(1, null, 2, null), 'LargeList(Int64)'), null); +---- +[1, 2] [1, 2, ] + +query ?? +select + array_remove(arrow_cast(make_array(1, null, 2), 'FixedSizeList(3, Int64)'), null), + array_remove(arrow_cast(make_array(1, null, 2, null), 'FixedSizeList(4, Int64)'), null); +---- +[1, 2] [1, 2, ] + # array_remove scalar function #2 (element is list) query ?? select array_remove(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), [4, 5, 6]), array_remove(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), [2, 3, 4]); ---- [[1, 2, 3], [5, 5, 5], [4, 5, 6], [7, 8, 9]] [[1, 3, 2], [2, 3, 4], [5, 3, 1], [1, 3, 2]] +query ?? +select array_remove(arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), 'LargeList(List(Int64))'), [4, 5, 6]), + array_remove(arrow_cast(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), 'LargeList(List(Int64))'), [2, 3, 4]); +---- +[[1, 2, 3], [5, 5, 5], [4, 5, 6], [7, 8, 9]] [[1, 3, 2], [2, 3, 4], [5, 3, 1], [1, 3, 2]] + query ?? select array_remove(arrow_cast(make_array([1, 2, 3], [4, 5, 6], [5, 5, 5], [4, 5, 6], [7, 8, 9]), 'FixedSizeList(5, List(Int64))'), [4, 5, 6]), array_remove(arrow_cast(make_array([1, 3, 2], [2, 3, 4], [2, 3, 4], [5, 3, 1], [1, 3, 2]), 'FixedSizeList(5, List(Int64))'), [2, 3, 4]); @@ -3864,6 +4212,14 @@ select array_remove(column1, column2) from arrays_with_repeating_elements; [7, 7, 8, 7, 9, 7, 8, 7, 7] [11, 12, 10, 11, 12, 10, 11, 12, 10] +query ? +select array_remove(column1, column2) from large_arrays_with_repeating_elements; +---- +[1, 1, 3, 2, 2, 1, 3, 2, 3] +[4, 5, 5, 6, 5, 5, 5, 4, 4] +[7, 7, 8, 7, 9, 7, 8, 7, 7] +[11, 12, 10, 11, 12, 10, 11, 12, 10] + query ? select array_remove(column1, column2) from fixed_arrays_with_repeating_elements; ---- @@ -3881,6 +4237,14 @@ select array_remove(column1, column2) from nested_arrays_with_repeating_elements [[19, 20, 21], [19, 20, 21], [22, 23, 24], [19, 20, 21], [25, 26, 27], [19, 20, 21], [22, 23, 24], [19, 20, 21], [19, 20, 21]] [[31, 32, 33], [34, 35, 36], [28, 29, 30], [31, 32, 33], [34, 35, 36], [28, 29, 30], [31, 32, 33], [34, 35, 36], [28, 29, 30]] +query ? +select array_remove(column1, column2) from large_nested_arrays_with_repeating_elements; +---- +[[1, 2, 3], [1, 2, 3], [7, 8, 9], [4, 5, 6], [4, 5, 6], [1, 2, 3], [7, 8, 9], [4, 5, 6], [7, 8, 9]] +[[10, 11, 12], [13, 14, 15], [13, 14, 15], [16, 17, 18], [13, 14, 15], [13, 14, 15], [13, 14, 15], [10, 11, 12], [10, 11, 12]] +[[19, 20, 21], [19, 20, 21], [22, 23, 24], [19, 20, 21], [25, 26, 27], [19, 20, 21], [22, 23, 24], [19, 20, 21], [19, 20, 21]] +[[31, 32, 33], [34, 35, 36], [28, 29, 30], [31, 32, 33], [34, 35, 36], [28, 29, 30], [31, 32, 33], [34, 35, 36], [28, 29, 30]] + query ? select array_remove(column1, column2) from fixed_size_nested_arrays_with_repeating_elements; ---- @@ -3898,6 +4262,14 @@ select array_remove(make_array(1, 2, 2, 4, 5, 4, 4, 7, 7, 10, 7, 8), column2), a [1, 2, 2, 4, 5, 4, 4, 7, 10, 7, 8] [7, 7, 7, 8, 7, 9, 7, 8, 7, 7] [1, 2, 2, 4, 5, 4, 4, 7, 7, 7, 8] [10, 11, 12, 10, 11, 12, 10, 11, 12, 10] +query ?? 
+select array_remove(make_array(1, 2, 2, 4, 5, 4, 4, 7, 7, 10, 7, 8), column2), array_remove(column1, 1) from large_arrays_with_repeating_elements; +---- +[1, 2, 4, 5, 4, 4, 7, 7, 10, 7, 8] [2, 1, 3, 2, 2, 1, 3, 2, 3] +[1, 2, 2, 5, 4, 4, 7, 7, 10, 7, 8] [4, 4, 5, 5, 6, 5, 5, 5, 4, 4] +[1, 2, 2, 4, 5, 4, 4, 7, 10, 7, 8] [7, 7, 7, 8, 7, 9, 7, 8, 7, 7] +[1, 2, 2, 4, 5, 4, 4, 7, 7, 7, 8] [10, 11, 12, 10, 11, 12, 10, 11, 12, 10] + query ?? select array_remove(make_array(1, 2, 2, 4, 5, 4, 4, 7, 7, 10, 7, 8), column2), array_remove(column1, 1) from fixed_arrays_with_repeating_elements; ---- @@ -3916,6 +4288,15 @@ select array_remove(make_array([1, 2, 3], [4, 5, 6], [4, 5, 6], [10, 11, 12], [1 [[1, 2, 3], [4, 5, 6], [4, 5, 6], [10, 11, 12], [13, 14, 15], [10, 11, 12], [10, 11, 12], [19, 20, 21], [28, 29, 30], [19, 20, 21], [22, 23, 24]] [[19, 20, 21], [19, 20, 21], [19, 20, 21], [22, 23, 24], [19, 20, 21], [25, 26, 27], [19, 20, 21], [22, 23, 24], [19, 20, 21], [19, 20, 21]] [[1, 2, 3], [4, 5, 6], [4, 5, 6], [10, 11, 12], [13, 14, 15], [10, 11, 12], [10, 11, 12], [19, 20, 21], [19, 20, 21], [19, 20, 21], [22, 23, 24]] [[28, 29, 30], [31, 32, 33], [34, 35, 36], [28, 29, 30], [31, 32, 33], [34, 35, 36], [28, 29, 30], [31, 32, 33], [34, 35, 36], [28, 29, 30]] +query ?? +select array_remove(make_array([1, 2, 3], [4, 5, 6], [4, 5, 6], [10, 11, 12], [13, 14, 15], [10, 11, 12], [10, 11, 12], [19, 20, 21], [19, 20, 21], [28, 29, 30], [19, 20, 21], [22, 23, 24]), column2), + array_remove(column1, make_array(1, 2, 3)) from large_nested_arrays_with_repeating_elements; +---- +[[1, 2, 3], [4, 5, 6], [10, 11, 12], [13, 14, 15], [10, 11, 12], [10, 11, 12], [19, 20, 21], [19, 20, 21], [28, 29, 30], [19, 20, 21], [22, 23, 24]] [[4, 5, 6], [1, 2, 3], [7, 8, 9], [4, 5, 6], [4, 5, 6], [1, 2, 3], [7, 8, 9], [4, 5, 6], [7, 8, 9]] +[[1, 2, 3], [4, 5, 6], [4, 5, 6], [13, 14, 15], [10, 11, 12], [10, 11, 12], [19, 20, 21], [19, 20, 21], [28, 29, 30], [19, 20, 21], [22, 23, 24]] [[10, 11, 12], [10, 11, 12], [13, 14, 15], [13, 14, 15], [16, 17, 18], [13, 14, 15], [13, 14, 15], [13, 14, 15], [10, 11, 12], [10, 11, 12]] +[[1, 2, 3], [4, 5, 6], [4, 5, 6], [10, 11, 12], [13, 14, 15], [10, 11, 12], [10, 11, 12], [19, 20, 21], [28, 29, 30], [19, 20, 21], [22, 23, 24]] [[19, 20, 21], [19, 20, 21], [19, 20, 21], [22, 23, 24], [19, 20, 21], [25, 26, 27], [19, 20, 21], [22, 23, 24], [19, 20, 21], [19, 20, 21]] +[[1, 2, 3], [4, 5, 6], [4, 5, 6], [10, 11, 12], [13, 14, 15], [10, 11, 12], [10, 11, 12], [19, 20, 21], [19, 20, 21], [19, 20, 21], [22, 23, 24]] [[28, 29, 30], [31, 32, 33], [34, 35, 36], [28, 29, 30], [31, 32, 33], [34, 35, 36], [28, 29, 30], [31, 32, 33], [34, 35, 36], [28, 29, 30]] + query ?? select array_remove(make_array([1, 2, 3], [4, 5, 6], [4, 5, 6], [10, 11, 12], [13, 14, 15], [10, 11, 12], [10, 11, 12], [19, 20, 21], [19, 20, 21], [28, 29, 30], [19, 20, 21], [22, 23, 24]), column2), array_remove(column1, make_array(1, 2, 3)) from fixed_size_nested_arrays_with_repeating_elements; @@ -3983,7 +4364,13 @@ select array_remove_n(make_array([1, 2, 3], [4, 5, 6], [4, 5, 6], [10, 11, 12], ## array_remove_all (aliases: `list_removes`) +#TODO: https://github.com/apache/arrow-datafusion/issues/7142 # array_remove_all with NULL elements +#query ? +#select array_remove_all(NULL, 1); +#---- +#NULL + query ? 
select array_remove_all(make_array(1, 2, 2, 1, 1), NULL); ---- @@ -4217,7 +4604,7 @@ NULL 10 ## array_dims (aliases: `list_dims`) # array dims error -query error Execution error: array_dims does not support type 'Int64' +query error DataFusion error: Error during planning: No function matches the given name and argument types 'array_dims\(Int64\)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tarray_dims\(array\) select array_dims(1); # array_dims scalar function @@ -4231,12 +4618,27 @@ select array_dims(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)')), array_di ---- [3] [2, 2] [1, 1, 1, 2, 1] +query ??? +select array_dims(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)')), array_dims(arrow_cast(make_array([1, 2], [3, 4]), 'FixedSizeList(2, List(Int64))')), array_dims(arrow_cast(make_array([[[[1], [2]]]]), 'FixedSizeList(1, List(List(List(List(Int64)))))')); +---- +[3] [2, 2] [1, 1, 1, 2, 1] + # array_dims scalar function #2 query ?? select array_dims(array_repeat(array_repeat(array_repeat(2, 3), 2), 1)), array_dims(array_repeat(array_repeat(array_repeat(3, 4), 5), 2)); ---- [1, 2, 3] [2, 5, 4] +query ?? +select array_dims(arrow_cast(array_repeat(array_repeat(array_repeat(2, 3), 2), 1), 'LargeList(List(List(Int64)))')), array_dims(arrow_cast(array_repeat(array_repeat(array_repeat(3, 4), 5), 2), 'LargeList(List(List(Int64)))')); +---- +[1, 2, 3] [2, 5, 4] + +query ?? +select array_dims(arrow_cast(array_repeat(array_repeat(array_repeat(2, 3), 2), 1), 'FixedSizeList(1, List(List(Int64)))')), array_dims(arrow_cast(array_repeat(array_repeat(array_repeat(3, 4), 5), 2), 'FixedSizeList(2, List(List(Int64)))')); +---- +[1, 2, 3] [2, 5, 4] + # array_dims scalar function #3 query ?? select array_dims(make_array()), array_dims(make_array(make_array())) @@ -4259,6 +4661,11 @@ select list_dims(arrow_cast(make_array(1, 2, 3), 'LargeList(Int64)')), list_dims ---- [3] [2, 2] [1, 1, 1, 2, 1] +query ??? +select list_dims(arrow_cast(make_array(1, 2, 3), 'FixedSizeList(3, Int64)')), list_dims(arrow_cast(make_array([1, 2], [3, 4]), 'FixedSizeList(2, List(Int64))')), list_dims(arrow_cast(make_array([[[[1], [2]]]]), 'FixedSizeList(1, List(List(List(List(Int64)))))')); +---- +[3] [2, 2] [1, 1, 1, 2, 1] + # array_dims with columns query ??? select array_dims(column1), array_dims(column2), array_dims(column3) from arrays; @@ -4282,14 +4689,22 @@ NULL [3] [4] [2, 2] NULL [1] [2, 2] [3] NULL +query ??? 
+select array_dims(column1), array_dims(column2), array_dims(column3) from fixed_size_arrays; +---- +[2, 2] [3] [5] +[2, 2] [3] [5] +[2, 2] [3] [5] +[2, 2] [3] [5] +NULL [3] [5] +[2, 2] NULL [5] +[2, 2] [3] NULL + ## array_ndims (aliases: `list_ndims`) # array_ndims scalar function #1 -query error -select array_ndims(1); - #follow PostgreSQL query error select @@ -4306,6 +4721,7 @@ CREATE TABLE array_ndims_table AS VALUES ([1], [1, 2, 3], [[7]], [[[[[10]]]]]), ([2], [4, 5], [[8]], [[[[[10]]]]]), + (NUll, [6, 7], [[9]], [[[[[10]]]]]), ([3], [6], [[9]], [[[[[10]]]]]) ; @@ -4318,6 +4734,15 @@ AS SELECT arrow_cast(column4, 'LargeList(List(List(List(List(Int64)))))') as column4 FROM array_ndims_table; +statement ok +CREATE TABLE fixed_array_ndims_table +AS VALUES + (arrow_cast([1], 'FixedSizeList(1, Int64)'), arrow_cast([1, 2, 3], 'FixedSizeList(3, Int64)'), arrow_cast([[7]], 'FixedSizeList(1, List(Int64))'), arrow_cast([[[[[10]]]]], 'FixedSizeList(1, List(List(List(List(Int64)))))')), + (arrow_cast([2], 'FixedSizeList(1, Int64)'), arrow_cast([4, 5, 6], 'FixedSizeList(3, Int64)'), arrow_cast([[8]], 'FixedSizeList(1, List(Int64))'), arrow_cast([[[[[10]]]]], 'FixedSizeList(1, List(List(List(List(Int64)))))')), + (null, arrow_cast([6, 7, 8], 'FixedSizeList(3, Int64)'), arrow_cast([[9]], 'FixedSizeList(1, List(Int64))'), arrow_cast([[[[[10]]]]], 'FixedSizeList(1, List(List(List(List(Int64)))))')), + (arrow_cast([3], 'FixedSizeList(1, Int64)'), arrow_cast([6, 7, 8], 'FixedSizeList(3, Int64)'), arrow_cast([[9]], 'FixedSizeList(1, List(Int64))'), arrow_cast([[[[[10]]]]], 'FixedSizeList(1, List(List(List(List(Int64)))))')) +; + query IIII select array_ndims(column1), @@ -4328,6 +4753,7 @@ from array_ndims_table; ---- 1 1 2 5 1 1 2 5 +NULL 1 2 5 1 1 2 5 query IIII @@ -4340,8 +4766,24 @@ from large_array_ndims_table; ---- 1 1 2 5 1 1 2 5 +NULL 1 2 5 +1 1 2 5 + +query IIII +select + array_ndims(column1), + array_ndims(column2), + array_ndims(column3), + array_ndims(column4) +from fixed_array_ndims_table; +---- +1 1 2 5 +1 1 2 5 +NULL 1 2 5 1 1 2 5 + + statement ok drop table array_ndims_table; @@ -4794,10 +5236,11 @@ true false true false false false true true false false true false true ## array_distinct -query ? -select array_distinct(null); ----- -NULL +#TODO: https://github.com/apache/arrow-datafusion/issues/7142 +#query ? +#select array_distinct(null); +#---- +#NULL query ? select array_distinct([]); @@ -4841,6 +5284,30 @@ from array_distinct_table_1D_large; [1, 2, 3, 4, 5] [3, 5] +query ? +select array_distinct(column1) +from array_distinct_table_1D_fixed; +---- +[1, 2, 3] +[1, 2, 3, 4, 5] +[3, 5] + +query ? +select array_distinct(column1) +from array_distinct_table_1D_UTF8_fixed; +---- +[a, bc, def] +[a, bc, def, defg] +[defg] + +query ? +select array_distinct(column1) +from array_distinct_table_2D_fixed; +---- +[[1, 2], [3, 4], [5, 6]] +[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] +[, [5, 6]] + query ??? select array_intersect(column1, column2), array_intersect(column3, column4), @@ -5382,11 +5849,13 @@ select array_concat(column1, [7]) from arrays_values_v2; [7] # flatten + +#TODO: https://github.com/apache/arrow-datafusion/issues/7142 # follow DuckDB -query ? -select flatten(NULL); ----- -NULL +#query ? +#select flatten(NULL); +#---- +#NULL # flatten with scalar values #1 query ??? @@ -5403,6 +5872,13 @@ select flatten(arrow_cast(make_array(1, 2, 1, 3, 2), 'LargeList(Int64)')), ---- [1, 2, 1, 3, 2] [1, 2, 3, , 4, , 5] [1.1, 2.2, 3.3, 4.4] +query ??? 
+select flatten(arrow_cast(make_array(1, 2, 1, 3, 2), 'FixedSizeList(5, Int64)')), + flatten(arrow_cast(make_array([1], [2, 3], [null], make_array(4, null, 5)), 'FixedSizeList(4, List(Int64))')), + flatten(arrow_cast(make_array([[1.1], [2.2]], [[3.3], [4.4]]), 'FixedSizeList(2, List(List(Float64)))')); +---- +[1, 2, 1, 3, 2] [1, 2, 3, , 4, , 5] [1.1, 2.2, 3.3, 4.4] + # flatten with column values query ???? select flatten(column1), @@ -5424,6 +5900,16 @@ from large_flatten_table; [1, 2, 3] [1, 2, 3, 4, 5, 6] [1, 2, 3] [1.0, 2.1, 2.2, 3.2, 3.3, 3.4] [1, 2, 3, 4, 5, 6] [8] [1, 2, 3] [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] +query ???? +select flatten(column1), + flatten(column2), + flatten(column3), + flatten(column4) +from fixed_size_flatten_table; +---- +[1, 2, 3] [1, 2, 3, 4, 5, 6] [1, 2, 3] [1.0, 2.1, 2.2, 3.2, 3.3, 3.4] +[1, 2, 3, 4, 5, 6] [8, 9, 10, 11, 12, 13] [1, 2, 3] [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] + ## empty # empty scalar function #1 query B @@ -5436,6 +5922,11 @@ select empty(arrow_cast(make_array(1), 'LargeList(Int64)')); ---- false +query B +select empty(arrow_cast(make_array(1), 'FixedSizeList(1, Int64)')); +---- +false + # empty scalar function #2 query B select empty(make_array()); @@ -5447,6 +5938,12 @@ select empty(arrow_cast(make_array(), 'LargeList(Null)')); ---- true +#TODO: https://github.com/apache/arrow-datafusion/issues/9158 +#query B +#select empty(arrow_cast(make_array(), 'FixedSizeList(0, Null)')); +#---- +#true + # empty scalar function #3 query B select empty(make_array(NULL)); @@ -5458,11 +5955,17 @@ select empty(arrow_cast(make_array(NULL), 'LargeList(Null)')); ---- false -# empty scalar function #4 query B -select empty(NULL); +select empty(arrow_cast(make_array(NULL), 'FixedSizeList(1, Null)')); ---- -NULL +false + +#TODO: https://github.com/apache/arrow-datafusion/issues/7142 +# empty scalar function #4 +#query B +#select empty(NULL); +#---- +#NULL # empty scalar function #5 query B @@ -5487,6 +5990,17 @@ NULL false false +query B +select empty(column1) from fixed_size_arrays; +---- +false +false +false +false +NULL +false +false + query ? SELECT string_to_array('abcxxxdef', 'xxx') ---- @@ -5686,6 +6200,9 @@ drop table fixed_slices; statement ok drop table arrayspop; +statement ok +drop table large_arrayspop; + statement ok drop table arrays_values; @@ -5806,6 +6323,9 @@ drop table flatten_table; statement ok drop table large_flatten_table; +statement ok +drop table fixed_size_flatten_table; + statement ok drop table arrays_values_without_nulls; From ec86acbc1fbc0da1e0bec9ad066a5177ec586c96 Mon Sep 17 00:00:00 2001 From: Jonah Gao Date: Mon, 26 Feb 2024 21:33:19 +0800 Subject: [PATCH 17/45] feat: expand `unnest` to accept arbitrary single array expression (#9342) * feat: expand `unnest` to accept any single array expression * unnest null * review feedback --- datafusion/sql/src/expr/function.rs | 58 +++++++++---------- datafusion/sqllogictest/test_files/unnest.slt | 48 +++++++++++---- 2 files changed, 64 insertions(+), 42 deletions(-) diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index f56138066cb6..db572a23cf99 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -16,16 +16,17 @@ // under the License. 
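 
 // After this change, `unnest` planning accepts any *single* argument whose
 // type is an array (List, LargeList, or FixedSizeList), so queries such as
 // `SELECT unnest(range(1, 3))` or `SELECT unnest(array_remove(column1, 12))`
 // plan successfully, while struct and null arguments still report
 // "not implemented".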
use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; +use arrow_schema::DataType; use datafusion_common::{ - exec_err, not_impl_err, plan_datafusion_err, plan_err, DFSchema, DataFusionError, - Dependency, Result, + not_impl_err, plan_datafusion_err, plan_err, DFSchema, DataFusionError, Dependency, + Result, }; use datafusion_expr::expr::{ScalarFunction, Unnest}; use datafusion_expr::function::suggest_valid_function; use datafusion_expr::window_frame::{check_window_frame, regularize_window_order_by}; use datafusion_expr::{ - expr, AggregateFunction, BuiltinScalarFunction, Expr, ScalarFunctionDefinition, - WindowFrame, WindowFunctionDefinition, + expr, AggregateFunction, BuiltinScalarFunction, Expr, ExprSchemable, WindowFrame, + WindowFunctionDefinition, }; use sqlparser::ast::{ Expr as SQLExpr, Function as SQLFunction, FunctionArg, FunctionArgExpr, WindowType, @@ -80,41 +81,34 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { if name.eq("unnest") { let exprs = self.function_args_to_expr(args.clone(), schema, planner_context)?; - - match exprs.len() { + // Currently only one argument is supported + let arg = match exprs.len() { 0 => { - return exec_err!("unnest() requires at least one argument"); - } - 1 => { - if let Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn( - BuiltinScalarFunction::MakeArray, - ), - .. - }) = exprs[0] - { - // valid - } else if let Expr::Column(_) = exprs[0] { - // valid - } else if let Expr::ScalarFunction(ScalarFunction { - func_def: - ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::Struct), - .. - }) = exprs[0] - { - return not_impl_err!("unnest() does not support struct yet"); - } else { - return plan_err!( - "unnest() can only be applied to array and structs and null" - ); - } + return plan_err!("unnest() requires at least one argument"); } + 1 => &exprs[0], _ => { return not_impl_err!( "unnest() does not support multiple arguments yet" ); } + }; + // Check argument type, array types are supported + match arg.get_type(schema)? { + DataType::List(_) + | DataType::LargeList(_) + | DataType::FixedSizeList(_, _) => {} + DataType::Struct(_) => { + return not_impl_err!("unnest() does not support struct yet"); + } + DataType::Null => { + return not_impl_err!("unnest() does not support null yet"); + } + _ => { + return plan_err!( + "unnest() can only be applied to array, struct and null" + ); + } } return Ok(Expr::Unnest(Unnest { exprs })); diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 7e4ce06be203..9990c00f75d2 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -36,7 +36,7 @@ select unnest([1,2,3]); 2 3 -query error DataFusion error: Error during planning: unnest\(\) can only be applied to array and structs and null +query error DataFusion error: This feature is not implemented: unnest\(\) does not support null yet select unnest(null); ## Unnest empty array @@ -71,27 +71,55 @@ NULL NULL ## Unnest column with scalars -# TODO: This should be an error, but unnest is able to process scalar values now. 
-query I +query error DataFusion error: Error during planning: unnest\(\) can only be applied to array, struct and null select unnest(column3) from unnest_table; ----- -1 -2 -3 -NULL ## Unnest multiple columns query error DataFusion error: This feature is not implemented: Only support single unnest expression for now select unnest(column1), unnest(column2) from unnest_table; ## Unnest scalar -query error DataFusion error: Error during planning: unnest\(\) can only be applied to array and structs and null +query error DataFusion error: Error during planning: unnest\(\) can only be applied to array, struct and null select unnest(1); ## Unnest empty expression -query error DataFusion error: Execution error: unnest\(\) requires at least one argument +query error DataFusion error: Error during planning: unnest\(\) requires at least one argument select unnest(); +## Unnest struct expression +query error DataFusion error: This feature is not implemented: unnest\(\) does not support struct yet +select unnest(struct(null)); + + +## Unnest array expression +query I +select unnest(range(1, 3)); +---- +1 +2 + +query I +select unnest(arrow_cast(range(1, 3), 'LargeList(Int64)')); +---- +1 +2 + +query I +select unnest(arrow_cast(range(1, 3), 'FixedSizeList(2, Int64)')); +---- +1 +2 + +query I +select unnest(array_remove(column1, 12)) from unnest_table; +---- +1 +2 +3 +4 +5 +6 + statement ok drop table unnest_table; From c56840734c142fa1766ba5d3b0440733fcfccd05 Mon Sep 17 00:00:00 2001 From: Chris O'Donnell Date: Mon, 26 Feb 2024 12:16:05 -0500 Subject: [PATCH 18/45] fix: flight examples (#9335) * fix: downgrade tonic for arrow compatibility Tonic 0.10 and 0.11 are not API compatible. Arrow 50 depends on tonic 0.10, and datafusion must match that dependency for compatibility reasons. * feat: make nested examples runnable cargo run --example doesn't support nested examples. Nested examples need an explicit block to be runnable. * fix: fix custom catalog typo and formatting * docs: add note about upgrading tonic with arrow * ci: add cargo check for all examples --- ci/scripts/rust_example.sh | 1 + datafusion-examples/Cargo.toml | 27 ++++++++++++++++++- .../examples/external_dependency/catalog.rs | 4 +-- .../examples/flight/flight_sql_server.rs | 4 +-- 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/ci/scripts/rust_example.sh b/ci/scripts/rust_example.sh index fe3696f20865..18a7306b520d 100755 --- a/ci/scripts/rust_example.sh +++ b/ci/scripts/rust_example.sh @@ -20,6 +20,7 @@ set -ex cd datafusion-examples/examples/ cargo fmt --all -- --check +cargo check --examples files=$(ls .) 
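 # `ls .` does not descend into subdirectories such as `flight/` or
 # `external_dependency/`, so the loop below only runs top-level examples;
 # the `cargo check --examples` step added above is what covers the nested
 # examples declared through `[[example]]` blocks in Cargo.toml.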
for filename in $files diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 0fb49a20a8f1..b1a9cbcad5f7 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -29,6 +29,30 @@ license = { workspace = true } authors = { workspace = true } rust-version = { workspace = true } +[[example]] +name = "flight_sql_server" +path = "examples/flight/flight_sql_server.rs" + +[[example]] +name = "flight_server" +path = "examples/flight/flight_server.rs" + +[[example]] +name = "flight_client" +path = "examples/flight/flight_client.rs" + +[[example]] +name = "catalog" +path = "examples/external_dependency/catalog.rs" + +[[example]] +name = "dataframe_to_s3" +path = "examples/external_dependency/dataframe-to-s3.rs" + +[[example]] +name = "query_aws_s3" +path = "examples/external_dependency/query-aws-s3.rs" + [dev-dependencies] arrow = { workspace = true } arrow-flight = { workspace = true } @@ -54,6 +78,7 @@ serde = { version = "1.0.136", features = ["derive"] } serde_json = { workspace = true } tempfile = { workspace = true } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } -tonic = "0.11" +# 0.10 and 0.11 are incompatible. Need to upgrade tonic to 0.11 when upgrading to arrow 51 +tonic = "0.10" url = { workspace = true } uuid = "1.2" diff --git a/datafusion-examples/examples/external_dependency/catalog.rs b/datafusion-examples/examples/external_dependency/catalog.rs index 29e505fb1dcb..a623eafdf3d7 100644 --- a/datafusion-examples/examples/external_dependency/catalog.rs +++ b/datafusion-examples/examples/external_dependency/catalog.rs @@ -24,7 +24,7 @@ use datafusion::{ arrow::util::pretty, catalog::{ schema::SchemaProvider, - {CatalogProviderList, CatalogProvider}, + {CatalogProvider, CatalogProviderList}, }, datasource::{ file_format::{csv::CsvFormat, parquet::ParquetFormat, FileFormat}, @@ -53,7 +53,7 @@ async fn main() -> Result<()> { .unwrap(); let mut ctx = SessionContext::new(); let state = ctx.state(); - let catlist = Arc::new(CustomCatalogProvderList::new()); + let catlist = Arc::new(CustomCatalogProviderList::new()); // use our custom catalog list for context. each context has a single catalog list. 
// context will by default have [`MemoryCatalogProviderList`]
     ctx.register_catalog_list(catlist.clone());

diff --git a/datafusion-examples/examples/flight/flight_sql_server.rs b/datafusion-examples/examples/flight/flight_sql_server.rs
index ed5b86d0b66c..35d475623062 100644
--- a/datafusion-examples/examples/flight/flight_sql_server.rs
+++ b/datafusion-examples/examples/flight/flight_sql_server.rs
@@ -231,7 +231,7 @@ impl FlightSqlService for FlightSqlServiceImpl {
         info!("getting results for {handle}");
         let result = self.get_result(&handle)?;
         // if we get an empty result, create an empty schema
-        let (schema, batches) = match result.get(0) {
+        let (schema, batches) = match result.first() {
             None => (Arc::new(Schema::empty()), vec![]),
             Some(batch) => (batch.schema(), result.clone()),
         };
@@ -287,7 +287,7 @@ impl FlightSqlService for FlightSqlServiceImpl {
             .map_err(|e| status!("Error executing query", e))?;

         // if we get an empty result, create an empty schema
-        let schema = match result.get(0) {
+        let schema = match result.first() {
             None => Schema::empty(),
             Some(batch) => (*batch.schema()).clone(),
         };

From b8c6e0bb3f3f8ab60cf394b46d6aacc616b67d41 Mon Sep 17 00:00:00 2001
From: Trent Hauck
Date: Mon, 26 Feb 2024 10:05:09 -0800
Subject: [PATCH 19/45] docs: update parquet_sql_multiple_files.rs with a
 relative path ex (#9310)

* docs: update parquet_sql_multiple_files.rs with a relative path ex

* style: run cargo fmt

* docs: update comment

* docs: better
---
 .../examples/parquet_sql_multiple_files.rs    | 66 ++++++++++++++++---
 1 file changed, 56 insertions(+), 10 deletions(-)

diff --git a/datafusion-examples/examples/parquet_sql_multiple_files.rs b/datafusion-examples/examples/parquet_sql_multiple_files.rs
index 451de96f2e91..0e2968f20356 100644
--- a/datafusion-examples/examples/parquet_sql_multiple_files.rs
+++ b/datafusion-examples/examples/parquet_sql_multiple_files.rs
@@ -17,31 +17,35 @@

 use datafusion::datasource::file_format::parquet::ParquetFormat;
 use datafusion::datasource::listing::ListingOptions;
-use datafusion::error::Result;
 use datafusion::prelude::*;
-use datafusion_common::{FileType, GetExt};
+use object_store::local::LocalFileSystem;
+use std::path::Path;
 use std::sync::Arc;

 /// This example demonstrates executing a simple query against an Arrow data source (a directory
-/// with multiple Parquet files) and fetching results
+/// with multiple Parquet files) and fetching results. The query is run twice, once showing
+/// how to use `register_listing_table` with an absolute path, and once registering an
+/// ObjectStore to use a relative path.
 #[tokio::main]
-async fn main() -> Result<()> {
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
     // create local execution context
     let ctx = SessionContext::new();

-    let testdata = datafusion::test_util::parquet_test_data();
+    let test_data = datafusion::test_util::parquet_test_data();

     // Configure listing options
     let file_format = ParquetFormat::default().with_enable_pruning(Some(true));
     let listing_options = ListingOptions::new(Arc::new(file_format))
-        .with_file_extension(FileType::PARQUET.get_ext());
+        // This is a workaround for this example since `test_data` contains
+        // many different parquet files,
+        // in practice use FileType::PARQUET.get_ext().
+        .with_file_extension("alltypes_plain.parquet");

-    // Register a listing table - this will use all files in the directory as data sources
-    // for the query
+    // First example where we use an absolute path, which requires no additional setup.
     ctx.register_listing_table(
         "my_table",
-        &format!("file://{testdata}/alltypes_plain.parquet"),
-        listing_options,
+        &format!("file://{test_data}/"),
+        listing_options.clone(),
         None,
         None,
     )
     .await?;
@@ -60,5 +64,47 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     // print the results
     df.show().await?;

+    // Second example where we temporarily move into the test data's parent directory and
+    // simulate a relative path; this requires registering an ObjectStore.
+    let cur_dir = std::env::current_dir()?;
+
+    let test_data_path = Path::new(&test_data);
+    let test_data_path_parent = test_data_path
+        .parent()
+        .ok_or("test_data path needs a parent")?;
+
+    std::env::set_current_dir(test_data_path_parent)?;
+
+    let local_fs = Arc::new(LocalFileSystem::default());
+
+    let u = url::Url::parse("file://./")?;
+    ctx.runtime_env().register_object_store(&u, local_fs);
+
+    // Register a listing table - this will use all files in the directory as data sources
+    // for the query
+    ctx.register_listing_table(
+        "relative_table",
+        "./data",
+        listing_options.clone(),
+        None,
+        None,
+    )
+    .await?;
+
+    // execute the query
+    let df = ctx
+        .sql(
+            "SELECT * \
+        FROM relative_table \
+        LIMIT 1",
+        )
+        .await?;
+
+    // print the results
+    df.show().await?;
+
+    // Reset the current directory
+    std::env::set_current_dir(cur_dir)?;
+
     Ok(())
 }

From a26f583d2766da746ff30199cc7341227526737f Mon Sep 17 00:00:00 2001
From: Trent Hauck
Date: Mon, 26 Feb 2024 10:06:11 -0800
Subject: [PATCH 20/45] tests: add tests for writing hive-partitioned parquet
 (#9316)

* tests: adds tests associated with #9237

* style: clippy
---
 .../datasource/physical_plan/parquet/mod.rs   |  74 --------
 datafusion/core/tests/dataframe/mod.rs        | 160 +++++++++++++++++-
 2 files changed, 158 insertions(+), 76 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs
index badd87084812..3aa1998bde7e 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs
@@ -2066,80 +2066,6 @@ mod tests {
         Ok(())
     }

-    #[tokio::test]
-    async fn write_parquet_results() -> Result<()> {
-        // create partitioned input file and context
-        let tmp_dir = TempDir::new()?;
-        // let mut ctx = create_ctx(&tmp_dir, 4).await?;
-        let ctx = SessionContext::new_with_config(
-            SessionConfig::new().with_target_partitions(8),
-        );
-        let schema = populate_csv_partitions(&tmp_dir, 4, ".csv")?;
-        // register csv file with the execution context
-        ctx.register_csv(
-            "test",
-            tmp_dir.path().to_str().unwrap(),
-            CsvReadOptions::new().schema(&schema),
-        )
-        .await?;
-
-        // register a local file system object store for /tmp directory
-        let local = Arc::new(LocalFileSystem::new_with_prefix(&tmp_dir)?);
-        let local_url = Url::parse("file://local").unwrap();
-        ctx.runtime_env().register_object_store(&local_url, local);
-
-        // execute a simple query and write the results to parquet
-        let out_dir = tmp_dir.as_ref().to_str().unwrap().to_string() + "/out/";
-        let out_dir_url = "file://local/out/";
-        let df = ctx.sql("SELECT c1, c2 FROM test").await?;
-        df.write_parquet(out_dir_url, DataFrameWriteOptions::new(), None)
-            .await?;
-        // write_parquet(&mut ctx, "SELECT c1, c2 FROM test", &out_dir, None).await?;
-
-        // create a new context and verify that the results were saved to a partitioned parquet file
-        let ctx = SessionContext::new();
-
-        // get write_id
-        let mut paths = fs::read_dir(&out_dir).unwrap();
-        let path = paths.next();
-        let name = path
-            .unwrap()?
-            .path()
-            .file_name()
-            .expect("Should be a file name")
-            .to_str()
-            .expect("Should be a str")
-            .to_owned();
-        let (parsed_id, _) = name.split_once('_').expect("File should contain _ !");
-        let write_id = parsed_id.to_owned();
-
-        // register each partition as well as the top level dir
-        ctx.register_parquet(
-            "part0",
-            &format!("{out_dir}/{write_id}_0.parquet"),
-            ParquetReadOptions::default(),
-        )
-        .await?;
-
-        ctx.register_parquet("allparts", &out_dir, ParquetReadOptions::default())
-            .await?;
-
-        let part0 = ctx.sql("SELECT c1, c2 FROM part0").await?.collect().await?;
-        let allparts = ctx
-            .sql("SELECT c1, c2 FROM allparts")
-            .await?
-            .collect()
-            .await?;
-
-        let allparts_count: usize = allparts.iter().map(|batch| batch.num_rows()).sum();
-
-        assert_eq!(part0[0].schema(), allparts[0].schema());
-
-        assert_eq!(allparts_count, 40);
-
-        Ok(())
-    }
-
     fn logical2physical(expr: &Expr, schema: &Schema) -> Arc<dyn PhysicalExpr> {
         let df_schema = schema.clone().to_dfschema().unwrap();
         let execution_props = ExecutionProps::new();
diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs
index b08b2b8fc7a2..ee842004172c 100644
--- a/datafusion/core/tests/dataframe/mod.rs
+++ b/datafusion/core/tests/dataframe/mod.rs
@@ -30,15 +30,19 @@ use arrow::{
 };
 use arrow_array::Float32Array;
 use arrow_schema::ArrowError;
+use object_store::local::LocalFileSystem;
+use std::fs;
 use std::sync::Arc;
+use tempfile::TempDir;
+use url::Url;

-use datafusion::dataframe::DataFrame;
+use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
 use datafusion::datasource::MemTable;
 use datafusion::error::Result;
 use datafusion::execution::context::{SessionContext, SessionState};
 use datafusion::prelude::JoinType;
 use datafusion::prelude::{CsvReadOptions, ParquetReadOptions};
-use datafusion::test_util::parquet_test_data;
+use datafusion::test_util::{parquet_test_data, populate_csv_partitions};
 use datafusion::{assert_batches_eq, assert_batches_sorted_eq};
 use datafusion_common::{assert_contains, DataFusionError, ScalarValue, UnnestOptions};
 use datafusion_execution::config::SessionConfig;
@@ -1896,3 +1900,155 @@ async fn test_dataframe_placeholder_missing_param_values() -> Result<()> {

     Ok(())
 }
+
+#[tokio::test]
+async fn write_partitioned_parquet_results() -> Result<()> {
+    // create partitioned input file and context
+    let tmp_dir = TempDir::new()?;
+
+    let ctx = SessionContext::new();
+
+    // Create an in memory table with schema C1 and C2, both strings
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("c1", DataType::Utf8, false),
+        Field::new("c2", DataType::Utf8, false),
+    ]));
+
+    let record_batch = RecordBatch::try_new(
+        schema.clone(),
+        vec![
+            Arc::new(StringArray::from(vec!["abc", "def"])),
+            Arc::new(StringArray::from(vec!["123", "456"])),
+        ],
+    )?;
+
+    let mem_table = Arc::new(MemTable::try_new(schema, vec![vec![record_batch]])?);
+
+    // Register the table in the context
+    ctx.register_table("test", mem_table)?;
+
+    let local = Arc::new(LocalFileSystem::new_with_prefix(&tmp_dir)?);
+    let local_url = Url::parse("file://local").unwrap();
+    ctx.runtime_env().register_object_store(&local_url, local);
+
+    // execute a simple query and write the results to parquet
+    let out_dir = tmp_dir.as_ref().to_str().unwrap().to_string() + "/out/";
+    let out_dir_url = format!("file://{out_dir}");
+
+    // Write the results to parquet with partitioning
+    let df = ctx.sql("SELECT c1, c2 FROM test").await?;
+    let df_write_options =
+        DataFrameWriteOptions::new().with_partition_by(vec![String::from("c2")]);
+
+    df.write_parquet(&out_dir_url, df_write_options, None)
+        .await?;
+
+    // Explicitly read the parquet file at c2=123 to verify the physical files are partitioned
+    let partitioned_file = format!("{out_dir}/c2=123", out_dir = out_dir);
+    let filtered_df = ctx
+        .read_parquet(&partitioned_file, ParquetReadOptions::default())
+        .await?;
+
+    // Check that the c2 column is gone and that c1 is abc.
+    let results = filtered_df.collect().await?;
+    let expected = ["+-----+", "| c1 |", "+-----+", "| abc |", "+-----+"];
+
+    assert_batches_eq!(expected, &results);
+
+    // Read the entire set of parquet files
+    let df = ctx
+        .read_parquet(
+            &out_dir_url,
+            ParquetReadOptions::default()
+                .table_partition_cols(vec![(String::from("c2"), DataType::Utf8)]),
+        )
+        .await?;
+
+    // Check that the df has the entire set of data
+    let results = df.collect().await?;
+    let expected = [
+        "+-----+-----+",
+        "| c1 | c2 |",
+        "+-----+-----+",
+        "| abc | 123 |",
+        "| def | 456 |",
+        "+-----+-----+",
+    ];
+
+    assert_batches_eq!(expected, &results);
+
+    Ok(())
+}
+
+#[tokio::test]
+async fn write_parquet_results() -> Result<()> {
+    // create partitioned input file and context
+    let tmp_dir = TempDir::new()?;
+    // let mut ctx = create_ctx(&tmp_dir, 4).await?;
+    let ctx =
+        SessionContext::new_with_config(SessionConfig::new().with_target_partitions(8));
+    let schema = populate_csv_partitions(&tmp_dir, 4, ".csv")?;
+    // register csv file with the execution context
+    ctx.register_csv(
+        "test",
+        tmp_dir.path().to_str().unwrap(),
+        CsvReadOptions::new().schema(&schema),
+    )
+    .await?;
+
+    // register a local file system object store for /tmp directory
+    let local = Arc::new(LocalFileSystem::new_with_prefix(&tmp_dir)?);
+    let local_url = Url::parse("file://local").unwrap();
+    ctx.runtime_env().register_object_store(&local_url, local);
+
+    // execute a simple query and write the results to parquet
+    let out_dir = tmp_dir.as_ref().to_str().unwrap().to_string() + "/out/";
+    let out_dir_url = "file://local/out/";
+    let df = ctx.sql("SELECT c1, c2 FROM test").await?;
+    df.write_parquet(out_dir_url, DataFrameWriteOptions::new(), None)
+        .await?;
+    // write_parquet(&mut ctx, "SELECT c1, c2 FROM test", &out_dir, None).await?;
+
+    // create a new context and verify that the results were saved to a partitioned parquet file
+    let ctx = SessionContext::new();
+
+    // get write_id
+    let mut paths = fs::read_dir(&out_dir).unwrap();
+    let path = paths.next();
+    let name = path
+        .unwrap()?
+        .path()
+        .file_name()
+        .expect("Should be a file name")
+        .to_str()
+        .expect("Should be a str")
+        .to_owned();
+    let (parsed_id, _) = name.split_once('_').expect("File should contain _ !");
+    let write_id = parsed_id.to_owned();
+
+    // register each partition as well as the top level dir
+    ctx.register_parquet(
+        "part0",
+        &format!("{out_dir}/{write_id}_0.parquet"),
+        ParquetReadOptions::default(),
+    )
+    .await?;
+
+    ctx.register_parquet("allparts", &out_dir, ParquetReadOptions::default())
+        .await?;
+
+    let part0 = ctx.sql("SELECT c1, c2 FROM part0").await?.collect().await?;
+    let allparts = ctx
+        .sql("SELECT c1, c2 FROM allparts")
+        .await?
+        .collect()
+        .await?;
+
+    let allparts_count: usize = allparts.iter().map(|batch| batch.num_rows()).sum();
+
+    assert_eq!(part0[0].schema(), allparts[0].schema());
+
+    assert_eq!(allparts_count, 40);
+
+    Ok(())
+}

From b55d0edb0e8b9e8ef8cfaaca164dd5ae93a46854 Mon Sep 17 00:00:00 2001
From: junxiangMu <63799833+guojidan@users.noreply.github.com>
Date: Tue, 27 Feb 2024 04:31:51 +0800
Subject: [PATCH 21/45] feature: support nvl(ifnull) function (#9284)

* feature: support nvl(ifnull) function

* add sqllogictest

* add docs entry

* Update docs/source/user-guide/sql/scalar_functions.md

Co-authored-by: Jonah Gao

* fix some code

* fix docs

---------

Co-authored-by: Jonah Gao
---
 datafusion/functions/src/core/mod.rs          |   5 +-
 datafusion/functions/src/core/nvl.rs          | 277 ++++++++++++++++++
 datafusion/sqllogictest/test_files/nvl.slt    | 120 ++++++++
 .../source/user-guide/sql/scalar_functions.md |  21 ++
 4 files changed, 422 insertions(+), 1 deletion(-)
 create mode 100644 datafusion/functions/src/core/nvl.rs
 create mode 100644 datafusion/sqllogictest/test_files/nvl.slt

diff --git a/datafusion/functions/src/core/mod.rs b/datafusion/functions/src/core/mod.rs
index 9aab4bd450d1..db47c622188d 100644
--- a/datafusion/functions/src/core/mod.rs
+++ b/datafusion/functions/src/core/mod.rs
@@ -18,12 +18,15 @@
 //! "core" DataFusion functions

 mod nullif;
+mod nvl;

 // create UDFs
 make_udf_function!(nullif::NullIfFunc, NULLIF, nullif);
+make_udf_function!(nvl::NVLFunc, NVL, nvl);

 // Export the functions out of this package, both as expr_fn as well as a list of functions
 export_functions!(
-    (nullif, arg_1 arg_2, "returns NULL if value1 equals value2; otherwise it returns value1. This can be used to perform the inverse operation of the COALESCE expression.")
+    (nullif, arg_1 arg_2, "returns NULL if value1 equals value2; otherwise it returns value1. This can be used to perform the inverse operation of the COALESCE expression."),
+    (nvl, arg_1 arg_2, "returns value2 if value1 is NULL; otherwise it returns value1")
 );

diff --git a/datafusion/functions/src/core/nvl.rs b/datafusion/functions/src/core/nvl.rs
new file mode 100644
index 000000000000..6d6ad1cdeb21
--- /dev/null
+++ b/datafusion/functions/src/core/nvl.rs
@@ -0,0 +1,277 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

+use arrow::datatypes::DataType;
+use datafusion_common::{internal_err, Result, DataFusionError};
+use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
+use arrow::compute::kernels::zip::zip;
+use arrow::compute::is_not_null;
+use arrow::array::Array;
+
+#[derive(Debug)]
+pub(super) struct NVLFunc {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+/// Currently supported types by the nvl/ifnull function.
+/// The order of these types corresponds to the order in which coercion applies
+/// This should thus be from least informative to most informative
+static SUPPORTED_NVL_TYPES: &[DataType] = &[
+    DataType::Boolean,
+    DataType::UInt8,
+    DataType::UInt16,
+    DataType::UInt32,
+    DataType::UInt64,
+    DataType::Int8,
+    DataType::Int16,
+    DataType::Int32,
+    DataType::Int64,
+    DataType::Float32,
+    DataType::Float64,
+    DataType::Utf8,
+    DataType::LargeUtf8,
+];
+
+impl NVLFunc {
+    pub fn new() -> Self {
+        Self {
+            signature:
+            Signature::uniform(2, SUPPORTED_NVL_TYPES.to_vec(),
+                Volatility::Immutable,
+            ),
+            aliases: vec![String::from("ifnull")],
+        }
+    }
+}
+
+impl ScalarUDFImpl for NVLFunc {
+    fn as_any(&self) -> &dyn std::any::Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "nvl"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        // NVL has two args and they might get coerced, get a preview of this
+        let coerced_types = datafusion_expr::type_coercion::functions::data_types(arg_types, &self.signature);
+        coerced_types.map(|typs| typs[0].clone())
+            .map_err(|e| e.context("Failed to coerce arguments for NVL")
+        )
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        nvl_func(args)
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+fn nvl_func(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+    if args.len() != 2 {
+        return internal_err!(
+            "{:?} args were supplied but NVL/IFNULL takes exactly two args",
+            args.len()
+        );
+    }
+    let (lhs_array, rhs_array) = match (&args[0], &args[1]) {
+        (ColumnarValue::Array(lhs), ColumnarValue::Scalar(rhs)) => {
+            (lhs.clone(), rhs.to_array_of_size(lhs.len())?)
+        }
+        (ColumnarValue::Array(lhs), ColumnarValue::Array(rhs)) => {
+            (lhs.clone(), rhs.clone())
+        }
+        (ColumnarValue::Scalar(lhs), ColumnarValue::Array(rhs)) => {
+            (lhs.to_array_of_size(rhs.len())?, rhs.clone())
+        }
+        (ColumnarValue::Scalar(lhs), ColumnarValue::Scalar(rhs)) => {
+            let mut current_value = lhs;
+            if lhs.is_null() {
+                current_value = rhs;
+            }
+            return Ok(ColumnarValue::Scalar(current_value.clone()));
+        }
+    };
+    let to_apply = is_not_null(&lhs_array)?;
+    let value = zip(&to_apply, &lhs_array, &rhs_array)?;
+    Ok(ColumnarValue::Array(value))
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use arrow::array::*;
+
+    use super::*;
+    use datafusion_common::{Result, ScalarValue};
+
+    #[test]
+    fn nvl_int32() -> Result<()> {
+        let a = Int32Array::from(vec![
+            Some(1),
+            Some(2),
+            None,
+            None,
+            Some(3),
+            None,
+            None,
+            Some(4),
+            Some(5),
+        ]);
+        let a = ColumnarValue::Array(Arc::new(a));
+
+        let lit_array = ColumnarValue::Scalar(ScalarValue::Int32(Some(6i32)));
+
+        let result = nvl_func(&[a, lit_array])?;
+        let result = result.into_array(0).expect("Failed to convert to array");
+
+        let expected = Arc::new(Int32Array::from(vec![
+            Some(1),
+            Some(2),
+            Some(6),
+            Some(6),
+            Some(3),
+            Some(6),
+            Some(6),
+            Some(4),
+            Some(5),
+        ])) as ArrayRef;
+        assert_eq!(expected.as_ref(), result.as_ref());
+        Ok(())
+    }
+
+    #[test]
+    // Ensure that arrays with no nulls can also invoke nvl() correctly
+    fn nvl_int32_nonulls() -> Result<()> {
+        let a = Int32Array::from(vec![1, 3, 10, 7, 8, 1, 2, 4, 5]);
+        let a = ColumnarValue::Array(Arc::new(a));
+
+        let lit_array = ColumnarValue::Scalar(ScalarValue::Int32(Some(20i32)));
+
+        let result = nvl_func(&[a, lit_array])?;
+        let result = result.into_array(0).expect("Failed to convert to array");
+
+        let expected = Arc::new(Int32Array::from(vec![
+            Some(1),
+            Some(3),
Some(10), + Some(7), + Some(8), + Some(1), + Some(2), + Some(4), + Some(5), + ])) as ArrayRef; + assert_eq!(expected.as_ref(), result.as_ref()); + Ok(()) + } + + #[test] + fn nvl_boolean() -> Result<()> { + let a = BooleanArray::from(vec![Some(true), Some(false), None]); + let a = ColumnarValue::Array(Arc::new(a)); + + let lit_array = ColumnarValue::Scalar(ScalarValue::Boolean(Some(false))); + + let result = nvl_func(&[a, lit_array])?; + let result = result.into_array(0).expect("Failed to convert to array"); + + let expected = + Arc::new(BooleanArray::from(vec![Some(true), Some(false), Some(false)])) as ArrayRef; + + assert_eq!(expected.as_ref(), result.as_ref()); + Ok(()) + } + + #[test] + fn nvl_string() -> Result<()> { + let a = StringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]); + let a = ColumnarValue::Array(Arc::new(a)); + + let lit_array = ColumnarValue::Scalar(ScalarValue::from("bax")); + + let result = nvl_func(&[a, lit_array])?; + let result = result.into_array(0).expect("Failed to convert to array"); + + let expected = Arc::new(StringArray::from(vec![ + Some("foo"), + Some("bar"), + Some("bax"), + Some("baz"), + ])) as ArrayRef; + + assert_eq!(expected.as_ref(), result.as_ref()); + Ok(()) + } + + #[test] + fn nvl_literal_first() -> Result<()> { + let a = Int32Array::from(vec![Some(1), Some(2), None, None, Some(3), Some(4)]); + let a = ColumnarValue::Array(Arc::new(a)); + + let lit_array = ColumnarValue::Scalar(ScalarValue::Int32(Some(2i32))); + + let result = nvl_func(&[lit_array, a])?; + let result = result.into_array(0).expect("Failed to convert to array"); + + let expected = Arc::new(Int32Array::from(vec![ + Some(2), + Some(2), + Some(2), + Some(2), + Some(2), + Some(2), + ])) as ArrayRef; + assert_eq!(expected.as_ref(), result.as_ref()); + Ok(()) + } + + #[test] + fn nvl_scalar() -> Result<()> { + let a_null = ColumnarValue::Scalar(ScalarValue::Int32(None)); + let b_null = ColumnarValue::Scalar(ScalarValue::Int32(Some(2i32))); + + let result_null = nvl_func(&[a_null, b_null])?; + let result_null = result_null.into_array(1).expect("Failed to convert to array"); + + let expected_null = Arc::new(Int32Array::from(vec![Some(2i32)])) as ArrayRef; + + assert_eq!(expected_null.as_ref(), result_null.as_ref()); + + let a_nnull = ColumnarValue::Scalar(ScalarValue::Int32(Some(2i32))); + let b_nnull = ColumnarValue::Scalar(ScalarValue::Int32(Some(1i32))); + + let result_nnull = nvl_func(&[a_nnull, b_nnull])?; + let result_nnull = result_nnull + .into_array(1) + .expect("Failed to convert to array"); + + let expected_nnull = Arc::new(Int32Array::from(vec![Some(2i32)])) as ArrayRef; + assert_eq!(expected_nnull.as_ref(), result_nnull.as_ref()); + + Ok(()) + } +} diff --git a/datafusion/sqllogictest/test_files/nvl.slt b/datafusion/sqllogictest/test_files/nvl.slt new file mode 100644 index 000000000000..81e79e1eb5b0 --- /dev/null +++ b/datafusion/sqllogictest/test_files/nvl.slt @@ -0,0 +1,120 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at

+# http://www.apache.org/licenses/LICENSE-2.0

+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+statement ok
+CREATE TABLE test(
+    int_field INT,
+    bool_field BOOLEAN,
+    text_field TEXT,
+    more_ints INT
+) as VALUES
+  (1, true, 'abc', 2),
+  (2, false, 'def', 2),
+  (3, NULL, 'ghij', 3),
+  (NULL, NULL, NULL, 4),
+  (4, false, 'zxc', 5),
+  (NULL, true, NULL, 6)
+;
+
+# Arrays tests
+query I
+SELECT NVL(int_field, 2) FROM test;
+----
+1
+2
+3
+2
+4
+2
+
+
+query B
+SELECT NVL(bool_field, false) FROM test;
+----
+true
+false
+false
+false
+false
+true
+
+
+query T
+SELECT NVL(text_field, 'zxb') FROM test;
+----
+abc
+def
+ghij
+zxb
+zxc
+zxb
+
+
+query I
+SELECT IFNULL(int_field, more_ints) FROM test;
+----
+1
+2
+3
+4
+4
+6
+
+
+query I
+SELECT NVL(3, int_field) FROM test;
+----
+3
+3
+3
+3
+3
+3
+
+
+# Scalar values tests
+query I
+SELECT NVL(1, 1);
+----
+1
+
+query I
+SELECT NVL(1, 3);
+----
+1
+
+query I
+SELECT NVL(NULL, NULL);
+----
+NULL
diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md
index 707e8c24b326..d4eb5944ad09 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -569,6 +569,8 @@ trunc(numeric_expression[, decimal_places])

 - [coalesce](#coalesce)
 - [nullif](#nullif)
+- [nvl](#nvl)
+- [ifnull](#ifnull)

 ### `coalesce`

@@ -603,6 +605,25 @@ nullif(expression1, expression2)
 - **expression2**: Expression to compare to expression1.
   Can be a constant, column, or function, and any combination of arithmetic operators.

+### `nvl`
+
+Returns _expression2_ if _expression1_ is NULL; otherwise it returns _expression1_.
+
+```
+nvl(expression1, expression2)
+```
+
+#### Arguments
+
+- **expression1**: return if expression1 is not NULL.
+  Can be a constant, column, or function, and any combination of arithmetic operators.
+- **expression2**: return if expression1 is NULL.
+  Can be a constant, column, or function, and any combination of arithmetic operators.
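+
+#### Example
+
+A short sketch of the behavior, mirroring the `nvl.slt` cases above:
+
+```sql
+SELECT nvl(1, 3);       -- 1: expression1 is not NULL
+SELECT nvl(NULL, 3);    -- 3: expression1 is NULL
+SELECT nvl(NULL, NULL); -- NULL
+```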
+ +### `ifnull` + +_Alias of [nvl](#nvl)._ + ## String Functions - [ascii](#ascii) From 85f7a8e88e3596b3ec900c43fabb8f7f42bbea5c Mon Sep 17 00:00:00 2001 From: Junhao Liu Date: Mon, 26 Feb 2024 18:24:16 -0600 Subject: [PATCH 22/45] Move abs to datafusion_functions (#9313) * feat: move abs to datafusion_functions * fix proto * fix proto * fix CI vendored code * Fix proto * add support type * fix signature * fix typo * fix test cases * disable a test case * remove old code from math_expressions * feat: add test * fix clippy * use unknown for proto * fix unknown proto --- datafusion/expr/src/built_in_function.rs | 7 - datafusion/expr/src/expr.rs | 5 - datafusion/expr/src/expr_fn.rs | 2 - datafusion/functions/src/math/abs.rs | 177 ++++++++++++++++++ datafusion/functions/src/math/mod.rs | 8 +- datafusion/physical-expr/src/functions.rs | 4 - .../physical-expr/src/math_expressions.rs | 93 +-------- datafusion/proto/proto/datafusion.proto | 4 +- datafusion/proto/src/generated/pbjson.rs | 6 +- datafusion/proto/src/generated/prost.rs | 8 +- .../proto/src/logical_plan/from_proto.rs | 6 +- datafusion/proto/src/logical_plan/to_proto.rs | 1 - 12 files changed, 198 insertions(+), 123 deletions(-) create mode 100644 datafusion/functions/src/math/abs.rs diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 8b4e65121c79..cf1e73f780ad 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -42,8 +42,6 @@ use strum_macros::EnumIter; #[derive(Debug, Clone, PartialEq, Eq, Hash, EnumIter, Copy)] pub enum BuiltinScalarFunction { // math functions - /// abs - Abs, /// acos Acos, /// asin @@ -364,7 +362,6 @@ impl BuiltinScalarFunction { pub fn volatility(&self) -> Volatility { match self { // Immutable scalar builtins - BuiltinScalarFunction::Abs => Volatility::Immutable, BuiltinScalarFunction::Acos => Volatility::Immutable, BuiltinScalarFunction::Asin => Volatility::Immutable, BuiltinScalarFunction::Atan => Volatility::Immutable, @@ -868,8 +865,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ArrowTypeof => Ok(Utf8), - BuiltinScalarFunction::Abs => Ok(input_expr_types[0].clone()), - BuiltinScalarFunction::OverLay => { utf8_to_str_type(&input_expr_types[0], "overlay") } @@ -1338,7 +1333,6 @@ impl BuiltinScalarFunction { Signature::uniform(2, vec![Int64], self.volatility()) } BuiltinScalarFunction::ArrowTypeof => Signature::any(1, self.volatility()), - BuiltinScalarFunction::Abs => Signature::any(1, self.volatility()), BuiltinScalarFunction::OverLay => Signature::one_of( vec![ Exact(vec![Utf8, Utf8, Int64, Int64]), @@ -1444,7 +1438,6 @@ impl BuiltinScalarFunction { /// Returns all names that can be used to call this function pub fn aliases(&self) -> &'static [&'static str] { match self { - BuiltinScalarFunction::Abs => &["abs"], BuiltinScalarFunction::Acos => &["acos"], BuiltinScalarFunction::Acosh => &["acosh"], BuiltinScalarFunction::Asin => &["asin"], diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index f40ccb6cdb58..c3d9269d1559 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -2033,11 +2033,6 @@ mod test { .is_volatile() .unwrap() ); - assert!( - !ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::Abs) - .is_volatile() - .unwrap() - ); // UDF #[derive(Debug)] diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 4aa270e6dde6..55bd40a18900 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ 
-557,7 +557,6 @@ nary_scalar_expr!(
     trunc,
     "truncate toward zero, with optional precision"
 );
-scalar_expr!(Abs, abs, num, "absolute value");
 scalar_expr!(Signum, signum, num, "sign of the argument (-1, 0, +1) ");
 scalar_expr!(Exp, exp, num, "exponential");
 scalar_expr!(Gcd, gcd, arg_1 arg_2, "greatest common divisor");
@@ -1354,7 +1353,6 @@ mod test {
         test_nary_scalar_expr!(Round, round, input, decimal_places);
         test_nary_scalar_expr!(Trunc, trunc, num);
         test_nary_scalar_expr!(Trunc, trunc, num, precision);
-        test_unary_scalar_expr!(Abs, abs);
         test_unary_scalar_expr!(Signum, signum);
         test_unary_scalar_expr!(Exp, exp);
         test_unary_scalar_expr!(Log2, log2);
diff --git a/datafusion/functions/src/math/abs.rs b/datafusion/functions/src/math/abs.rs
new file mode 100644
index 000000000000..21ca37fb8ec3
--- /dev/null
+++ b/datafusion/functions/src/math/abs.rs
@@ -0,0 +1,177 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! math expressions
+
+use arrow::array::Decimal128Array;
+use arrow::array::Decimal256Array;
+use arrow::array::Int16Array;
+use arrow::array::Int32Array;
+use arrow::array::Int64Array;
+use arrow::array::Int8Array;
+use arrow::datatypes::DataType;
+use datafusion_common::not_impl_err;
+use datafusion_common::plan_datafusion_err;
+use datafusion_common::{internal_err, Result, DataFusionError};
+use datafusion_expr::utils;
+use datafusion_expr::ColumnarValue;
+
+use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
+use std::any::Any;
+use std::sync::Arc;
+use arrow::array::{ArrayRef, Float32Array, Float64Array};
+use arrow::error::ArrowError;
+
+type MathArrayFunction = fn(&Vec<ArrayRef>) -> Result<ArrayRef>;
+
+macro_rules! make_abs_function {
+    ($ARRAY_TYPE:ident) => {{
+        |args: &Vec<ArrayRef>| {
+            let array = downcast_arg!(&args[0], "abs arg", $ARRAY_TYPE);
+            let res: $ARRAY_TYPE = array.unary(|x| x.abs());
+            Ok(Arc::new(res) as ArrayRef)
+        }
+    }};
+}
+
+macro_rules! make_try_abs_function {
+    ($ARRAY_TYPE:ident) => {{
+        |args: &Vec<ArrayRef>| {
+            let array = downcast_arg!(&args[0], "abs arg", $ARRAY_TYPE);
+            let res: $ARRAY_TYPE = array.try_unary(|x| {
+                x.checked_abs().ok_or_else(|| {
+                    ArrowError::ComputeError(format!(
+                        "{} overflow on abs({})",
+                        stringify!($ARRAY_TYPE),
+                        x
+                    ))
+                })
+            })?;
+            Ok(Arc::new(res) as ArrayRef)
+        }
+    }};
+}
+
+macro_rules! make_decimal_abs_function {
+    ($ARRAY_TYPE:ident) => {{
+        |args: &Vec<ArrayRef>| {
+            let array = downcast_arg!(&args[0], "abs arg", $ARRAY_TYPE);
+            let res: $ARRAY_TYPE = array
+                .unary(|x| x.wrapping_abs())
+                .with_data_type(args[0].data_type().clone());
+            Ok(Arc::new(res) as ArrayRef)
+        }
+    }};
+}
+
+/// Abs SQL function
+/// Return different implementations based on input datatype to reduce branches during execution
+fn create_abs_function(
+    input_data_type: &DataType,
+) -> Result<MathArrayFunction> {
+    match input_data_type {
+        DataType::Float32 => Ok(make_abs_function!(Float32Array)),
+        DataType::Float64 => Ok(make_abs_function!(Float64Array)),
+
+        // Types that may overflow, such as abs(-128_i8).
+        DataType::Int8 => Ok(make_try_abs_function!(Int8Array)),
+        DataType::Int16 => Ok(make_try_abs_function!(Int16Array)),
+        DataType::Int32 => Ok(make_try_abs_function!(Int32Array)),
+        DataType::Int64 => Ok(make_try_abs_function!(Int64Array)),
+
+        // Types of results are the same as the input.
+        DataType::Null
+        | DataType::UInt8
+        | DataType::UInt16
+        | DataType::UInt32
+        | DataType::UInt64 => Ok(|args: &Vec<ArrayRef>| Ok(args[0].clone())),
+
+        // Decimal types
+        DataType::Decimal128(_, _) => Ok(make_decimal_abs_function!(Decimal128Array)),
+        DataType::Decimal256(_, _) => Ok(make_decimal_abs_function!(Decimal256Array)),
+
+        other => not_impl_err!("Unsupported data type {other:?} for function abs"),
+    }
+}
+#[derive(Debug)]
+pub(super) struct AbsFunc {
+    signature: Signature,
+}
+
+impl AbsFunc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::any(1, Volatility::Immutable)
+        }
+    }
+}
+
+impl ScalarUDFImpl for AbsFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn name(&self) -> &str {
+        "abs"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        if arg_types.len() != 1 {
+            return Err(plan_datafusion_err!(
+                "{}",
+                utils::generate_signature_error_msg(
+                    self.name(),
+                    self.signature().clone(),
+                    arg_types,
+                )
+            ));
+        }
+        match arg_types[0] {
+            DataType::Float32 => Ok(DataType::Float32),
+            DataType::Float64 => Ok(DataType::Float64),
+            DataType::Int8 => Ok(DataType::Int8),
+            DataType::Int16 => Ok(DataType::Int16),
+            DataType::Int32 => Ok(DataType::Int32),
+            DataType::Int64 => Ok(DataType::Int64),
+            DataType::Null => Ok(DataType::Null),
+            DataType::UInt8 => Ok(DataType::UInt8),
+            DataType::UInt16 => Ok(DataType::UInt16),
+            DataType::UInt32 => Ok(DataType::UInt32),
+            DataType::UInt64 => Ok(DataType::UInt64),
+            DataType::Decimal128(precision, scale) => Ok(DataType::Decimal128(precision, scale)),
+            DataType::Decimal256(precision, scale) => Ok(DataType::Decimal256(precision, scale)),
+            _ => not_impl_err!("Unsupported data type {} for function abs", arg_types[0].to_string()),
+        }
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        let args = ColumnarValue::values_to_arrays(args)?;
+
+        if args.len() != 1 {
+            return internal_err!("abs function requires 1 argument, got {}", args.len());
+        }
+
+        let input_data_type = args[0].data_type();
+        let abs_fun = create_abs_function(input_data_type)?;
+
+        let arr = abs_fun(&args)?;
+        Ok(ColumnarValue::Array(arr))
+    }
+}
diff --git a/datafusion/functions/src/math/mod.rs b/datafusion/functions/src/math/mod.rs
index 873625948a35..9d13103ef23f 100644
--- a/datafusion/functions/src/math/mod.rs
+++ b/datafusion/functions/src/math/mod.rs
@@ -18,12 +18,14 @@
 //!
"math" DataFusion functions mod nans; +mod abs; // create UDFs make_udf_function!(nans::IsNanFunc, ISNAN, isnan); +make_udf_function!(abs::AbsFunc, ABS, abs); // Export the functions out of this package, both as expr_fn as well as a list of functions export_functions!( - (isnan, num, "returns true if a given number is +NaN or -NaN otherwise returns false") -); - + (isnan, num, "returns true if a given number is +NaN or -NaN otherwise returns false"), + (abs, num, "returns the absolute value of a given number") +); \ No newline at end of file diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 8446a65d72c8..0dc3f96dc12a 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -260,9 +260,6 @@ pub fn create_physical_fun( ) -> Result { Ok(match fun { // math functions - BuiltinScalarFunction::Abs => Arc::new(|args| { - make_scalar_function_inner(math_expressions::abs_invoke)(args) - }), BuiltinScalarFunction::Acos => Arc::new(math_expressions::acos), BuiltinScalarFunction::Asin => Arc::new(math_expressions::asin), BuiltinScalarFunction::Atan => Arc::new(math_expressions::atan), @@ -3075,7 +3072,6 @@ mod tests { let funs = [ BuiltinScalarFunction::Concat, BuiltinScalarFunction::ToTimestamp, - BuiltinScalarFunction::Abs, BuiltinScalarFunction::Repeat, ]; diff --git a/datafusion/physical-expr/src/math_expressions.rs b/datafusion/physical-expr/src/math_expressions.rs index af66862aecc5..b622aee8e2b3 100644 --- a/datafusion/physical-expr/src/math_expressions.rs +++ b/datafusion/physical-expr/src/math_expressions.rs @@ -18,15 +18,11 @@ //! Math expressions use arrow::array::ArrayRef; -use arrow::array::{ - BooleanArray, Decimal128Array, Decimal256Array, Float32Array, Float64Array, - Int16Array, Int32Array, Int64Array, Int8Array, -}; +use arrow::array::{BooleanArray, Float32Array, Float64Array, Int64Array}; use arrow::datatypes::DataType; -use arrow::error::ArrowError; +use datafusion_common::internal_err; use datafusion_common::ScalarValue; use datafusion_common::ScalarValue::{Float32, Int64}; -use datafusion_common::{internal_err, not_impl_err}; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::ColumnarValue; use rand::{thread_rng, Rng}; @@ -35,8 +31,6 @@ use std::iter; use std::mem::swap; use std::sync::Arc; -type MathArrayFunction = fn(&[ArrayRef]) -> Result; - macro_rules! downcast_compute_op { ($ARRAY:expr, $NAME:expr, $FUNC:ident, $TYPE:ident) => {{ let n = $ARRAY.as_any().downcast_ref::<$TYPE>(); @@ -176,7 +170,6 @@ math_unary_function!("acosh", acosh); math_unary_function!("atanh", atanh); math_unary_function!("floor", floor); math_unary_function!("ceil", ceil); -math_unary_function!("abs", abs); math_unary_function!("signum", signum); math_unary_function!("exp", exp); math_unary_function!("ln", ln); @@ -673,88 +666,6 @@ fn compute_truncate64(x: f64, y: i64) -> f64 { (x * factor).round() / factor } -macro_rules! make_abs_function { - ($ARRAY_TYPE:ident) => {{ - |args: &[ArrayRef]| { - let array = downcast_arg!(&args[0], "abs arg", $ARRAY_TYPE); - let res: $ARRAY_TYPE = array.unary(|x| x.abs()); - Ok(Arc::new(res) as ArrayRef) - } - }}; -} - -macro_rules! 
make_try_abs_function { - ($ARRAY_TYPE:ident) => {{ - |args: &[ArrayRef]| { - let array = downcast_arg!(&args[0], "abs arg", $ARRAY_TYPE); - let res: $ARRAY_TYPE = array.try_unary(|x| { - x.checked_abs().ok_or_else(|| { - ArrowError::ComputeError(format!( - "{} overflow on abs({})", - stringify!($ARRAY_TYPE), - x - )) - }) - })?; - Ok(Arc::new(res) as ArrayRef) - } - }}; -} - -macro_rules! make_decimal_abs_function { - ($ARRAY_TYPE:ident) => {{ - |args: &[ArrayRef]| { - let array = downcast_arg!(&args[0], "abs arg", $ARRAY_TYPE); - let res: $ARRAY_TYPE = array - .unary(|x| x.wrapping_abs()) - .with_data_type(args[0].data_type().clone()); - Ok(Arc::new(res) as ArrayRef) - } - }}; -} - -/// Abs SQL function -/// Return different implementations based on input datatype to reduce branches during execution -pub(super) fn create_abs_function( - input_data_type: &DataType, -) -> Result { - match input_data_type { - DataType::Float32 => Ok(make_abs_function!(Float32Array)), - DataType::Float64 => Ok(make_abs_function!(Float64Array)), - - // Types that may overflow, such as abs(-128_i8). - DataType::Int8 => Ok(make_try_abs_function!(Int8Array)), - DataType::Int16 => Ok(make_try_abs_function!(Int16Array)), - DataType::Int32 => Ok(make_try_abs_function!(Int32Array)), - DataType::Int64 => Ok(make_try_abs_function!(Int64Array)), - - // Types of results are the same as the input. - DataType::Null - | DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 => Ok(|args: &[ArrayRef]| Ok(args[0].clone())), - - // Decimal types - DataType::Decimal128(_, _) => Ok(make_decimal_abs_function!(Decimal128Array)), - DataType::Decimal256(_, _) => Ok(make_decimal_abs_function!(Decimal256Array)), - - other => not_impl_err!("Unsupported data type {other:?} for function abs"), - } -} - -/// abs() SQL function implementation -pub fn abs_invoke(args: &[ArrayRef]) -> Result { - if args.len() != 1 { - return internal_err!("abs function requires 1 argument, got {}", args.len()); - } - - let input_data_type = args[0].data_type(); - let abs_fun = create_abs_function(input_data_type)?; - - abs_fun(args) -} - #[cfg(test)] mod tests { diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 7673ce86ae1d..d91373f8f8d2 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -545,7 +545,9 @@ message InListNode { } enum ScalarFunction { - Abs = 0; + // 0 was Abs before + // The first enum value must be zero for open enums + unknown = 0; Acos = 1; Asin = 2; Atan = 3; diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 65483f9ac467..964b8890184c 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22321,7 +22321,7 @@ impl serde::Serialize for ScalarFunction { S: serde::Serializer, { let variant = match self { - Self::Abs => "Abs", + Self::Unknown => "unknown", Self::Acos => "Acos", Self::Asin => "Asin", Self::Atan => "Atan", @@ -22464,7 +22464,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { D: serde::Deserializer<'de>, { const FIELDS: &[&str] = &[ - "Abs", + "unknown", "Acos", "Asin", "Atan", @@ -22636,7 +22636,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { E: serde::de::Error, { match value { - "Abs" => Ok(ScalarFunction::Abs), + "unknown" => Ok(ScalarFunction::Unknown), "Acos" => Ok(ScalarFunction::Acos), "Asin" => Ok(ScalarFunction::Asin), "Atan" => Ok(ScalarFunction::Atan), diff --git 
a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index a567269e3356..292aef4402a2 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2633,7 +2633,9 @@ impl JoinConstraint { #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] #[repr(i32)] pub enum ScalarFunction { - Abs = 0, + /// 0 was Abs before + /// The first enum value must be zero for open enums + Unknown = 0, Acos = 1, Asin = 2, Atan = 3, @@ -2776,7 +2778,7 @@ impl ScalarFunction { /// (if the ProtoBuf definition does not change) and safe for programmatic use. pub fn as_str_name(&self) -> &'static str { match self { - ScalarFunction::Abs => "Abs", + ScalarFunction::Unknown => "unknown", ScalarFunction::Acos => "Acos", ScalarFunction::Asin => "Asin", ScalarFunction::Atan => "Atan", @@ -2913,7 +2915,7 @@ impl ScalarFunction { /// Creates an enum from field names used in the ProtoBuf definition. pub fn from_str_name(value: &str) -> ::core::option::Option { match value { - "Abs" => Some(Self::Abs), + "unknown" => Some(Self::Unknown), "Acos" => Some(Self::Acos), "Asin" => Some(Self::Asin), "Atan" => Some(Self::Atan), diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 2554018a9273..69114fd74595 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -47,7 +47,7 @@ use datafusion_common::{ use datafusion_expr::expr::Unnest; use datafusion_expr::window_frame::{check_window_frame, regularize_window_order_by}; use datafusion_expr::{ - abs, acos, acosh, array, array_append, array_concat, array_dims, array_distinct, + acos, acosh, array, array_append, array_concat, array_dims, array_distinct, array_element, array_empty, array_except, array_has, array_has_all, array_has_any, array_intersect, array_length, array_ndims, array_pop_back, array_pop_front, array_position, array_positions, array_prepend, array_remove, array_remove_all, @@ -442,6 +442,7 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { fn from(f: &protobuf::ScalarFunction) -> Self { use protobuf::ScalarFunction; match f { + ScalarFunction::Unknown => todo!(), ScalarFunction::Sqrt => Self::Sqrt, ScalarFunction::Cbrt => Self::Cbrt, ScalarFunction::Sin => Self::Sin, @@ -470,7 +471,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Ceil => Self::Ceil, ScalarFunction::Round => Self::Round, ScalarFunction::Trunc => Self::Trunc, - ScalarFunction::Abs => Self::Abs, ScalarFunction::OctetLength => Self::OctetLength, ScalarFunction::Concat => Self::Concat, ScalarFunction::Lower => Self::Lower, @@ -1360,6 +1360,7 @@ pub fn parse_expr( let args = &expr.args; match scalar_function { + ScalarFunction::Unknown => Err(proto_error("Unknown scalar function")), ScalarFunction::Asin => Ok(asin(parse_expr(&args[0], registry)?)), ScalarFunction::Acos => Ok(acos(parse_expr(&args[0], registry)?)), ScalarFunction::Asinh => Ok(asinh(parse_expr(&args[0], registry)?)), @@ -1537,7 +1538,6 @@ pub fn parse_expr( .map(|expr| parse_expr(expr, registry)) .collect::, _>>()?, )), - ScalarFunction::Abs => Ok(abs(parse_expr(&args[0], registry)?)), ScalarFunction::Signum => Ok(signum(parse_expr(&args[0], registry)?)), ScalarFunction::OctetLength => { Ok(octet_length(parse_expr(&args[0], registry)?)) diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 
ccadbb217a58..9603df209ce4 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1450,7 +1450,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Ceil => Self::Ceil, BuiltinScalarFunction::Round => Self::Round, BuiltinScalarFunction::Trunc => Self::Trunc, - BuiltinScalarFunction::Abs => Self::Abs, BuiltinScalarFunction::OctetLength => Self::OctetLength, BuiltinScalarFunction::Concat => Self::Concat, BuiltinScalarFunction::Lower => Self::Lower, From 07a438d72b7e962fed92cb041f3063187e3cea29 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Tue, 27 Feb 2024 09:22:49 +0300 Subject: [PATCH 23/45] Address reviews --- datafusion-examples/examples/custom_datasource.rs | 1 + .../core/src/datasource/physical_plan/arrow_file.rs | 1 + datafusion/core/src/datasource/physical_plan/avro.rs | 1 + datafusion/core/src/datasource/physical_plan/csv.rs | 1 + datafusion/core/src/datasource/physical_plan/json.rs | 1 + .../core/src/datasource/physical_plan/parquet/mod.rs | 1 + .../core/src/physical_optimizer/enforce_distribution.rs | 1 + .../core/src/physical_optimizer/output_requirements.rs | 1 + datafusion/core/src/physical_planner.rs | 1 + datafusion/core/src/test/mod.rs | 1 + datafusion/core/src/test_util/mod.rs | 3 ++- datafusion/core/tests/custom_sources.rs | 1 + .../custom_sources_cases/provider_filter_pushdown.rs | 1 + datafusion/core/tests/custom_sources_cases/statistics.rs | 1 + datafusion/core/tests/user_defined/user_defined_plan.rs | 3 ++- datafusion/physical-plan/src/aggregates/mod.rs | 2 ++ datafusion/physical-plan/src/analyze.rs | 1 + datafusion/physical-plan/src/coalesce_batches.rs | 1 + datafusion/physical-plan/src/coalesce_partitions.rs | 1 + datafusion/physical-plan/src/empty.rs | 1 + datafusion/physical-plan/src/explain.rs | 1 + datafusion/physical-plan/src/filter.rs | 2 ++ datafusion/physical-plan/src/joins/cross_join.rs | 1 + datafusion/physical-plan/src/joins/hash_join.rs | 4 ++-- datafusion/physical-plan/src/joins/nested_loop_join.rs | 1 + datafusion/physical-plan/src/joins/sort_merge_join.rs | 1 + .../physical-plan/src/joins/symmetric_hash_join.rs | 1 + datafusion/physical-plan/src/limit.rs | 2 ++ datafusion/physical-plan/src/memory.rs | 1 + datafusion/physical-plan/src/placeholder_row.rs | 1 + datafusion/physical-plan/src/projection.rs | 1 + datafusion/physical-plan/src/recursive_query.rs | 1 + datafusion/physical-plan/src/repartition/mod.rs | 9 +++------ datafusion/physical-plan/src/sorts/partial_sort.rs | 1 + datafusion/physical-plan/src/sorts/sort.rs | 1 + .../physical-plan/src/sorts/sort_preserving_merge.rs | 1 + datafusion/physical-plan/src/streaming.rs | 1 + datafusion/physical-plan/src/test/exec.rs | 6 ++++++ datafusion/physical-plan/src/union.rs | 2 ++ datafusion/physical-plan/src/unnest.rs | 1 + datafusion/physical-plan/src/values.rs | 1 + .../physical-plan/src/windows/bounded_window_agg_exec.rs | 1 + datafusion/physical-plan/src/windows/window_agg_exec.rs | 1 + datafusion/physical-plan/src/work_table.rs | 1 + 44 files changed, 58 insertions(+), 10 deletions(-) diff --git a/datafusion-examples/examples/custom_datasource.rs b/datafusion-examples/examples/custom_datasource.rs index d3cd66b2c9bc..1ce3ced0e1c4 100644 --- a/datafusion-examples/examples/custom_datasource.rs +++ b/datafusion-examples/examples/custom_datasource.rs @@ -208,6 +208,7 @@ impl CustomExec { } } + /// This function creates the cache object that stores the plan properties such as schema, equivalence 
properties, ordering, partitioning, etc. fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); PlanPropertiesCache::new( diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index 24e825a6920b..8eebc2b68f8b 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -84,6 +84,7 @@ impl ArrowExec { Partitioning::UnknownPartitioning(file_scan_config.file_groups.len()) } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( schema: SchemaRef, projected_output_ordering: &[LexOrdering], diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index 6b6e7bce90c1..9d65a0ce089a 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -69,6 +69,7 @@ impl AvroExec { &self.base_config } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( schema: SchemaRef, orderings: &[LexOrdering], diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index 19281bc3c189..964f40b8e002 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -121,6 +121,7 @@ impl CsvExec { Partitioning::UnknownPartitioning(file_scan_config.file_groups.len()) } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( schema: SchemaRef, orderings: &[LexOrdering], diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index 6e17e58d8444..b27bcdaa917c 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -89,6 +89,7 @@ impl NdJsonExec { Partitioning::UnknownPartitioning(file_scan_config.file_groups.len()) } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( schema: SchemaRef, orderings: &[LexOrdering], diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs index 810a84646c86..e2ad1980d422 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs @@ -266,6 +266,7 @@ impl ParquetExec { Partitioning::UnknownPartitioning(file_config.file_groups.len()) } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
fn create_cache( schema: SchemaRef, orderings: &[LexOrdering], diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index a5ad2d546d41..86a490278b0e 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -1360,6 +1360,7 @@ pub(crate) mod tests { } } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(input: &Arc) -> PlanPropertiesCache { PlanPropertiesCache::new( input.equivalence_properties().clone(), // Equivalence Properties diff --git a/datafusion/core/src/physical_optimizer/output_requirements.rs b/datafusion/core/src/physical_optimizer/output_requirements.rs index a806580ce716..129ae538808f 100644 --- a/datafusion/core/src/physical_optimizer/output_requirements.rs +++ b/datafusion/core/src/physical_optimizer/output_requirements.rs @@ -112,6 +112,7 @@ impl OutputRequirementExec { self.input.clone() } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(input: &Arc) -> PlanPropertiesCache { PlanPropertiesCache::new( input.equivalence_properties().clone(), // Equivalence Properties diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index e571bc76f4d5..8049c3940a1a 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -2584,6 +2584,7 @@ mod tests { Self { cache } } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); PlanPropertiesCache::new( diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index f8eb67cfdaf5..e5d8f6ebda32 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -384,6 +384,7 @@ impl StatisticsExec { } } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); PlanPropertiesCache::new( diff --git a/datafusion/core/src/test_util/mod.rs b/datafusion/core/src/test_util/mod.rs index dda6d730ce84..55a30b07d893 100644 --- a/datafusion/core/src/test_util/mod.rs +++ b/datafusion/core/src/test_util/mod.rs @@ -46,6 +46,7 @@ use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion_common::TableReference; use datafusion_expr::{CreateExternalTable, Expr, TableType}; +use datafusion_physical_expr::EquivalenceProperties; use async_trait::async_trait; use futures::Stream; @@ -55,7 +56,6 @@ use tempfile::TempDir; #[cfg(feature = "parquet")] pub use datafusion_common::test_util::parquet_test_data; pub use datafusion_common::test_util::{arrow_test_data, get_data_dir}; -use datafusion_physical_expr::EquivalenceProperties; /// Scan an empty data source, mainly used in tests pub fn scan_empty( @@ -246,6 +246,7 @@ impl UnboundedExec { } } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
fn create_cache( schema: SchemaRef, batch_produce: Option, diff --git a/datafusion/core/tests/custom_sources.rs b/datafusion/core/tests/custom_sources.rs index 11f29192276c..f62a3f723ad7 100644 --- a/datafusion/core/tests/custom_sources.rs +++ b/datafusion/core/tests/custom_sources.rs @@ -86,6 +86,7 @@ impl CustomExecutionPlan { Self { projection, cache } } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); PlanPropertiesCache::new( diff --git a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs index da00effa00a8..dec2deb10cbb 100644 --- a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs +++ b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs @@ -67,6 +67,7 @@ impl CustomPlan { Self { batches, cache } } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); PlanPropertiesCache::new( diff --git a/datafusion/core/tests/custom_sources_cases/statistics.rs b/datafusion/core/tests/custom_sources_cases/statistics.rs index 37854908f021..e98781aae9bf 100644 --- a/datafusion/core/tests/custom_sources_cases/statistics.rs +++ b/datafusion/core/tests/custom_sources_cases/statistics.rs @@ -61,6 +61,7 @@ impl StatisticsValidation { } } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); diff --git a/datafusion/core/tests/user_defined/user_defined_plan.rs b/datafusion/core/tests/user_defined/user_defined_plan.rs index f2b6f6c93615..5f01334a4757 100644 --- a/datafusion/core/tests/user_defined/user_defined_plan.rs +++ b/datafusion/core/tests/user_defined/user_defined_plan.rs @@ -81,6 +81,7 @@ use datafusion::{ UserDefinedLogicalNodeCore, }, optimizer::{optimize_children, OptimizerConfig, OptimizerRule}, + physical_expr::EquivalenceProperties, physical_plan::{ DisplayAs, DisplayFormatType, Distribution, ExecutionMode, ExecutionPlan, Partitioning, PlanPropertiesCache, RecordBatchStream, SendableRecordBatchStream, @@ -91,7 +92,6 @@ use datafusion::{ }; use async_trait::async_trait; -use datafusion_physical_expr::EquivalenceProperties; use futures::{Stream, StreamExt}; /// Execute the specified sql and return the resulting record batches @@ -421,6 +421,7 @@ impl TopKExec { Self { input, k, cache } } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index fa5b65e40123..35e42b8a4d36 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -506,6 +506,7 @@ impl AggregateExec { true } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( input: &Arc, schema: SchemaRef, @@ -1629,6 +1630,7 @@ mod tests { Self { yield_first, cache } } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); PlanPropertiesCache::new( diff --git a/datafusion/physical-plan/src/analyze.rs b/datafusion/physical-plan/src/analyze.rs index 731f3e3c7ebf..94dd6ff3bd28 100644 --- a/datafusion/physical-plan/src/analyze.rs +++ b/datafusion/physical-plan/src/analyze.rs @@ -81,6 +81,7 @@ impl AnalyzeExec { &self.input } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( input: &Arc, schema: SchemaRef, diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs index e83bce0664a3..bce48698a558 100644 --- a/datafusion/physical-plan/src/coalesce_batches.rs +++ b/datafusion/physical-plan/src/coalesce_batches.rs @@ -73,6 +73,7 @@ impl CoalesceBatchesExec { self.target_batch_size } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(input: &Arc) -> PlanPropertiesCache { // The coalesce batches operator does not make any changes to the // partitioning of its input. diff --git a/datafusion/physical-plan/src/coalesce_partitions.rs b/datafusion/physical-plan/src/coalesce_partitions.rs index 27f58c9bfd85..ad1094cee0e1 100644 --- a/datafusion/physical-plan/src/coalesce_partitions.rs +++ b/datafusion/physical-plan/src/coalesce_partitions.rs @@ -57,6 +57,7 @@ impl CoalescePartitionsExec { &self.input } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(input: &Arc) -> PlanPropertiesCache { // Coalescing partitions loses existing orderings: let mut eq_properties = input.equivalence_properties().clone(); diff --git a/datafusion/physical-plan/src/empty.rs b/datafusion/physical-plan/src/empty.rs index 942bee81f472..0705c4b4eca7 100644 --- a/datafusion/physical-plan/src/empty.rs +++ b/datafusion/physical-plan/src/empty.rs @@ -72,6 +72,7 @@ impl EmptyExec { Partitioning::UnknownPartitioning(n_partitions) } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
fn create_cache(schema: SchemaRef, n_partitions: usize) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); let output_partitioning = Self::output_partitioning_helper(n_partitions); diff --git a/datafusion/physical-plan/src/explain.rs b/datafusion/physical-plan/src/explain.rs index 689ef32aa1a9..200ba0bd07c5 100644 --- a/datafusion/physical-plan/src/explain.rs +++ b/datafusion/physical-plan/src/explain.rs @@ -72,6 +72,7 @@ impl ExplainExec { self.verbose } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); PlanPropertiesCache::new( diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index d6942f0d5678..86502039c8ba 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -115,6 +115,7 @@ impl FilterExec { self.default_selectivity } + /// Calculates `Statistics` for `FilterExec`, by applying selectivity (either default, or estimated) to input statistics. fn statistics_helper( input: &Arc, predicate: &Arc, @@ -157,6 +158,7 @@ impl FilterExec { }) } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( input: &Arc, predicate: &Arc, diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs index 42758e635060..3f6b6ba5e2b6 100644 --- a/datafusion/physical-plan/src/joins/cross_join.rs +++ b/datafusion/physical-plan/src/joins/cross_join.rs @@ -98,6 +98,7 @@ impl CrossJoinExec { &self.right } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( left: &Arc, right: &Arc, diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs index 2b88ec449a04..4b010e8c60c6 100644 --- a/datafusion/physical-plan/src/joins/hash_join.rs +++ b/datafusion/physical-plan/src/joins/hash_join.rs @@ -37,7 +37,7 @@ use crate::{ check_join_is_valid, estimate_join_statistics, get_final_indices_from_bit_map, need_produce_result_in_final, partitioned_join_output_partitioning, BuildProbeJoinMetrics, ColumnIndex, JoinFilter, JoinHashMap, JoinHashMapOffset, - JoinHashMapType, JoinOn, StatefulStreamResult, + JoinHashMapType, JoinOn, JoinOnRef, StatefulStreamResult, }, metrics::{ExecutionPlanMetricsSet, MetricsSet}, DisplayAs, DisplayFormatType, Distribution, ExecutionMode, ExecutionPlan, @@ -65,7 +65,6 @@ use datafusion_execution::TaskContext; use datafusion_physical_expr::equivalence::join_equivalence_properties; use datafusion_physical_expr::PhysicalExprRef; -use crate::joins::utils::JoinOnRef; use ahash::RandomState; use futures::{ready, Stream, StreamExt, TryStreamExt}; @@ -406,6 +405,7 @@ impl HashJoinExec { JoinSide::Right } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
fn create_cache( left: &Arc, right: &Arc, diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index 89beac14816d..bbfc4c12f548 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -144,6 +144,7 @@ impl NestedLoopJoinExec { &self.join_type } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( left: &Arc, right: &Arc, diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs index f7d754a99e0e..20bae468cc4a 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs @@ -200,6 +200,7 @@ impl SortMergeJoinExec { self.left.as_ref() } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( left: &Arc, right: &Arc, diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs index 4e07b10dd517..3eff026a176f 100644 --- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs +++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs @@ -252,6 +252,7 @@ impl SymmetricHashJoinExec { }) } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( left: &Arc, right: &Arc, diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index a4b924d71066..e678360dd471 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -82,6 +82,7 @@ impl GlobalLimitExec { self.fetch } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(input: &Arc) -> PlanPropertiesCache { PlanPropertiesCache::new( input.equivalence_properties().clone(), // Equivalence Properties @@ -292,6 +293,7 @@ impl LocalLimitExec { self.fetch } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(input: &Arc) -> PlanPropertiesCache { PlanPropertiesCache::new( input.equivalence_properties().clone(), // Equivalence Properties diff --git a/datafusion/physical-plan/src/memory.rs b/datafusion/physical-plan/src/memory.rs index f6039ee8b3ed..8bd4db0bd418 100644 --- a/datafusion/physical-plan/src/memory.rs +++ b/datafusion/physical-plan/src/memory.rs @@ -204,6 +204,7 @@ impl MemoryExec { self.schema.clone() } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
fn create_cache( schema: SchemaRef, orderings: &[LexOrdering], diff --git a/datafusion/physical-plan/src/placeholder_row.rs b/datafusion/physical-plan/src/placeholder_row.rs index 9d4204ddb589..3280522e152c 100644 --- a/datafusion/physical-plan/src/placeholder_row.rs +++ b/datafusion/physical-plan/src/placeholder_row.rs @@ -94,6 +94,7 @@ impl PlaceholderRowExec { Partitioning::UnknownPartitioning(n_partitions) } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(schema: SchemaRef, n_partitions: usize) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); // Get output partitioning: diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs index 7420cf58b5ce..2ed8095f256c 100644 --- a/datafusion/physical-plan/src/projection.rs +++ b/datafusion/physical-plan/src/projection.rs @@ -113,6 +113,7 @@ impl ProjectionExec { &self.input } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( input: &Arc, projection_mapping: &ProjectionMapping, diff --git a/datafusion/physical-plan/src/recursive_query.rs b/datafusion/physical-plan/src/recursive_query.rs index adc675ba2730..fd0d506e2ce4 100644 --- a/datafusion/physical-plan/src/recursive_query.rs +++ b/datafusion/physical-plan/src/recursive_query.rs @@ -94,6 +94,7 @@ impl RecursiveQueryExec { }) } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index dc1e88f52e56..b9489bd12e64 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -616,12 +616,8 @@ impl RepartitionExec { input: &Arc, preserve_order: bool, ) -> Vec { - if preserve_order { - vec![true] - } else { - // We preserve ordering when input partitioning is 1 - vec![input.output_partitioning().partition_count() <= 1] - } + // We preserve ordering when repartition is order preserving variant or input partitioning is 1 + vec![preserve_order || input.output_partitioning().partition_count() <= 1] } fn eq_properties_helper( @@ -637,6 +633,7 @@ impl RepartitionExec { eq_properties } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( input: &Arc, partitioning: Partitioning, diff --git a/datafusion/physical-plan/src/sorts/partial_sort.rs b/datafusion/physical-plan/src/sorts/partial_sort.rs index 16c4bc8601b6..095245a706ea 100644 --- a/datafusion/physical-plan/src/sorts/partial_sort.rs +++ b/datafusion/physical-plan/src/sorts/partial_sort.rs @@ -180,6 +180,7 @@ impl PartialSortExec { } } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
fn create_cache( input: &Arc, sort_exprs: LexOrdering, diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index a74705dd32ab..713ff86a5072 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -786,6 +786,7 @@ impl SortExec { } } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( input: &Arc, sort_exprs: LexOrdering, diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index c07ae72d5492..16bdecd0f384 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -110,6 +110,7 @@ impl SortPreservingMergeExec { self.fetch } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(input: &Arc) -> PlanPropertiesCache { PlanPropertiesCache::new( input.equivalence_properties().clone(), // Equivalence Properties diff --git a/datafusion/physical-plan/src/streaming.rs b/datafusion/physical-plan/src/streaming.rs index e95fd37ab1b2..90e8600d78e5 100644 --- a/datafusion/physical-plan/src/streaming.rs +++ b/datafusion/physical-plan/src/streaming.rs @@ -127,6 +127,7 @@ impl StreamingTableExec { self.infinite } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( schema: SchemaRef, orderings: &[LexOrdering], diff --git a/datafusion/physical-plan/src/test/exec.rs b/datafusion/physical-plan/src/test/exec.rs index a677907295a7..a70e05809923 100644 --- a/datafusion/physical-plan/src/test/exec.rs +++ b/datafusion/physical-plan/src/test/exec.rs @@ -150,6 +150,7 @@ impl MockExec { self } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); @@ -305,6 +306,7 @@ impl BarrierExec { println!("BarrierExec::wait done waiting"); } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(schema: SchemaRef, data: &[Vec]) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); PlanPropertiesCache::new( @@ -412,6 +414,7 @@ impl ErrorExec { Self { cache } } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); @@ -489,6 +492,7 @@ impl StatisticsExec { } } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); @@ -585,6 +589,7 @@ impl BlockingExec { Arc::downgrade(&self.refs) } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
fn create_cache(schema: SchemaRef, n_partitions: usize) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); @@ -721,6 +726,7 @@ impl PanicExec { self } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( schema: SchemaRef, batches_until_panics: &[usize], diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index 06a870123255..4d5377a9bdcc 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -111,6 +111,7 @@ impl UnionExec { &self.inputs } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( inputs: &[Arc], schema: SchemaRef, @@ -340,6 +341,7 @@ impl InterleaveExec { &self.inputs } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(inputs: &[Arc]) -> PlanPropertiesCache { let schema = union_schema(inputs); let eq_properties = EquivalenceProperties::new(schema); diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index ba90e8b4f1fc..d727091fd1c3 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -82,6 +82,7 @@ impl UnnestExec { } } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( input: &Arc, schema: SchemaRef, diff --git a/datafusion/physical-plan/src/values.rs b/datafusion/physical-plan/src/values.rs index 20c8eddce6bd..f31272879279 100644 --- a/datafusion/physical-plan/src/values.rs +++ b/datafusion/physical-plan/src/values.rs @@ -127,6 +127,7 @@ impl ValuesExec { self.data.clone() } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index a9dfc9bfeedd..d7579cdc041d 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -182,6 +182,7 @@ impl BoundedWindowAggExec { }) } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache( input: &Arc, schema: &SchemaRef, diff --git a/datafusion/physical-plan/src/windows/window_agg_exec.rs b/datafusion/physical-plan/src/windows/window_agg_exec.rs index 852698bafe3a..f143d228f381 100644 --- a/datafusion/physical-plan/src/windows/window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/window_agg_exec.rs @@ -116,6 +116,7 @@ impl WindowAggExec { ) } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
fn create_cache( schema: SchemaRef, input: &Arc, diff --git a/datafusion/physical-plan/src/work_table.rs b/datafusion/physical-plan/src/work_table.rs index 9a0b5daf27e4..44a42a4fcf92 100644 --- a/datafusion/physical-plan/src/work_table.rs +++ b/datafusion/physical-plan/src/work_table.rs @@ -111,6 +111,7 @@ impl WorkTableExec { } } + /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { let eq_properties = EquivalenceProperties::new(schema); From 8f3d1ef23f93cd4303745eba76c0850b39774d07 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Tue, 27 Feb 2024 11:23:53 +0100 Subject: [PATCH 24/45] refactor: `SchemaProvider::table` can fail (#9307) --- datafusion-cli/src/catalog.rs | 43 ++++++++++-------- .../examples/external_dependency/catalog.rs | 4 +- .../core/src/catalog/information_schema.rs | 45 +++++++++++++------ datafusion/core/src/catalog/listing_schema.rs | 10 +++-- datafusion/core/src/catalog/schema.rs | 12 +++-- datafusion/core/src/execution/context/mod.rs | 6 +-- datafusion/core/src/physical_planner.rs | 4 +- datafusion/core/tests/sql/create_drop.rs | 2 +- 8 files changed, 80 insertions(+), 46 deletions(-) diff --git a/datafusion-cli/src/catalog.rs b/datafusion-cli/src/catalog.rs index f664d40df5db..67184b8257b8 100644 --- a/datafusion-cli/src/catalog.rs +++ b/datafusion-cli/src/catalog.rs @@ -19,6 +19,7 @@ use crate::object_storage::get_object_store; use async_trait::async_trait; use datafusion::catalog::schema::SchemaProvider; use datafusion::catalog::{CatalogProvider, CatalogProviderList}; +use datafusion::common::{plan_datafusion_err, DataFusionError}; use datafusion::datasource::listing::{ ListingTable, ListingTableConfig, ListingTableUrl, }; @@ -145,16 +146,21 @@ impl SchemaProvider for DynamicFileSchemaProvider { self.inner.register_table(name, table) } - async fn table(&self, name: &str) -> Option> { - let inner_table = self.inner.table(name).await; + async fn table(&self, name: &str) -> Result>> { + let inner_table = self.inner.table(name).await?; if inner_table.is_some() { - return inner_table; + return Ok(inner_table); } // if the inner schema provider didn't have a table by // that name, try to treat it as a listing table - let state = self.state.upgrade()?.read().clone(); - let table_url = ListingTableUrl::parse(name).ok()?; + let state = self + .state + .upgrade() + .ok_or_else(|| plan_datafusion_err!("locking error"))? 
+ .read() + .clone(); + let table_url = ListingTableUrl::parse(name)?; let url: &Url = table_url.as_ref(); // If the store is already registered for this URL then `get_store` @@ -169,18 +175,20 @@ impl SchemaProvider for DynamicFileSchemaProvider { let mut options = HashMap::new(); let store = get_object_store(&state, &mut options, table_url.scheme(), url) - .await - .unwrap(); + .await?; state.runtime_env().register_object_store(url, store); } } - let config = ListingTableConfig::new(table_url) - .infer(&state) - .await - .ok()?; + let config = match ListingTableConfig::new(table_url).infer(&state).await { + Ok(cfg) => cfg, + Err(_) => { + // treat as non-existing + return Ok(None); + } + }; - Some(Arc::new(ListingTable::try_new(config).ok()?)) + Ok(Some(Arc::new(ListingTable::try_new(config)?))) } fn deregister_table(&self, name: &str) -> Result>> { @@ -227,7 +235,7 @@ mod tests { let (ctx, schema) = setup_context(); // That's a non registered table so expecting None here - let table = schema.table(&location).await; + let table = schema.table(&location).await.unwrap(); assert!(table.is_none()); // It should still create an object store for the location in the SessionState @@ -251,7 +259,7 @@ mod tests { let (ctx, schema) = setup_context(); - let table = schema.table(&location).await; + let table = schema.table(&location).await.unwrap(); assert!(table.is_none()); let store = ctx @@ -273,7 +281,7 @@ mod tests { let (ctx, schema) = setup_context(); - let table = schema.table(&location).await; + let table = schema.table(&location).await.unwrap(); assert!(table.is_none()); let store = ctx @@ -289,13 +297,10 @@ mod tests { } #[tokio::test] - #[should_panic] async fn query_invalid_location_test() { let location = "ts://file.parquet"; let (_ctx, schema) = setup_context(); - // This will panic, we cannot prevent that because `schema.table` - // returns an Option - schema.table(location).await; + assert!(schema.table(location).await.is_err()); } } diff --git a/datafusion-examples/examples/external_dependency/catalog.rs b/datafusion-examples/examples/external_dependency/catalog.rs index a623eafdf3d7..28a720cc33a9 100644 --- a/datafusion-examples/examples/external_dependency/catalog.rs +++ b/datafusion-examples/examples/external_dependency/catalog.rs @@ -180,9 +180,9 @@ impl SchemaProvider for DirSchema { tables.keys().cloned().collect::>() } - async fn table(&self, name: &str) -> Option> { + async fn table(&self, name: &str) -> Result>> { let tables = self.tables.read().unwrap(); - tables.get(name).cloned() + Ok(tables.get(name).cloned()) } fn table_exist(&self, name: &str) -> bool { diff --git a/datafusion/core/src/catalog/information_schema.rs b/datafusion/core/src/catalog/information_schema.rs index 80ce3b1ae419..cd8f7649534f 100644 --- a/datafusion/core/src/catalog/information_schema.rs +++ b/datafusion/core/src/catalog/information_schema.rs @@ -20,6 +20,7 @@ //! 
[Information Schema]: https://en.wikipedia.org/wiki/Information_schema use async_trait::async_trait; +use datafusion_common::DataFusionError; use std::{any::Any, sync::Arc}; use arrow::{ @@ -78,7 +79,10 @@ struct InformationSchemaConfig { impl InformationSchemaConfig { /// Construct the `information_schema.tables` virtual table - async fn make_tables(&self, builder: &mut InformationSchemaTablesBuilder) { + async fn make_tables( + &self, + builder: &mut InformationSchemaTablesBuilder, + ) -> Result<(), DataFusionError> { // create a mem table with the names of tables for catalog_name in self.catalog_list.catalog_names() { @@ -89,7 +93,7 @@ impl InformationSchemaConfig { // schema name may not exist in the catalog, so we need to check if let Some(schema) = catalog.schema(&schema_name) { for table_name in schema.table_names() { - if let Some(table) = schema.table(&table_name).await { + if let Some(table) = schema.table(&table_name).await? { builder.add_table( &catalog_name, &schema_name, @@ -124,6 +128,8 @@ impl InformationSchemaConfig { TableType::View, ); } + + Ok(()) } async fn make_schemata(&self, builder: &mut InformationSchemataBuilder) { @@ -141,7 +147,10 @@ impl InformationSchemaConfig { } } - async fn make_views(&self, builder: &mut InformationSchemaViewBuilder) { + async fn make_views( + &self, + builder: &mut InformationSchemaViewBuilder, + ) -> Result<(), DataFusionError> { for catalog_name in self.catalog_list.catalog_names() { let catalog = self.catalog_list.catalog(&catalog_name).unwrap(); @@ -150,7 +159,7 @@ impl InformationSchemaConfig { // schema name may not exist in the catalog, so we need to check if let Some(schema) = catalog.schema(&schema_name) { for table_name in schema.table_names() { - if let Some(table) = schema.table(&table_name).await { + if let Some(table) = schema.table(&table_name).await? { builder.add_view( &catalog_name, &schema_name, @@ -163,10 +172,15 @@ impl InformationSchemaConfig { } } } + + Ok(()) } /// Construct the `information_schema.columns` virtual table - async fn make_columns(&self, builder: &mut InformationSchemaColumnsBuilder) { + async fn make_columns( + &self, + builder: &mut InformationSchemaColumnsBuilder, + ) -> Result<(), DataFusionError> { for catalog_name in self.catalog_list.catalog_names() { let catalog = self.catalog_list.catalog(&catalog_name).unwrap(); @@ -175,7 +189,7 @@ impl InformationSchemaConfig { // schema name may not exist in the catalog, so we need to check if let Some(schema) = catalog.schema(&schema_name) { for table_name in schema.table_names() { - if let Some(table) = schema.table(&table_name).await { + if let Some(table) = schema.table(&table_name).await? 
{ for (field_position, field) in table.schema().fields().iter().enumerate() { @@ -193,6 +207,8 @@ impl InformationSchemaConfig { } } } + + Ok(()) } /// Construct the `information_schema.df_settings` virtual table @@ -223,7 +239,10 @@ impl SchemaProvider for InformationSchemaProvider { ] } - async fn table(&self, name: &str) -> Option> { + async fn table( + &self, + name: &str, + ) -> Result>, DataFusionError> { let config = self.config.clone(); let table: Arc = if name.eq_ignore_ascii_case("tables") { Arc::new(InformationSchemaTables::new(config)) @@ -236,12 +255,12 @@ impl SchemaProvider for InformationSchemaProvider { } else if name.eq_ignore_ascii_case("schemata") { Arc::new(InformationSchemata::new(config)) } else { - return None; + return Ok(None); }; - Some(Arc::new( + Ok(Some(Arc::new( StreamingTable::try_new(table.schema().clone(), vec![table]).unwrap(), - )) + ))) } fn table_exist(&self, name: &str) -> bool { @@ -292,7 +311,7 @@ impl PartitionStream for InformationSchemaTables { self.schema.clone(), // TODO: Stream this futures::stream::once(async move { - config.make_tables(&mut builder).await; + config.make_tables(&mut builder).await?; Ok(builder.finish()) }), )) @@ -383,7 +402,7 @@ impl PartitionStream for InformationSchemaViews { self.schema.clone(), // TODO: Stream this futures::stream::once(async move { - config.make_views(&mut builder).await; + config.make_views(&mut builder).await?; Ok(builder.finish()) }), )) @@ -497,7 +516,7 @@ impl PartitionStream for InformationSchemaColumns { self.schema.clone(), // TODO: Stream this futures::stream::once(async move { - config.make_columns(&mut builder).await; + config.make_columns(&mut builder).await?; Ok(builder.finish()) }), )) diff --git a/datafusion/core/src/catalog/listing_schema.rs b/datafusion/core/src/catalog/listing_schema.rs index c3c682689542..f64b43062d2f 100644 --- a/datafusion/core/src/catalog/listing_schema.rs +++ b/datafusion/core/src/catalog/listing_schema.rs @@ -175,12 +175,16 @@ impl SchemaProvider for ListingSchemaProvider { .collect() } - async fn table(&self, name: &str) -> Option> { - self.tables + async fn table( + &self, + name: &str, + ) -> Result>, DataFusionError> { + Ok(self + .tables .lock() .expect("Can't lock tables") .get(name) - .cloned() + .cloned()) } fn register_table( diff --git a/datafusion/core/src/catalog/schema.rs b/datafusion/core/src/catalog/schema.rs index 1e9a86b49611..49f8350ecc5b 100644 --- a/datafusion/core/src/catalog/schema.rs +++ b/datafusion/core/src/catalog/schema.rs @@ -49,7 +49,10 @@ pub trait SchemaProvider: Sync + Send { /// Retrieves a specific table from the schema by name, if it exists, /// otherwise returns `None`. - async fn table(&self, name: &str) -> Option>; + async fn table( + &self, + name: &str, + ) -> Result>, DataFusionError>; /// If supported by the implementation, adds a new table named `name` to /// this schema. 
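With this change a provider can surface catalog errors instead of swallowing them into `None`. A minimal sketch against the new signature (the `SingleTableSchema` type is hypothetical, and the remaining trait items are assumed to keep their default implementations):

```rust
use std::any::Any;
use std::sync::Arc;

use async_trait::async_trait;
use datafusion::catalog::schema::SchemaProvider;
use datafusion::datasource::TableProvider;
use datafusion::error::Result;

/// Hypothetical provider that serves exactly one named table.
struct SingleTableSchema {
    name: String,
    table: Arc<dyn TableProvider>,
}

#[async_trait]
impl SchemaProvider for SingleTableSchema {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn table_names(&self) -> Vec<String> {
        vec![self.name.clone()]
    }

    // The lookup is now fallible: infrastructure errors (I/O, poisoned
    // locks, ...) surface as `Err`, while a missing table stays `Ok(None)`.
    async fn table(&self, name: &str) -> Result<Option<Arc<dyn TableProvider>>> {
        Ok((name == self.name).then(|| Arc::clone(&self.table)))
    }

    fn table_exist(&self, name: &str) -> bool {
        name == self.name
    }
}
```

The important contract: `Err` means the lookup itself failed, while `Ok(None)` still means the table simply does not exist.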
@@ -111,8 +114,11 @@ impl SchemaProvider for MemorySchemaProvider { .collect() } - async fn table(&self, name: &str) -> Option> { - self.tables.get(name).map(|table| table.value().clone()) + async fn table( + &self, + name: &str, + ) -> Result>, DataFusionError> { + Ok(self.tables.get(name).map(|table| table.value().clone())) } fn register_table( diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index b130070141b2..ffc4a4f717d7 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -783,7 +783,7 @@ impl SessionContext { }; if let Some(schema) = maybe_schema { - if let Some(table_provider) = schema.table(&table).await { + if let Some(table_provider) = schema.table(&table).await? { if table_provider.table_type() == table_type { schema.deregister_table(&table)?; return Ok(true); @@ -1115,7 +1115,7 @@ impl SessionContext { let table_ref = table_ref.into(); let table = table_ref.table().to_string(); let schema = self.state.read().schema_for_ref(table_ref)?; - match schema.table(&table).await { + match schema.table(&table).await? { Some(ref provider) => Ok(Arc::clone(provider)), _ => plan_err!("No table named '{table}'"), } @@ -1714,7 +1714,7 @@ impl SessionState { let resolved = self.resolve_table_ref(&reference); if let Entry::Vacant(v) = provider.tables.entry(resolved.to_string()) { if let Ok(schema) = self.schema_for_ref(resolved) { - if let Some(table) = schema.table(table).await { + if let Some(table) = schema.table(table).await? { v.insert(provider_as_source(table)); } } diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 23ac7e08cad8..83ba773464f6 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -624,7 +624,7 @@ impl DefaultPhysicalPlanner { }) => { let name = table_name.table(); let schema = session_state.schema_for_ref(table_name)?; - if let Some(provider) = schema.table(name).await { + if let Some(provider) = schema.table(name).await? { let input_exec = self.create_initial_plan(input, session_state).await?; provider.insert_into(session_state, input_exec, false).await } else { @@ -641,7 +641,7 @@ impl DefaultPhysicalPlanner { }) => { let name = table_name.table(); let schema = session_state.schema_for_ref(table_name)?; - if let Some(provider) = schema.table(name).await { + if let Some(provider) = schema.table(name).await? 
{ let input_exec = self.create_initial_plan(input, session_state).await?; provider.insert_into(session_state, input_exec, true).await } else { diff --git a/datafusion/core/tests/sql/create_drop.rs b/datafusion/core/tests/sql/create_drop.rs index b1434dddee50..2174009b8557 100644 --- a/datafusion/core/tests/sql/create_drop.rs +++ b/datafusion/core/tests/sql/create_drop.rs @@ -63,7 +63,7 @@ async fn create_external_table_with_ddl() -> Result<()> { let exists = schema.table_exist("dt"); assert!(exists, "Table should have been created!"); - let table_schema = schema.table("dt").await.unwrap().schema(); + let table_schema = schema.table("dt").await.unwrap().unwrap().schema(); assert_eq!(3, table_schema.fields().len()); From 372204e1ed71ee600553e806885cdd7f596e168f Mon Sep 17 00:00:00 2001 From: junxiangMu <63799833+guojidan@users.noreply.github.com> Date: Tue, 27 Feb 2024 20:40:08 +0800 Subject: [PATCH 25/45] fix write_partitioned_parquet_results bug (#9360) --- datafusion/core/tests/dataframe/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index ee842004172c..abe5fd29182e 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -1975,7 +1975,7 @@ async fn write_partitioned_parquet_results() -> Result<()> { "+-----+-----+", ]; - assert_batches_eq!(expected, &results); + assert_batches_sorted_eq!(expected, &results); Ok(()) } From 14264d2c3947e432f71bfe0af1a3dbafbb6ee686 Mon Sep 17 00:00:00 2001 From: Artem Medvedev Date: Tue, 27 Feb 2024 14:11:59 +0100 Subject: [PATCH 26/45] fix: use `JoinSet` to make spawned tasks cancel-safe (#9318) * fix: use `JoinSet` to make spawned tasks cancel-safe * feat: drop `AbortOnDropSingle` and `AbortOnDropMany` * style: doc lint * fix: ordering of the tasks in `RepartitionExec` * fix: replace spawn_blocking with JoinSet * style: disallow spawn methods * fixes: preserve ordering of tasks * style: allow spawning in tests * chore: exclude clippy.toml from rat * chore: typo * feat: introduce `SpawnedTask` * revert outdated comment * switch to SpawnedTask missed outdated part * doc: improve reason for disallowed-method --- clippy.toml | 4 + datafusion/core/src/dataframe/mod.rs | 1 + .../core/src/datasource/file_format/arrow.rs | 2 +- .../src/datasource/file_format/parquet.rs | 51 +++++++------ .../src/datasource/file_format/write/demux.rs | 12 +-- .../file_format/write/orchestration.rs | 29 ++++---- datafusion/core/src/datasource/stream.rs | 9 +-- datafusion/core/src/execution/context/mod.rs | 1 + datafusion/core/tests/fifo.rs | 2 + .../sort_preserving_repartition_fuzz.rs | 1 + .../core/tests/fuzz_cases/window_fuzz.rs | 1 + datafusion/physical-plan/src/common.rs | 73 ++++++++----------- datafusion/physical-plan/src/lib.rs | 10 +-- .../physical-plan/src/repartition/mod.rs | 46 ++++++------ datafusion/physical-plan/src/sorts/sort.rs | 7 +- datafusion/sqllogictest/bin/sqllogictests.rs | 1 + dev/release/rat_exclude_files.txt | 3 +- 17 files changed, 129 insertions(+), 124 deletions(-) create mode 100644 clippy.toml diff --git a/clippy.toml b/clippy.toml new file mode 100644 index 000000000000..c6c754e440c7 --- /dev/null +++ b/clippy.toml @@ -0,0 +1,4 @@ +disallowed-methods = [ + { path = "tokio::task::spawn", reason = "To provide cancel-safety, use `SpawnedTask::spawn` instead (https://github.com/apache/arrow-datafusion/issues/6513)" }, + { path = "tokio::task::spawn_blocking", reason = "To provide cancel-safety, use 
`SpawnedTask::spawn` instead (https://github.com/apache/arrow-datafusion/issues/6513)" },
+]
diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs
index 3a60d57f6685..c04247210d46 100644
--- a/datafusion/core/src/dataframe/mod.rs
+++ b/datafusion/core/src/dataframe/mod.rs
@@ -2172,6 +2172,7 @@ mod tests {
     }
 
     #[tokio::test]
+    #[allow(clippy::disallowed_methods)]
    async fn sendable() {
        let df = test_table().await.unwrap();
        // dataframes should be sendable between threads/tasks
diff --git a/datafusion/core/src/datasource/file_format/arrow.rs b/datafusion/core/src/datasource/file_format/arrow.rs
index ead2db5a10c0..d5f07d11bee9 100644
--- a/datafusion/core/src/datasource/file_format/arrow.rs
+++ b/datafusion/core/src/datasource/file_format/arrow.rs
@@ -295,7 +295,7 @@ impl DataSink for ArrowFileSink {
            }
        }
 
-        match demux_task.await {
+        match demux_task.join().await {
            Ok(r) => r?,
            Err(e) => {
                if e.is_panic() {
diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs
index 89ec81630c1b..739850115370 100644
--- a/datafusion/core/src/datasource/file_format/parquet.rs
+++ b/datafusion/core/src/datasource/file_format/parquet.rs
@@ -32,7 +32,7 @@ use std::fmt::Debug;
 use std::sync::Arc;
 use tokio::io::{AsyncWrite, AsyncWriteExt};
 use tokio::sync::mpsc::{self, Receiver, Sender};
-use tokio::task::{JoinHandle, JoinSet};
+use tokio::task::JoinSet;
 
 use crate::datasource::file_format::file_compression_type::FileCompressionType;
 use crate::datasource::statistics::{create_max_min_accs, get_col_stats};
@@ -42,6 +42,7 @@ use bytes::{BufMut, BytesMut};
 use datafusion_common::{exec_err, not_impl_err, DataFusionError, FileType};
 use datafusion_execution::TaskContext;
 use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement};
+use datafusion_physical_plan::common::SpawnedTask;
 use futures::{StreamExt, TryStreamExt};
 use hashbrown::HashMap;
 use object_store::path::Path;
@@ -728,7 +729,7 @@ impl DataSink for ParquetSink {
            }
        }
 
-        match demux_task.await {
+        match demux_task.join().await {
            Ok(r) => r?,
            Err(e) => {
                if e.is_panic() {
@@ -738,6 +739,7 @@ impl DataSink for ParquetSink {
                }
            }
        }
+
        Ok(row_count as u64)
    }
 }
@@ -754,8 +756,9 @@ async fn column_serializer_task(
    Ok(writer)
 }
 
-type ColumnJoinHandle = JoinHandle<Result<ArrowColumnWriter>>;
+type ColumnWriterTask = SpawnedTask<Result<ArrowColumnWriter>>;
 type ColSender = Sender<ArrowLeafColumn>;
+
 /// Spawns a parallel serialization task for each column
 /// Returns join handles for each columns serialization task along with a send channel
 /// to send arrow arrays to each serialization task.
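That comment describes a fan-out: one channel and one spawned task per column. Reduced to a self-contained sketch (hypothetical names; a `u64` payload stands in for `ArrowLeafColumn`, and summing stands in for serializing a column chunk), the shape is:

```rust
use datafusion_physical_plan::common::SpawnedTask;
use tokio::sync::mpsc;

/// One bounded channel plus one spawned task per column; the channel bound
/// limits how many payloads can queue up per column at a time.
fn spawn_column_workers(
    num_columns: usize,
    max_buffer_size: usize,
) -> (Vec<SpawnedTask<u64>>, Vec<mpsc::Sender<u64>>) {
    let mut tasks = Vec::with_capacity(num_columns);
    let mut senders = Vec::with_capacity(num_columns);
    for _ in 0..num_columns {
        let (tx, mut rx) = mpsc::channel::<u64>(max_buffer_size);
        senders.push(tx);
        tasks.push(SpawnedTask::spawn(async move {
            let mut total = 0u64;
            // Drain the channel until every sender is dropped.
            while let Some(v) = rx.recv().await {
                total += v; // stand-in for serializing one column chunk
            }
            total
        }));
    }
    (tasks, senders)
}

async fn run() -> u64 {
    let (tasks, senders) = spawn_column_workers(3, 8);
    for tx in &senders {
        tx.send(1).await.unwrap();
    }
    drop(senders); // close the channels so the workers finish
    let mut grand_total = 0;
    for task in tasks {
        grand_total += task.join().await.unwrap();
    }
    grand_total
}
```

Bounding each channel applies backpressure per column instead of buffering whole row groups in memory.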
@@ -763,23 +766,24 @@ fn spawn_column_parallel_row_group_writer( schema: Arc, parquet_props: Arc, max_buffer_size: usize, -) -> Result<(Vec, Vec)> { +) -> Result<(Vec, Vec)> { let schema_desc = arrow_to_parquet_schema(&schema)?; let col_writers = get_column_writers(&schema_desc, &parquet_props, &schema)?; let num_columns = col_writers.len(); - let mut col_writer_handles = Vec::with_capacity(num_columns); + let mut col_writer_tasks = Vec::with_capacity(num_columns); let mut col_array_channels = Vec::with_capacity(num_columns); for writer in col_writers.into_iter() { // Buffer size of this channel limits the number of arrays queued up for column level serialization let (send_array, recieve_array) = mpsc::channel::(max_buffer_size); col_array_channels.push(send_array); - col_writer_handles - .push(tokio::spawn(column_serializer_task(recieve_array, writer))) + + let task = SpawnedTask::spawn(column_serializer_task(recieve_array, writer)); + col_writer_tasks.push(task); } - Ok((col_writer_handles, col_array_channels)) + Ok((col_writer_tasks, col_array_channels)) } /// Settings related to writing parquet files in parallel @@ -820,14 +824,14 @@ async fn send_arrays_to_col_writers( /// Spawns a tokio task which joins the parallel column writer tasks, /// and finalizes the row group fn spawn_rg_join_and_finalize_task( - column_writer_handles: Vec>>, + column_writer_tasks: Vec, rg_rows: usize, -) -> JoinHandle { - tokio::spawn(async move { - let num_cols = column_writer_handles.len(); +) -> SpawnedTask { + SpawnedTask::spawn(async move { + let num_cols = column_writer_tasks.len(); let mut finalized_rg = Vec::with_capacity(num_cols); - for handle in column_writer_handles.into_iter() { - match handle.await { + for task in column_writer_tasks.into_iter() { + match task.join().await { Ok(r) => { let w = r?; finalized_rg.push(w.close()?); @@ -856,12 +860,12 @@ fn spawn_rg_join_and_finalize_task( /// given by n_columns * num_row_groups. 
fn spawn_parquet_parallel_serialization_task( mut data: Receiver, - serialize_tx: Sender>, + serialize_tx: Sender>, schema: Arc, writer_props: Arc, parallel_options: ParallelParquetWriterOptions, -) -> JoinHandle> { - tokio::spawn(async move { +) -> SpawnedTask> { + SpawnedTask::spawn(async move { let max_buffer_rb = parallel_options.max_buffered_record_batches_per_stream; let max_row_group_rows = writer_props.max_row_group_size(); let (mut column_writer_handles, mut col_array_channels) = @@ -931,7 +935,7 @@ fn spawn_parquet_parallel_serialization_task( /// Consume RowGroups serialized by other parallel tasks and concatenate them in /// to the final parquet file, while flushing finalized bytes to an [ObjectStore] async fn concatenate_parallel_row_groups( - mut serialize_rx: Receiver>, + mut serialize_rx: Receiver>, schema: Arc, writer_props: Arc, mut object_store_writer: AbortableWrite>, @@ -947,9 +951,8 @@ async fn concatenate_parallel_row_groups( let mut row_count = 0; - while let Some(handle) = serialize_rx.recv().await { - let join_result = handle.await; - match join_result { + while let Some(task) = serialize_rx.recv().await { + match task.join().await { Ok(result) => { let mut rg_out = parquet_writer.next_row_group()?; let (serialized_columns, cnt) = result?; @@ -999,7 +1002,7 @@ async fn output_single_parquet_file_parallelized( let max_rowgroups = parallel_options.max_parallel_row_groups; // Buffer size of this channel limits maximum number of RowGroups being worked on in parallel let (serialize_tx, serialize_rx) = - mpsc::channel::>(max_rowgroups); + mpsc::channel::>(max_rowgroups); let arc_props = Arc::new(parquet_props.clone()); let launch_serialization_task = spawn_parquet_parallel_serialization_task( @@ -1017,7 +1020,7 @@ async fn output_single_parquet_file_parallelized( ) .await?; - match launch_serialization_task.await { + match launch_serialization_task.join().await { Ok(Ok(_)) => (), Ok(Err(e)) => return Err(e), Err(e) => { @@ -1027,7 +1030,7 @@ async fn output_single_parquet_file_parallelized( unreachable!() } } - }; + } Ok(row_count) } diff --git a/datafusion/core/src/datasource/file_format/write/demux.rs b/datafusion/core/src/datasource/file_format/write/demux.rs index 8bccf3d71cf9..d70b4811da5b 100644 --- a/datafusion/core/src/datasource/file_format/write/demux.rs +++ b/datafusion/core/src/datasource/file_format/write/demux.rs @@ -41,8 +41,8 @@ use object_store::path::Path; use rand::distributions::DistString; +use datafusion_physical_plan::common::SpawnedTask; use tokio::sync::mpsc::{self, Receiver, Sender, UnboundedReceiver, UnboundedSender}; -use tokio::task::JoinHandle; type RecordBatchReceiver = Receiver; type DemuxedStreamReceiver = UnboundedReceiver<(Path, RecordBatchReceiver)>; @@ -76,15 +76,15 @@ pub(crate) fn start_demuxer_task( partition_by: Option>, base_output_path: ListingTableUrl, file_extension: String, -) -> (JoinHandle>, DemuxedStreamReceiver) { - let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); +) -> (SpawnedTask>, DemuxedStreamReceiver) { + let (tx, rx) = mpsc::unbounded_channel(); let context = context.clone(); let single_file_output = !base_output_path.is_collection(); - let task: JoinHandle> = match partition_by { + let task = match partition_by { Some(parts) => { // There could be an arbitrarily large number of parallel hive style partitions being written to, so we cannot // bound this channel without risking a deadlock. 
- tokio::spawn(async move { + SpawnedTask::spawn(async move { hive_style_partitions_demuxer( tx, input, @@ -96,7 +96,7 @@ pub(crate) fn start_demuxer_task( .await }) } - None => tokio::spawn(async move { + None => SpawnedTask::spawn(async move { row_count_demuxer( tx, input, diff --git a/datafusion/core/src/datasource/file_format/write/orchestration.rs b/datafusion/core/src/datasource/file_format/write/orchestration.rs index 1a3042cbc00b..05406d3751c9 100644 --- a/datafusion/core/src/datasource/file_format/write/orchestration.rs +++ b/datafusion/core/src/datasource/file_format/write/orchestration.rs @@ -33,10 +33,11 @@ use datafusion_common::{internal_datafusion_err, internal_err, DataFusionError}; use datafusion_execution::TaskContext; use bytes::Bytes; +use datafusion_physical_plan::common::SpawnedTask; +use futures::try_join; use tokio::io::{AsyncWrite, AsyncWriteExt}; use tokio::sync::mpsc::{self, Receiver}; -use tokio::task::{JoinHandle, JoinSet}; -use tokio::try_join; +use tokio::task::JoinSet; type WriterType = AbortableWrite>; type SerializerType = Arc; @@ -51,14 +52,14 @@ pub(crate) async fn serialize_rb_stream_to_object_store( mut writer: AbortableWrite>, ) -> std::result::Result<(WriterType, u64), (WriterType, DataFusionError)> { let (tx, mut rx) = - mpsc::channel::>>(100); - let serialize_task = tokio::spawn(async move { + mpsc::channel::>>(100); + let serialize_task = SpawnedTask::spawn(async move { // Some serializers (like CSV) handle the first batch differently than // subsequent batches, so we track that here. let mut initial = true; while let Some(batch) = data_rx.recv().await { let serializer_clone = serializer.clone(); - let handle = tokio::spawn(async move { + let task = SpawnedTask::spawn(async move { let num_rows = batch.num_rows(); let bytes = serializer_clone.serialize(batch, initial)?; Ok((num_rows, bytes)) @@ -66,7 +67,7 @@ pub(crate) async fn serialize_rb_stream_to_object_store( if initial { initial = false; } - tx.send(handle).await.map_err(|_| { + tx.send(task).await.map_err(|_| { internal_datafusion_err!("Unknown error writing to object store") })?; } @@ -74,8 +75,8 @@ pub(crate) async fn serialize_rb_stream_to_object_store( }); let mut row_count = 0; - while let Some(handle) = rx.recv().await { - match handle.await { + while let Some(task) = rx.recv().await { + match task.join().await { Ok(Ok((cnt, bytes))) => { match writer.write_all(&bytes).await { Ok(_) => (), @@ -106,7 +107,7 @@ pub(crate) async fn serialize_rb_stream_to_object_store( } } - match serialize_task.await { + match serialize_task.join().await { Ok(Ok(_)) => (), Ok(Err(e)) => return Err((writer, e)), Err(_) => { @@ -115,7 +116,7 @@ pub(crate) async fn serialize_rb_stream_to_object_store( internal_datafusion_err!("Unknown error writing to object store"), )) } - }; + } Ok((writer, row_count as u64)) } @@ -241,9 +242,9 @@ pub(crate) async fn stateless_multipart_put( .execution .max_buffered_batches_per_output_file; - let (tx_file_bundle, rx_file_bundle) = tokio::sync::mpsc::channel(rb_buffer_size / 2); + let (tx_file_bundle, rx_file_bundle) = mpsc::channel(rb_buffer_size / 2); let (tx_row_cnt, rx_row_cnt) = tokio::sync::oneshot::channel(); - let write_coordinater_task = tokio::spawn(async move { + let write_coordinator_task = SpawnedTask::spawn(async move { stateless_serialize_and_write_files(rx_file_bundle, tx_row_cnt).await }); while let Some((location, rb_stream)) = file_stream_rx.recv().await { @@ -260,10 +261,10 @@ pub(crate) async fn stateless_multipart_put( })?; } - // Signal to the write 
coordinater that no more files are coming + // Signal to the write coordinator that no more files are coming drop(tx_file_bundle); - match try_join!(write_coordinater_task, demux_task) { + match try_join!(write_coordinator_task.join(), demux_task.join()) { Ok((r1, r2)) => { r1?; r2?; diff --git a/datafusion/core/src/datasource/stream.rs b/datafusion/core/src/datasource/stream.rs index 830cd7a07e46..6dc59e4a5c65 100644 --- a/datafusion/core/src/datasource/stream.rs +++ b/datafusion/core/src/datasource/stream.rs @@ -29,12 +29,11 @@ use arrow_array::{RecordBatch, RecordBatchReader, RecordBatchWriter}; use arrow_schema::SchemaRef; use async_trait::async_trait; use futures::StreamExt; -use tokio::task::spawn_blocking; use datafusion_common::{plan_err, Constraints, DataFusionError, Result}; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::{CreateExternalTable, Expr, TableType}; -use datafusion_physical_plan::common::AbortOnDropSingle; +use datafusion_physical_plan::common::SpawnedTask; use datafusion_physical_plan::insert::{DataSink, FileSinkExec}; use datafusion_physical_plan::metrics::MetricsSet; use datafusion_physical_plan::stream::RecordBatchReceiverStreamBuilder; @@ -344,7 +343,7 @@ impl DataSink for StreamWrite { let config = self.0.clone(); let (sender, mut receiver) = tokio::sync::mpsc::channel::(2); // Note: FIFO Files support poll so this could use AsyncFd - let write = AbortOnDropSingle::new(spawn_blocking(move || { + let write_task = SpawnedTask::spawn_blocking(move || { let mut count = 0_u64; let mut writer = config.writer()?; while let Some(batch) = receiver.blocking_recv() { @@ -352,7 +351,7 @@ impl DataSink for StreamWrite { writer.write(&batch)?; } Ok(count) - })); + }); while let Some(b) = data.next().await.transpose()? { if sender.send(b).await.is_err() { @@ -360,6 +359,6 @@ impl DataSink for StreamWrite { } } drop(sender); - write.await.unwrap() + write_task.join().await.unwrap() } } diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index ffc4a4f717d7..453a00a1a5cf 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -2288,6 +2288,7 @@ mod tests { } #[tokio::test] + #[allow(clippy::disallowed_methods)] async fn send_context_to_threads() -> Result<()> { // ensure SessionContexts can be used in a multi-threaded // environment. Usecase is for concurrent planing. diff --git a/datafusion/core/tests/fifo.rs b/datafusion/core/tests/fifo.rs index 93c7f7368065..c9ad95a3a042 100644 --- a/datafusion/core/tests/fifo.rs +++ b/datafusion/core/tests/fifo.rs @@ -103,6 +103,7 @@ mod unix_test { let broken_pipe_timeout = Duration::from_secs(10); let sa = file_path.clone(); // Spawn a new thread to write to the FIFO file + #[allow(clippy::disallowed_methods)] // spawn allowed only in tests spawn_blocking(move || { let file = OpenOptions::new().write(true).open(sa).unwrap(); // Reference time to use when deciding to fail the test @@ -357,6 +358,7 @@ mod unix_test { (sink_fifo_path.clone(), sink_fifo_path.display()); // Spawn a new thread to read sink EXTERNAL TABLE. 
+ #[allow(clippy::disallowed_methods)] // spawn allowed only in tests tasks.push(spawn_blocking(move || { let file = File::open(sink_fifo_path_thread).unwrap(); let schema = Arc::new(Schema::new(vec![ diff --git a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs index df6499e9b1e4..6c9c3359ebf4 100644 --- a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs @@ -302,6 +302,7 @@ mod sp_repartition_fuzz_tests { let mut handles = Vec::new(); for seed in seed_start..seed_end { + #[allow(clippy::disallowed_methods)] // spawn allowed only in tests let job = tokio::spawn(run_sort_preserving_repartition_test( make_staggered_batches::(n_row, n_distinct, seed as u64), is_first_roundrobin, diff --git a/datafusion/core/tests/fuzz_cases/window_fuzz.rs b/datafusion/core/tests/fuzz_cases/window_fuzz.rs index 609d26c9c253..1cab4d5c2f98 100644 --- a/datafusion/core/tests/fuzz_cases/window_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/window_fuzz.rs @@ -123,6 +123,7 @@ async fn window_bounded_window_random_comparison() -> Result<()> { for i in 0..n { let idx = i % test_cases.len(); let (pb_cols, ob_cols, search_mode) = test_cases[idx].clone(); + #[allow(clippy::disallowed_methods)] // spawn allowed only in tests let job = tokio::spawn(run_window_test( make_staggered_batches::(1000, n_distinct, i as u64), i as u64, diff --git a/datafusion/physical-plan/src/common.rs b/datafusion/physical-plan/src/common.rs index e83dc2525b9f..5172bc9b2a3c 100644 --- a/datafusion/physical-plan/src/common.rs +++ b/datafusion/physical-plan/src/common.rs @@ -21,7 +21,6 @@ use std::fs; use std::fs::{metadata, File}; use std::path::{Path, PathBuf}; use std::sync::Arc; -use std::task::{Context, Poll}; use super::SendableRecordBatchStream; use crate::stream::RecordBatchReceiverStream; @@ -39,8 +38,7 @@ use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; use futures::{Future, StreamExt, TryStreamExt}; use parking_lot::Mutex; -use pin_project_lite::pin_project; -use tokio::task::JoinHandle; +use tokio::task::{JoinError, JoinSet}; /// [`MemoryReservation`] used across query execution streams pub(crate) type SharedMemoryReservation = Arc>; @@ -174,50 +172,43 @@ pub fn compute_record_batch_statistics( } } -pin_project! { - /// Helper that aborts the given join handle on drop. - /// - /// Useful to kill background tasks when the consumer is dropped. - #[derive(Debug)] - pub struct AbortOnDropSingle{ - #[pin] - join_handle: JoinHandle, - } - - impl PinnedDrop for AbortOnDropSingle { - fn drop(this: Pin<&mut Self>) { - this.join_handle.abort(); - } - } +/// Helper that provides a simple API to spawn a single task and join it. +/// Provides guarantees of aborting on `Drop` to keep it cancel-safe. +/// +/// Technically, it's just a wrapper of `JoinSet` (with size=1). +#[derive(Debug)] +pub struct SpawnedTask { + inner: JoinSet, } -impl AbortOnDropSingle { - /// Create new abort helper from join handle. 
- pub fn new(join_handle: JoinHandle) -> Self { - Self { join_handle } +impl SpawnedTask { + pub fn spawn(task: T) -> Self + where + T: Future, + T: Send + 'static, + R: Send, + { + let mut inner = JoinSet::new(); + inner.spawn(task); + Self { inner } } -} -impl Future for AbortOnDropSingle { - type Output = Result; - - fn poll(self: std::pin::Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { - let this = self.project(); - this.join_handle.poll(cx) + pub fn spawn_blocking(task: T) -> Self + where + T: FnOnce() -> R, + T: Send + 'static, + R: Send, + { + let mut inner = JoinSet::new(); + inner.spawn_blocking(task); + Self { inner } } -} - -/// Helper that aborts the given join handles on drop. -/// -/// Useful to kill background tasks when the consumer is dropped. -#[derive(Debug)] -pub struct AbortOnDropMany(pub Vec>); -impl Drop for AbortOnDropMany { - fn drop(&mut self) { - for join_handle in &self.0 { - join_handle.abort(); - } + pub async fn join(mut self) -> Result { + self.inner + .join_next() + .await + .expect("`SpawnedTask` instance always contains exactly 1 task") } } diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 1c4a6ac0ecaf..562e42a7da3b 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -298,14 +298,14 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { /// "abort" such tasks, they may continue to consume resources even after /// the plan is dropped, generating intermediate results that are never /// used. + /// Thus, [`spawn`] is disallowed, and instead use [`SpawnedTask`]. /// - /// See [`AbortOnDropSingle`], [`AbortOnDropMany`] and - /// [`RecordBatchReceiverStreamBuilder`] for structures to help ensure all - /// background tasks are cancelled. + /// For more details see [`SpawnedTask`], [`JoinSet`] and [`RecordBatchReceiverStreamBuilder`] + /// for structures to help ensure all background tasks are cancelled. 
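Editor's note: angle-bracket generic parameters appear to have been stripped from this patch text during extraction (for example `JoinSet` for `JoinSet<R>`, and the bare `Result` returned by `join` for `Result<R, JoinError>`). Reconstructed from the surrounding context, the new API plausibly reads as follows; treat the exact bounds as a best-effort restoration rather than the authoritative source:

```rust
use std::future::Future;

use tokio::task::{JoinError, JoinSet};

#[derive(Debug)]
pub struct SpawnedTask<R> {
    inner: JoinSet<R>,
}

impl<R: 'static> SpawnedTask<R> {
    pub fn spawn<T>(task: T) -> Self
    where
        T: Future<Output = R> + Send + 'static,
        R: Send,
    {
        let mut inner = JoinSet::new();
        inner.spawn(task);
        Self { inner }
    }

    pub fn spawn_blocking<T>(task: T) -> Self
    where
        T: FnOnce() -> R + Send + 'static,
        R: Send,
    {
        let mut inner = JoinSet::new();
        inner.spawn_blocking(task);
        Self { inner }
    }

    pub async fn join(mut self) -> Result<R, JoinError> {
        self.inner
            .join_next()
            .await
            .expect("`SpawnedTask` instance always contains exactly 1 task")
    }
}
```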
/// /// [`spawn`]: tokio::task::spawn - /// [`AbortOnDropSingle`]: crate::common::AbortOnDropSingle - /// [`AbortOnDropMany`]: crate::common::AbortOnDropMany + /// [`JoinSet`]: tokio::task::JoinSet + /// [`SpawnedTask`]: crate::common::SpawnedTask /// [`RecordBatchReceiverStreamBuilder`]: crate::stream::RecordBatchReceiverStreamBuilder /// /// # Implementation Examples diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 07693f747fee..a66a929796ab 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -32,21 +32,20 @@ use futures::{FutureExt, StreamExt}; use hashbrown::HashMap; use log::trace; use parking_lot::Mutex; -use tokio::task::JoinHandle; use datafusion_common::{arrow_datafusion_err, not_impl_err, DataFusionError, Result}; use datafusion_execution::memory_pool::MemoryConsumer; use datafusion_execution::TaskContext; use datafusion_physical_expr::{EquivalenceProperties, PhysicalExpr}; -use crate::common::transpose; +use crate::common::{transpose, SpawnedTask}; use crate::hash_utils::create_hashes; use crate::metrics::BaselineMetrics; use crate::repartition::distributor_channels::{channels, partition_aware_channels}; use crate::sorts::streaming_merge; use crate::{DisplayFormatType, ExecutionPlan, Partitioning, Statistics}; -use super::common::{AbortOnDropMany, AbortOnDropSingle, SharedMemoryReservation}; +use super::common::SharedMemoryReservation; use super::expressions::PhysicalSortExpr; use super::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}; use super::{DisplayAs, RecordBatchStream, SendableRecordBatchStream}; @@ -74,7 +73,7 @@ struct RepartitionExecState { >, /// Helper that ensures that that background job is killed once it is no longer needed. 
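Editor's note: the doc change above points at the mechanism that enforces the ban. Since a `clippy.toml` is now shipped (see the `rat_exclude_files.txt` entry later in this series), `tokio::task::spawn` is presumably listed under clippy's `disallowed-methods` lint, which is what the `#[allow(clippy::disallowed_methods)]` annotations added to test code opt out of. A hedged sketch of what that configuration likely looks like; the exact entries and wording in the repository may differ:

```toml
# Hypothetical clippy.toml; the paths are real tokio items, the reasons
# are illustrative.
disallowed-methods = [
    { path = "tokio::task::spawn", reason = "use SpawnedTask::spawn for cancel-safety" },
    { path = "tokio::task::spawn_blocking", reason = "use SpawnedTask::spawn_blocking for cancel-safety" },
]
```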
- abort_helper: Arc>, + abort_helper: Arc>>, } /// A utility that can be used to partition batches based on [`Partitioning`] @@ -522,7 +521,7 @@ impl ExecutionPlan for RepartitionExec { } // launch one async task per *input* partition - let mut join_handles = Vec::with_capacity(num_input_partitions); + let mut spawned_tasks = Vec::with_capacity(num_input_partitions); for i in 0..num_input_partitions { let txs: HashMap<_, _> = state .channels @@ -534,28 +533,27 @@ impl ExecutionPlan for RepartitionExec { let r_metrics = RepartitionMetrics::new(i, partition, &self.metrics); - let input_task: JoinHandle> = - tokio::spawn(Self::pull_from_input( - self.input.clone(), - i, - txs.clone(), - self.partitioning.clone(), - r_metrics, - context.clone(), - )); + let input_task = SpawnedTask::spawn(Self::pull_from_input( + self.input.clone(), + i, + txs.clone(), + self.partitioning.clone(), + r_metrics, + context.clone(), + )); // In a separate task, wait for each input to be done // (and pass along any errors, including panic!s) - let join_handle = tokio::spawn(Self::wait_for_task( - AbortOnDropSingle::new(input_task), + let wait_for_task = SpawnedTask::spawn(Self::wait_for_task( + input_task, txs.into_iter() .map(|(partition, (tx, _reservation))| (partition, tx)) .collect(), )); - join_handles.push(join_handle); + spawned_tasks.push(wait_for_task); } - state.abort_helper = Arc::new(AbortOnDropMany(join_handles)) + state.abort_helper = Arc::new(spawned_tasks) } trace!( @@ -638,7 +636,7 @@ impl RepartitionExec { partitioning, state: Arc::new(Mutex::new(RepartitionExecState { channels: HashMap::new(), - abort_helper: Arc::new(AbortOnDropMany::<()>(vec![])), + abort_helper: Arc::new(Vec::new()), })), metrics: ExecutionPlanMetricsSet::new(), preserve_order: false, @@ -759,12 +757,13 @@ impl RepartitionExec { /// complete. Upon error, propagates the errors to all output tx /// channels. async fn wait_for_task( - input_task: AbortOnDropSingle>, + input_task: SpawnedTask>, txs: HashMap>, ) { // wait for completion, and propagate error // note we ignore errors on send (.ok) as that means the receiver has already shutdown. - match input_task.await { + + match input_task.join().await { // Error in joining task Err(e) => { let e = Arc::new(e); @@ -813,7 +812,7 @@ struct RepartitionStream { /// Handle to ensure background tasks are killed when no longer needed. #[allow(dead_code)] - drop_helper: Arc>, + drop_helper: Arc>>, /// Memory reservation. reservation: SharedMemoryReservation, @@ -877,7 +876,7 @@ struct PerPartitionStream { /// Handle to ensure background tasks are killed when no longer needed. #[allow(dead_code)] - drop_helper: Arc>, + drop_helper: Arc>>, /// Memory reservation. 
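Editor's note: `wait_for_task` above distinguishes two failure layers: the join itself failing (the spawned task panicked or was cancelled) versus the task body completing with an error. A small self-contained illustration of the same two-layer match on a `JoinSet`:

```rust
use tokio::task::JoinSet;

#[tokio::main]
async fn main() {
    let mut set = JoinSet::new();
    set.spawn(async { Err::<(), String>("task body failed".into()) });
    match set.join_next().await.expect("one task was spawned") {
        // Outer error: the task panicked or was aborted before finishing.
        Err(join_err) => println!("join error: {join_err}"),
        // Inner error: the task ran to completion but reported a failure.
        Ok(Err(body_err)) => println!("task error: {body_err}"),
        Ok(Ok(())) => println!("task succeeded"),
    }
}
```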
reservation: SharedMemoryReservation, @@ -1056,6 +1055,7 @@ mod tests { } #[tokio::test] + #[allow(clippy::disallowed_methods)] async fn many_to_many_round_robin_within_tokio_task() -> Result<()> { let join_handle: JoinHandle>>> = tokio::spawn(async move { diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 2d8237011fff..84bf3ec415ef 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -27,7 +27,7 @@ use std::io::BufReader; use std::path::{Path, PathBuf}; use std::sync::Arc; -use crate::common::{spawn_buffered, IPCWriter}; +use crate::common::{spawn_buffered, IPCWriter, SpawnedTask}; use crate::expressions::PhysicalSortExpr; use crate::metrics::{ BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet, @@ -56,7 +56,6 @@ use datafusion_physical_expr::EquivalenceProperties; use futures::{StreamExt, TryStreamExt}; use log::{debug, error, trace}; use tokio::sync::mpsc::Sender; -use tokio::task; struct ExternalSorterMetrics { /// metrics @@ -604,8 +603,8 @@ async fn spill_sorted_batches( schema: SchemaRef, ) -> Result<()> { let path: PathBuf = path.into(); - let handle = task::spawn_blocking(move || write_sorted(batches, path, schema)); - match handle.await { + let task = SpawnedTask::spawn_blocking(move || write_sorted(batches, path, schema)); + match task.join().await { Ok(r) => r, Err(e) => exec_err!("Error occurred while spilling {e}"), } diff --git a/datafusion/sqllogictest/bin/sqllogictests.rs b/datafusion/sqllogictest/bin/sqllogictests.rs index ffae144eae84..41c33deec643 100644 --- a/datafusion/sqllogictest/bin/sqllogictests.rs +++ b/datafusion/sqllogictest/bin/sqllogictests.rs @@ -88,6 +88,7 @@ async fn run_tests() -> Result<()> { // modifying shared state like `/tmp/`) let errors: Vec<_> = futures::stream::iter(read_test_files(&options)?) .map(|test_file| { + #[allow(clippy::disallowed_methods)] // spawn allowed only in tests tokio::task::spawn(async move { println!("Running {:?}", test_file.relative_path); if options.complete { diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index f99d6e15e869..ce5635b6daf4 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -136,4 +136,5 @@ datafusion/proto/src/generated/prost.rs .github/ISSUE_TEMPLATE/feature_request.yml .github/workflows/docs.yaml **/node_modules/* -datafusion/wasmtest/pkg/* \ No newline at end of file +datafusion/wasmtest/pkg/* +clippy.toml From acd09da1731a77f33a87dbbedee7d759cedcecc8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 27 Feb 2024 08:17:58 -0500 Subject: [PATCH 27/45] Update nix requirement from 0.27.1 to 0.28.0 (#9344) Updates the requirements on [nix](https://github.com/nix-rust/nix) to permit the latest version. - [Changelog](https://github.com/nix-rust/nix/blob/master/CHANGELOG.md) - [Commits](https://github.com/nix-rust/nix/compare/v0.27.1...v0.28.0) --- updated-dependencies: - dependency-name: nix dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- datafusion/core/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index c3bd89037cfe..1247143f9fb4 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -142,7 +142,7 @@ thiserror = { workspace = true } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot", "fs"] } tokio-postgres = "0.7.7" [target.'cfg(not(target_os = "windows"))'.dev-dependencies] -nix = { version = "0.27.1", features = ["fs"] } +nix = { version = "0.28.0", features = ["fs"] } [[bench]] harness = false From c439bc73b6a9ba9efa4c8a9b5d2fb6111e660e74 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Tue, 27 Feb 2024 08:19:52 -0500 Subject: [PATCH 28/45] Replace usages of internal_err with exec_err where appropriate (#9241) * internal_err! -> exec_err! * fmt updates. * Updated error macro from exec_err! to not_impl_err! for all unsupported type errors. * Revert "Updated error macro from exec_err! to not_impl_err! for all unsupported type errors." This reverts commit fe0517a7ca0c0c6e7fa2a61b0bf321d45185854c. * Updated a few instances of internal_err missed in previous audit. --------- Co-authored-by: Andrew Lamb --- datafusion/functions/src/core/nullif.rs | 13 ++- datafusion/functions/src/encoding/inner.rs | 34 ++++---- datafusion/functions/src/math/abs.rs | 6 +- datafusion/functions/src/math/nans.rs | 10 +-- .../physical-expr/src/aggregate/build_in.rs | 25 +++--- .../src/conditional_expressions.rs | 4 +- .../physical-expr/src/crypto_expressions.rs | 16 ++-- datafusion/physical-expr/src/functions.rs | 85 +++++++++---------- .../physical-expr/src/math_expressions.rs | 84 +++++++++--------- .../physical-expr/src/regex_expressions.rs | 17 ++-- .../physical-expr/src/string_expressions.rs | 49 +++++------ .../physical-expr/src/unicode_expressions.rs | 20 +++-- 12 files changed, 183 insertions(+), 180 deletions(-) diff --git a/datafusion/functions/src/core/nullif.rs b/datafusion/functions/src/core/nullif.rs index f83bd987c937..73bfba9b38b1 100644 --- a/datafusion/functions/src/core/nullif.rs +++ b/datafusion/functions/src/core/nullif.rs @@ -18,15 +18,15 @@ //! 
Encoding expressions use arrow::datatypes::DataType; -use datafusion_common::{internal_err, Result, DataFusionError}; +use datafusion_common::{exec_err, DataFusionError, Result}; use datafusion_expr::ColumnarValue; -use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; -use std::any::Any; use arrow::array::Array; use arrow::compute::kernels::cmp::eq; use arrow::compute::kernels::nullif::nullif; use datafusion_common::ScalarValue; +use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use std::any::Any; #[derive(Debug)] pub(super) struct NullIfFunc { @@ -58,7 +58,7 @@ impl NullIfFunc { Self { signature: Signature::uniform(2, SUPPORTED_NULLIF_TYPES.to_vec(), - Volatility::Immutable, + Volatility::Immutable, ) } } @@ -81,7 +81,7 @@ impl ScalarUDFImpl for NullIfFunc { let coerced_types = datafusion_expr::type_coercion::functions::data_types(arg_types, &self.signature); coerced_types.map(|typs| typs[0].clone()) .map_err(|e| e.context("Failed to coerce arguments for NULLIF") - ) + ) } fn invoke(&self, args: &[ColumnarValue]) -> Result { @@ -90,14 +90,13 @@ impl ScalarUDFImpl for NullIfFunc { } - /// Implements NULLIF(expr1, expr2) /// Args: 0 - left expr is any array /// 1 - if the left is equal to this expr2, then the result is NULL, otherwise left value is passed. /// fn nullif_func(args: &[ColumnarValue]) -> Result { if args.len() != 2 { - return internal_err!( + return exec_err!( "{:?} args were supplied but NULLIF takes exactly two args", args.len() ); diff --git a/datafusion/functions/src/encoding/inner.rs b/datafusion/functions/src/encoding/inner.rs index 886a031a5269..4cbeab3092c7 100644 --- a/datafusion/functions/src/encoding/inner.rs +++ b/datafusion/functions/src/encoding/inner.rs @@ -22,11 +22,11 @@ use arrow::{ datatypes::DataType, }; use base64::{engine::general_purpose, Engine as _}; -use datafusion_common::ScalarValue; use datafusion_common::{ cast::{as_generic_binary_array, as_generic_string_array}, - internal_err, not_impl_err, plan_err, + not_impl_err, plan_err, }; +use datafusion_common::{exec_err, ScalarValue}; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::ColumnarValue; use std::sync::Arc; @@ -111,6 +111,7 @@ impl DecodeFunc { } } } + impl ScalarUDFImpl for DecodeFunc { fn as_any(&self) -> &dyn Any { self @@ -148,6 +149,7 @@ enum Encoding { Base64, Hex, } + fn encode_process(value: &ColumnarValue, encoding: Encoding) -> Result { match value { ColumnarValue::Array(a) => match a.data_type() { @@ -155,7 +157,7 @@ fn encode_process(value: &ColumnarValue, encoding: Encoding) -> Result encoding.encode_utf8_array::(a.as_ref()), DataType::Binary => encoding.encode_binary_array::(a.as_ref()), DataType::LargeBinary => encoding.encode_binary_array::(a.as_ref()), - other => internal_err!( + other => exec_err!( "Unsupported data type {other:?} for function encode({encoding})" ), }, @@ -171,7 +173,7 @@ fn encode_process(value: &ColumnarValue, encoding: Encoding) -> Result Ok(encoding .encode_large_scalar(a.as_ref().map(|v: &Vec| v.as_slice()))), - other => internal_err!( + other => exec_err!( "Unsupported data type {other:?} for function encode({encoding})" ), } @@ -186,7 +188,7 @@ fn decode_process(value: &ColumnarValue, encoding: Encoding) -> Result encoding.decode_utf8_array::(a.as_ref()), DataType::Binary => encoding.decode_binary_array::(a.as_ref()), DataType::LargeBinary => encoding.decode_binary_array::(a.as_ref()), - other => internal_err!( + other => exec_err!( "Unsupported data type {other:?} for function decode({encoding})" ), }, @@ 
-202,7 +204,7 @@ fn decode_process(value: &ColumnarValue, encoding: Encoding) -> Result encoding .decode_large_scalar(a.as_ref().map(|v: &Vec| v.as_slice())), - other => internal_err!( + other => exec_err!( "Unsupported data type {other:?} for function decode({encoding})" ), } @@ -270,8 +272,8 @@ impl Encoding { } fn encode_binary_array(self, value: &dyn Array) -> Result - where - T: OffsetSizeTrait, + where + T: OffsetSizeTrait, { let input_value = as_generic_binary_array::(value)?; let array: ArrayRef = match self { @@ -282,8 +284,8 @@ impl Encoding { } fn encode_utf8_array(self, value: &dyn Array) -> Result - where - T: OffsetSizeTrait, + where + T: OffsetSizeTrait, { let input_value = as_generic_string_array::(value)?; let array: ArrayRef = match self { @@ -350,8 +352,8 @@ impl Encoding { } fn decode_binary_array(self, value: &dyn Array) -> Result - where - T: OffsetSizeTrait, + where + T: OffsetSizeTrait, { let input_value = as_generic_binary_array::(value)?; let array: ArrayRef = match self { @@ -362,8 +364,8 @@ impl Encoding { } fn decode_utf8_array(self, value: &dyn Array) -> Result - where - T: OffsetSizeTrait, + where + T: OffsetSizeTrait, { let input_value = as_generic_string_array::(value)?; let array: ArrayRef = match self { @@ -405,7 +407,7 @@ impl FromStr for Encoding { /// Standard encodings are base64 and hex. fn encode(args: &[ColumnarValue]) -> Result { if args.len() != 2 { - return internal_err!( + return exec_err!( "{:?} args were supplied but encode takes exactly two arguments", args.len() ); @@ -431,7 +433,7 @@ fn encode(args: &[ColumnarValue]) -> Result { /// Standard encodings are base64 and hex. fn decode(args: &[ColumnarValue]) -> Result { if args.len() != 2 { - return internal_err!( + return exec_err!( "{:?} args were supplied but decode takes exactly two arguments", args.len() ); diff --git a/datafusion/functions/src/math/abs.rs b/datafusion/functions/src/math/abs.rs index 21ca37fb8ec3..9ba0e3da2ad4 100644 --- a/datafusion/functions/src/math/abs.rs +++ b/datafusion/functions/src/math/abs.rs @@ -24,9 +24,9 @@ use arrow::array::Int32Array; use arrow::array::Int64Array; use arrow::array::Int8Array; use arrow::datatypes::DataType; -use datafusion_common::not_impl_err; +use datafusion_common::{exec_err, not_impl_err}; use datafusion_common::plan_datafusion_err; -use datafusion_common::{internal_err, Result, DataFusionError}; +use datafusion_common::{Result, DataFusionError}; use datafusion_expr::utils; use datafusion_expr::ColumnarValue; @@ -165,7 +165,7 @@ impl ScalarUDFImpl for AbsFunc { let args = ColumnarValue::values_to_arrays(args)?; if args.len() != 1 { - return internal_err!("abs function requires 1 argument, got {}", args.len()); + return exec_err!("abs function requires 1 argument, got {}", args.len()); } let input_data_type = args[0].data_type(); diff --git a/datafusion/functions/src/math/nans.rs b/datafusion/functions/src/math/nans.rs index 20754c18aa8e..c7868e6d5eca 100644 --- a/datafusion/functions/src/math/nans.rs +++ b/datafusion/functions/src/math/nans.rs @@ -18,14 +18,14 @@ //! 
Encoding expressions use arrow::datatypes::DataType; -use datafusion_common::{internal_err, Result, DataFusionError}; +use datafusion_common::{exec_err, DataFusionError, Result}; use datafusion_expr::ColumnarValue; +use arrow::array::{ArrayRef, BooleanArray, Float32Array, Float64Array}; use datafusion_expr::TypeSignature::*; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; use std::sync::Arc; -use arrow::array::{ArrayRef, BooleanArray, Float32Array, Float64Array}; #[derive(Debug)] pub(super) struct IsNanFunc { @@ -73,7 +73,7 @@ impl ScalarUDFImpl for IsNanFunc { BooleanArray, { f64::is_nan } )) - }, + } DataType::Float32 => { Arc::new(make_function_scalar_inputs_return_type!( &args[0], @@ -82,8 +82,8 @@ impl ScalarUDFImpl for IsNanFunc { BooleanArray, { f32::is_nan } )) - }, - other => return internal_err!("Unsupported data type {other:?} for function isnan"), + } + other => return exec_err!("Unsupported data type {other:?} for function isnan"), }; Ok(ColumnarValue::Array(arr)) } diff --git a/datafusion/physical-expr/src/aggregate/build_in.rs b/datafusion/physical-expr/src/aggregate/build_in.rs index 1a3d21fc40bc..2918856aa623 100644 --- a/datafusion/physical-expr/src/aggregate/build_in.rs +++ b/datafusion/physical-expr/src/aggregate/build_in.rs @@ -28,14 +28,15 @@ use std::sync::Arc; +use arrow::datatypes::Schema; + +use datafusion_common::{exec_err, not_impl_err, DataFusionError, Result}; +use datafusion_expr::AggregateFunction; + use crate::aggregate::regr::RegrType; use crate::expressions::{self, Literal}; use crate::{AggregateExpr, PhysicalExpr, PhysicalSortExpr}; -use arrow::datatypes::Schema; -use datafusion_common::{internal_err, not_impl_err, DataFusionError, Result}; -use datafusion_expr::AggregateFunction; - /// Create a physical aggregation expression. /// This function errors when `input_phy_exprs`' can't be coerced to a valid argument type of the aggregation function. 
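Editor's note: the theme of this commit is an error-classification rule rather than a behavior change. Errors reachable from user input (unsupported argument types, wrong argument counts) become `exec_err!`, while `internal_err!` stays reserved for invariants that only a DataFusion bug could violate. A small sketch of the convention; the macros are the real `datafusion_common` ones, the functions are illustrative:

```rust
use datafusion_common::{exec_err, internal_err, Result};

fn check_arg_count(args: &[i32]) -> Result<()> {
    if args.len() != 2 {
        // Reachable from user-provided SQL, so it is an execution error.
        return exec_err!("expected 2 arguments, got {}", args.len());
    }
    Ok(())
}

fn check_invariant(planner_bug: bool) -> Result<()> {
    if planner_bug {
        // Should be unreachable; signals a bug in DataFusion itself.
        return internal_err!("planner produced an invalid expression");
    }
    Ok(())
}

fn main() {
    assert!(check_arg_count(&[1, 2]).is_ok());
    assert!(check_arg_count(&[1]).is_err());
    assert!(check_invariant(false).is_ok());
}
```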
pub fn create_aggregate_expr( @@ -379,9 +380,7 @@ pub fn create_aggregate_expr( .downcast_ref::() .map(|literal| literal.value()) else { - return internal_err!( - "Second argument of NTH_VALUE needs to be a literal" - ); + return exec_err!("Second argument of NTH_VALUE needs to be a literal"); }; let nullable = expr.nullable(input_schema)?; Arc::new(expressions::NthValueAgg::new( @@ -415,17 +414,19 @@ pub fn create_aggregate_expr( #[cfg(test)] mod tests { - use super::*; + use arrow::datatypes::{DataType, Field}; + + use datafusion_common::{plan_err, ScalarValue}; + use datafusion_expr::type_coercion::aggregates::NUMERICS; + use datafusion_expr::{type_coercion, Signature}; + use crate::expressions::{ try_cast, ApproxDistinct, ApproxMedian, ApproxPercentileCont, ArrayAgg, Avg, BitAnd, BitOr, BitXor, BoolAnd, BoolOr, Correlation, Count, Covariance, DistinctArrayAgg, DistinctCount, Max, Min, Stddev, Sum, Variance, }; - use arrow::datatypes::{DataType, Field}; - use datafusion_common::{plan_err, ScalarValue}; - use datafusion_expr::type_coercion::aggregates::NUMERICS; - use datafusion_expr::{type_coercion, Signature}; + use super::*; #[test] fn test_count_arragg_approx_expr() -> Result<()> { diff --git a/datafusion/physical-expr/src/conditional_expressions.rs b/datafusion/physical-expr/src/conditional_expressions.rs index 782897d46379..cc8f3c8dfaf0 100644 --- a/datafusion/physical-expr/src/conditional_expressions.rs +++ b/datafusion/physical-expr/src/conditional_expressions.rs @@ -19,14 +19,14 @@ use arrow::array::{new_null_array, Array, BooleanArray}; use arrow::compute::kernels::zip::zip; use arrow::compute::{and, is_not_null, is_null}; -use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_common::{exec_err, DataFusionError, Result}; use datafusion_expr::ColumnarValue; /// coalesce evaluates to the first value which is not NULL pub fn coalesce(args: &[ColumnarValue]) -> Result { // do not accept 0 arguments. if args.is_empty() { - return internal_err!( + return exec_err!( "coalesce was called with {} arguments. It requires at least 1.", args.len() ); diff --git a/datafusion/physical-expr/src/crypto_expressions.rs b/datafusion/physical-expr/src/crypto_expressions.rs index 580b0ed01b6e..3ff3bc83f297 100644 --- a/datafusion/physical-expr/src/crypto_expressions.rs +++ b/datafusion/physical-expr/src/crypto_expressions.rs @@ -23,11 +23,11 @@ use arrow::{ }; use blake2::{Blake2b512, Blake2s256, Digest}; use blake3::Hasher as Blake3; -use datafusion_common::ScalarValue; use datafusion_common::{ cast::{as_binary_array, as_generic_binary_array, as_generic_string_array}, plan_err, }; +use datafusion_common::{exec_err, ScalarValue}; use datafusion_common::{internal_err, DataFusionError, Result}; use datafusion_expr::ColumnarValue; use md5::Md5; @@ -66,7 +66,7 @@ fn digest_process( DataType::LargeBinary => { digest_algorithm.digest_binary_array::(a.as_ref()) } - other => internal_err!( + other => exec_err!( "Unsupported data type {other:?} for function {digest_algorithm}" ), }, @@ -77,7 +77,7 @@ fn digest_process( } ScalarValue::Binary(a) | ScalarValue::LargeBinary(a) => Ok(digest_algorithm .digest_scalar(a.as_ref().map(|v: &Vec| v.as_slice()))), - other => internal_err!( + other => exec_err!( "Unsupported data type {other:?} for function {digest_algorithm}" ), }, @@ -238,7 +238,7 @@ macro_rules! 
define_digest_function { #[doc = $DOC] pub fn $NAME(args: &[ColumnarValue]) -> Result { if args.len() != 1 { - return internal_err!( + return exec_err!( "{:?} args were supplied but {} takes exactly one argument", args.len(), DigestAlgorithm::$METHOD.to_string() @@ -264,7 +264,7 @@ fn hex_encode>(data: T) -> String { /// computes md5 hash digest of the given input pub fn md5(args: &[ColumnarValue]) -> Result { if args.len() != 1 { - return internal_err!( + return exec_err!( "{:?} args were supplied but {} takes exactly one argument", args.len(), DigestAlgorithm::Md5 @@ -284,7 +284,7 @@ pub fn md5(args: &[ColumnarValue]) -> Result { ColumnarValue::Scalar(ScalarValue::Binary(opt)) => { ColumnarValue::Scalar(ScalarValue::Utf8(opt.map(hex_encode::<_>))) } - _ => return internal_err!("Impossibly got invalid results from digest"), + _ => return exec_err!("Impossibly got invalid results from digest"), }) } @@ -329,7 +329,7 @@ define_digest_function!( /// Standard algorithms are md5, sha1, sha224, sha256, sha384 and sha512. pub fn digest(args: &[ColumnarValue]) -> Result { if args.len() != 2 { - return internal_err!( + return exec_err!( "{:?} args were supplied but digest takes exactly two arguments", args.len() ); @@ -339,7 +339,7 @@ pub fn digest(args: &[ColumnarValue]) -> Result { ScalarValue::Utf8(Some(method)) | ScalarValue::LargeUtf8(Some(method)) => { method.parse::() } - other => internal_err!("Unsupported data type {other:?} for function digest"), + other => exec_err!("Unsupported data type {other:?} for function digest"), }, ColumnarValue::Array(_) => { internal_err!("Digest using dynamically decided method is not yet supported") diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 0dc3f96dc12a..c91b96d67a22 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -42,7 +42,7 @@ use arrow::{ datatypes::{DataType, Int32Type, Int64Type, Schema}, }; use arrow_array::Array; -use datafusion_common::{internal_err, DataFusionError, Result, ScalarValue}; +use datafusion_common::{exec_err, DataFusionError, Result, ScalarValue}; pub use datafusion_expr::FuncMonotonicity; use datafusion_expr::{ type_coercion::functions::data_types, BuiltinScalarFunction, ColumnarValue, @@ -95,6 +95,7 @@ macro_rules! invoke_if_crypto_expressions_feature_flag { #[cfg(not(feature = "crypto_expressions"))] macro_rules! 
invoke_if_crypto_expressions_feature_flag { ($FUNC:ident, $NAME:expr) => { + use datafusion_common::internal_err; |_: &[ColumnarValue]| -> Result { internal_err!( "function {} requires compilation with feature flag: crypto_expressions.", @@ -433,7 +434,7 @@ pub fn create_physical_fun( DataType::LargeUtf8 => { make_scalar_function_inner(string_expressions::ascii::)(args) } - other => internal_err!("Unsupported data type {other:?} for function ascii"), + other => exec_err!("Unsupported data type {other:?} for function ascii"), }), BuiltinScalarFunction::BitLength => Arc::new(|args| match &args[0] { ColumnarValue::Array(v) => Ok(ColumnarValue::Array(bit_length(v.as_ref())?)), @@ -454,7 +455,7 @@ pub fn create_physical_fun( DataType::LargeUtf8 => { make_scalar_function_inner(string_expressions::btrim::)(args) } - other => internal_err!("Unsupported data type {other:?} for function btrim"), + other => exec_err!("Unsupported data type {other:?} for function btrim"), }), BuiltinScalarFunction::CharacterLength => { Arc::new(|args| match args[0].data_type() { @@ -474,7 +475,7 @@ pub fn create_physical_fun( ); make_scalar_function_inner(func)(args) } - other => internal_err!( + other => exec_err!( "Unsupported data type {other:?} for function character_length" ), }) @@ -536,7 +537,7 @@ pub fn create_physical_fun( make_scalar_function_inner(string_expressions::initcap::)(args) } other => { - internal_err!("Unsupported data type {other:?} for function initcap") + exec_err!("Unsupported data type {other:?} for function initcap") } }), BuiltinScalarFunction::InStr => Arc::new(|args| match args[0].data_type() { @@ -546,7 +547,7 @@ pub fn create_physical_fun( DataType::LargeUtf8 => { make_scalar_function_inner(string_expressions::instr::)(args) } - other => internal_err!("Unsupported data type {other:?} for function instr"), + other => exec_err!("Unsupported data type {other:?} for function instr"), }), BuiltinScalarFunction::Left => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { @@ -557,7 +558,7 @@ pub fn create_physical_fun( let func = invoke_if_unicode_expressions_feature_flag!(left, i64, "left"); make_scalar_function_inner(func)(args) } - other => internal_err!("Unsupported data type {other:?} for function left"), + other => exec_err!("Unsupported data type {other:?} for function left"), }), BuiltinScalarFunction::Lower => Arc::new(string_expressions::lower), BuiltinScalarFunction::Lpad => Arc::new(|args| match args[0].data_type() { @@ -569,7 +570,7 @@ pub fn create_physical_fun( let func = invoke_if_unicode_expressions_feature_flag!(lpad, i64, "lpad"); make_scalar_function_inner(func)(args) } - other => internal_err!("Unsupported data type {other:?} for function lpad"), + other => exec_err!("Unsupported data type {other:?} for function lpad"), }), BuiltinScalarFunction::Ltrim => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { @@ -578,7 +579,7 @@ pub fn create_physical_fun( DataType::LargeUtf8 => { make_scalar_function_inner(string_expressions::ltrim::)(args) } - other => internal_err!("Unsupported data type {other:?} for function ltrim"), + other => exec_err!("Unsupported data type {other:?} for function ltrim"), }), BuiltinScalarFunction::MD5 => { Arc::new(invoke_if_crypto_expressions_feature_flag!(md5, "md5")) @@ -616,7 +617,7 @@ pub fn create_physical_fun( make_scalar_function_inner(func)(args) } other => { - internal_err!("Unsupported data type {other:?} for function regexp_like") + exec_err!("Unsupported data type {other:?} for function regexp_like") } }), 
BuiltinScalarFunction::RegexpMatch => { @@ -637,9 +638,9 @@ pub fn create_physical_fun( ); make_scalar_function_inner(func)(args) } - other => internal_err!( - "Unsupported data type {other:?} for function regexp_match" - ), + other => { + exec_err!("Unsupported data type {other:?} for function regexp_match") + } }) } BuiltinScalarFunction::RegexpReplace => { @@ -662,7 +663,7 @@ pub fn create_physical_fun( let func = specializer_func(args)?; func(args) } - other => internal_err!( + other => exec_err!( "Unsupported data type {other:?} for function regexp_replace" ), }) @@ -674,7 +675,7 @@ pub fn create_physical_fun( DataType::LargeUtf8 => { make_scalar_function_inner(string_expressions::repeat::)(args) } - other => internal_err!("Unsupported data type {other:?} for function repeat"), + other => exec_err!("Unsupported data type {other:?} for function repeat"), }), BuiltinScalarFunction::Replace => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { @@ -684,7 +685,7 @@ pub fn create_physical_fun( make_scalar_function_inner(string_expressions::replace::)(args) } other => { - internal_err!("Unsupported data type {other:?} for function replace") + exec_err!("Unsupported data type {other:?} for function replace") } }), BuiltinScalarFunction::Reverse => Arc::new(|args| match args[0].data_type() { @@ -699,7 +700,7 @@ pub fn create_physical_fun( make_scalar_function_inner(func)(args) } other => { - internal_err!("Unsupported data type {other:?} for function reverse") + exec_err!("Unsupported data type {other:?} for function reverse") } }), BuiltinScalarFunction::Right => Arc::new(|args| match args[0].data_type() { @@ -713,7 +714,7 @@ pub fn create_physical_fun( invoke_if_unicode_expressions_feature_flag!(right, i64, "right"); make_scalar_function_inner(func)(args) } - other => internal_err!("Unsupported data type {other:?} for function right"), + other => exec_err!("Unsupported data type {other:?} for function right"), }), BuiltinScalarFunction::Rpad => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { @@ -724,7 +725,7 @@ pub fn create_physical_fun( let func = invoke_if_unicode_expressions_feature_flag!(rpad, i64, "rpad"); make_scalar_function_inner(func)(args) } - other => internal_err!("Unsupported data type {other:?} for function rpad"), + other => exec_err!("Unsupported data type {other:?} for function rpad"), }), BuiltinScalarFunction::Rtrim => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { @@ -733,7 +734,7 @@ pub fn create_physical_fun( DataType::LargeUtf8 => { make_scalar_function_inner(string_expressions::rtrim::)(args) } - other => internal_err!("Unsupported data type {other:?} for function rtrim"), + other => exec_err!("Unsupported data type {other:?} for function rtrim"), }), BuiltinScalarFunction::SHA224 => { Arc::new(invoke_if_crypto_expressions_feature_flag!(sha224, "sha224")) @@ -755,7 +756,7 @@ pub fn create_physical_fun( make_scalar_function_inner(string_expressions::split_part::)(args) } other => { - internal_err!("Unsupported data type {other:?} for function split_part") + exec_err!("Unsupported data type {other:?} for function split_part") } }), BuiltinScalarFunction::StringToArray => { @@ -767,7 +768,7 @@ pub fn create_physical_fun( array_expressions::string_to_array::, )(args), other => { - internal_err!( + exec_err!( "Unsupported data type {other:?} for function string_to_array" ) } @@ -781,7 +782,7 @@ pub fn create_physical_fun( make_scalar_function_inner(string_expressions::starts_with::)(args) } other => { - 
internal_err!("Unsupported data type {other:?} for function starts_with") + exec_err!("Unsupported data type {other:?} for function starts_with") } }), BuiltinScalarFunction::EndsWith => Arc::new(|args| match args[0].data_type() { @@ -792,7 +793,7 @@ pub fn create_physical_fun( make_scalar_function_inner(string_expressions::ends_with::)(args) } other => { - internal_err!("Unsupported data type {other:?} for function ends_with") + exec_err!("Unsupported data type {other:?} for function ends_with") } }), BuiltinScalarFunction::Strpos => Arc::new(|args| match args[0].data_type() { @@ -808,7 +809,7 @@ pub fn create_physical_fun( ); make_scalar_function_inner(func)(args) } - other => internal_err!("Unsupported data type {other:?} for function strpos"), + other => exec_err!("Unsupported data type {other:?} for function strpos"), }), BuiltinScalarFunction::Substr => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { @@ -821,7 +822,7 @@ pub fn create_physical_fun( invoke_if_unicode_expressions_feature_flag!(substr, i64, "substr"); make_scalar_function_inner(func)(args) } - other => internal_err!("Unsupported data type {other:?} for function substr"), + other => exec_err!("Unsupported data type {other:?} for function substr"), }), BuiltinScalarFunction::ToHex => Arc::new(|args| match args[0].data_type() { DataType::Int32 => { @@ -830,7 +831,7 @@ pub fn create_physical_fun( DataType::Int64 => { make_scalar_function_inner(string_expressions::to_hex::)(args) } - other => internal_err!("Unsupported data type {other:?} for function to_hex"), + other => exec_err!("Unsupported data type {other:?} for function to_hex"), }), BuiltinScalarFunction::Translate => Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { @@ -850,7 +851,7 @@ pub fn create_physical_fun( make_scalar_function_inner(func)(args) } other => { - internal_err!("Unsupported data type {other:?} for function translate") + exec_err!("Unsupported data type {other:?} for function translate") } }), BuiltinScalarFunction::Trim => Arc::new(|args| match args[0].data_type() { @@ -860,13 +861,13 @@ pub fn create_physical_fun( DataType::LargeUtf8 => { make_scalar_function_inner(string_expressions::btrim::)(args) } - other => internal_err!("Unsupported data type {other:?} for function trim"), + other => exec_err!("Unsupported data type {other:?} for function trim"), }), BuiltinScalarFunction::Upper => Arc::new(string_expressions::upper), BuiltinScalarFunction::Uuid => Arc::new(string_expressions::uuid), BuiltinScalarFunction::ArrowTypeof => Arc::new(move |args| { if args.len() != 1 { - return internal_err!( + return exec_err!( "arrow_typeof function requires 1 arguments, got {}", args.len() ); @@ -884,9 +885,7 @@ pub fn create_physical_fun( DataType::LargeUtf8 => { make_scalar_function_inner(string_expressions::overlay::)(args) } - other => Err(DataFusionError::Internal(format!( - "Unsupported data type {other:?} for function overlay", - ))), + other => exec_err!("Unsupported data type {other:?} for function overlay"), }), BuiltinScalarFunction::Levenshtein => { Arc::new(|args| match args[0].data_type() { @@ -896,9 +895,9 @@ pub fn create_physical_fun( DataType::LargeUtf8 => make_scalar_function_inner( string_expressions::levenshtein::, )(args), - other => Err(DataFusionError::Internal(format!( - "Unsupported data type {other:?} for function levenshtein", - ))), + other => { + exec_err!("Unsupported data type {other:?} for function levenshtein") + } }) } BuiltinScalarFunction::SubstrIndex => { @@ -919,9 +918,9 @@ pub fn 
create_physical_fun( ); make_scalar_function_inner(func)(args) } - other => Err(DataFusionError::Internal(format!( - "Unsupported data type {other:?} for function substr_index", - ))), + other => { + exec_err!("Unsupported data type {other:?} for function substr_index") + } }) } BuiltinScalarFunction::FindInSet => Arc::new(|args| match args[0].data_type() { @@ -941,9 +940,9 @@ pub fn create_physical_fun( ); make_scalar_function_inner(func)(args) } - other => Err(DataFusionError::Internal(format!( - "Unsupported data type {other:?} for function find_in_set", - ))), + other => { + exec_err!("Unsupported data type {other:?} for function find_in_set") + } }), }) } @@ -1023,7 +1022,7 @@ mod tests { record_batch::RecordBatch, }; use datafusion_common::cast::{as_boolean_array, as_uint64_array}; - use datafusion_common::{exec_err, plan_err}; + use datafusion_common::{exec_err, internal_err, plan_err}; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::type_coercion::functions::data_types; use datafusion_expr::Signature; diff --git a/datafusion/physical-expr/src/math_expressions.rs b/datafusion/physical-expr/src/math_expressions.rs index b622aee8e2b3..98a05dff5386 100644 --- a/datafusion/physical-expr/src/math_expressions.rs +++ b/datafusion/physical-expr/src/math_expressions.rs @@ -17,19 +17,20 @@ //! Math expressions +use std::any::type_name; +use std::iter; +use std::mem::swap; +use std::sync::Arc; + use arrow::array::ArrayRef; use arrow::array::{BooleanArray, Float32Array, Float64Array, Int64Array}; use arrow::datatypes::DataType; -use datafusion_common::internal_err; -use datafusion_common::ScalarValue; +use rand::{thread_rng, Rng}; + use datafusion_common::ScalarValue::{Float32, Int64}; +use datafusion_common::{exec_err, ScalarValue}; use datafusion_common::{DataFusionError, Result}; use datafusion_expr::ColumnarValue; -use rand::{thread_rng, Rng}; -use std::any::type_name; -use std::iter; -use std::mem::swap; -use std::sync::Arc; macro_rules! downcast_compute_op { ($ARRAY:expr, $NAME:expr, $FUNC:ident, $TYPE:ident) => {{ @@ -40,7 +41,7 @@ macro_rules! downcast_compute_op { arrow::compute::kernels::arity::unary(array, |x| x.$FUNC()); Ok(Arc::new(res)) } - _ => internal_err!("Invalid data type for {}", $NAME), + _ => exec_err!("Invalid data type for {}", $NAME), } }}; } @@ -57,11 +58,9 @@ macro_rules! unary_primitive_array_op { let result = downcast_compute_op!(array, $NAME, $FUNC, Float64Array); Ok(ColumnarValue::Array(result?)) } - other => internal_err!( - "Unsupported data type {:?} for function {}", - other, - $NAME - ), + other => { + exec_err!("Unsupported data type {:?} for function {}", other, $NAME) + } }, ColumnarValue::Scalar(a) => match a { ScalarValue::Float32(a) => Ok(ColumnarValue::Scalar( @@ -70,7 +69,7 @@ macro_rules! 
unary_primitive_array_op { ScalarValue::Float64(a) => Ok(ColumnarValue::Scalar( ScalarValue::Float64(a.map(|x| x.$FUNC())), )), - _ => internal_err!( + _ => exec_err!( "Unsupported data type {:?} for function {}", ($VALUE).data_type(), $NAME @@ -187,7 +186,7 @@ pub fn factorial(args: &[ArrayRef]) -> Result { Int64Array, { |value: i64| { (1..=value).product() } } )) as ArrayRef), - other => internal_err!("Unsupported data type {other:?} for function factorial."), + other => exec_err!("Unsupported data type {other:?} for function factorial."), } } @@ -234,7 +233,7 @@ pub fn gcd(args: &[ArrayRef]) -> Result { Int64Array, { compute_gcd } )) as ArrayRef), - other => internal_err!("Unsupported data type {other:?} for function gcd"), + other => exec_err!("Unsupported data type {other:?} for function gcd"), } } @@ -260,7 +259,7 @@ pub fn lcm(args: &[ArrayRef]) -> Result { Int64Array, { compute_lcm } )) as ArrayRef), - other => internal_err!("Unsupported data type {other:?} for function lcm"), + other => exec_err!("Unsupported data type {other:?} for function lcm"), } } @@ -305,7 +304,7 @@ pub fn nanvl(args: &[ArrayRef]) -> Result { )) as ArrayRef) } - other => internal_err!("Unsupported data type {other:?} for function nanvl"), + other => exec_err!("Unsupported data type {other:?} for function nanvl"), } } @@ -328,7 +327,7 @@ pub fn isnan(args: &[ArrayRef]) -> Result { { f32::is_nan } )) as ArrayRef), - other => internal_err!("Unsupported data type {other:?} for function isnan"), + other => exec_err!("Unsupported data type {other:?} for function isnan"), } } @@ -351,14 +350,14 @@ pub fn iszero(args: &[ArrayRef]) -> Result { { |x: f32| { x == 0_f32 } } )) as ArrayRef), - other => internal_err!("Unsupported data type {other:?} for function iszero"), + other => exec_err!("Unsupported data type {other:?} for function iszero"), } } /// Pi SQL function pub fn pi(args: &[ColumnarValue]) -> Result { if !matches!(&args[0], ColumnarValue::Array(_)) { - return internal_err!("Expect pi function to take no param"); + return exec_err!("Expect pi function to take no param"); } let array = Float64Array::from_value(std::f64::consts::PI, 1); Ok(ColumnarValue::Array(Arc::new(array))) @@ -368,7 +367,7 @@ pub fn pi(args: &[ColumnarValue]) -> Result { pub fn random(args: &[ColumnarValue]) -> Result { let len: usize = match &args[0] { ColumnarValue::Array(array) => array.len(), - _ => return internal_err!("Expect random function to take no param"), + _ => return exec_err!("Expect random function to take no param"), }; let mut rng = thread_rng(); let values = iter::repeat_with(|| rng.gen_range(0.0..1.0)).take(len); @@ -379,7 +378,7 @@ pub fn random(args: &[ColumnarValue]) -> Result { /// Round SQL function pub fn round(args: &[ArrayRef]) -> Result { if args.len() != 1 && args.len() != 2 { - return internal_err!( + return exec_err!( "round function requires one or two arguments, got {}", args.len() ); @@ -423,9 +422,9 @@ pub fn round(args: &[ArrayRef]) -> Result { } } )) as ArrayRef), - _ => internal_err!( - "round function requires a scalar or array for decimal_places" - ), + _ => { + exec_err!("round function requires a scalar or array for decimal_places") + } }, DataType::Float32 => match decimal_places { @@ -459,12 +458,12 @@ pub fn round(args: &[ArrayRef]) -> Result { } } )) as ArrayRef), - _ => internal_err!( - "round function requires a scalar or array for decimal_places" - ), + _ => { + exec_err!("round function requires a scalar or array for decimal_places") + } }, - other => internal_err!("Unsupported data type 
{other:?} for function round"), + other => exec_err!("Unsupported data type {other:?} for function round"), } } @@ -489,7 +488,7 @@ pub fn power(args: &[ArrayRef]) -> Result { { i64::pow } )) as ArrayRef), - other => internal_err!("Unsupported data type {other:?} for function power"), + other => exec_err!("Unsupported data type {other:?} for function power"), } } @@ -514,7 +513,7 @@ pub fn atan2(args: &[ArrayRef]) -> Result { { f32::atan2 } )) as ArrayRef), - other => internal_err!("Unsupported data type {other:?} for function atan2"), + other => exec_err!("Unsupported data type {other:?} for function atan2"), } } @@ -547,7 +546,7 @@ pub fn log(args: &[ArrayRef]) -> Result { Float64Array, { f64::log } )) as ArrayRef), - _ => internal_err!("log function requires a scalar or array for base"), + _ => exec_err!("log function requires a scalar or array for base"), }, DataType::Float32 => match base { @@ -565,10 +564,10 @@ pub fn log(args: &[ArrayRef]) -> Result { Float32Array, { f32::log } )) as ArrayRef), - _ => internal_err!("log function requires a scalar or array for base"), + _ => exec_err!("log function requires a scalar or array for base"), }, - other => internal_err!("Unsupported data type {other:?} for function log"), + other => exec_err!("Unsupported data type {other:?} for function log"), } } @@ -589,7 +588,7 @@ pub fn cot(args: &[ArrayRef]) -> Result { { compute_cot32 } )) as ArrayRef), - other => internal_err!("Unsupported data type {other:?} for function cot"), + other => exec_err!("Unsupported data type {other:?} for function cot"), } } @@ -606,7 +605,7 @@ fn compute_cot64(x: f64) -> f64 { /// Truncate(numeric, decimalPrecision) and trunc(numeric) SQL function pub fn trunc(args: &[ArrayRef]) -> Result { if args.len() != 1 && args.len() != 2 { - return internal_err!( + return exec_err!( "truncate function requires one or two arguments, got {}", args.len() ); @@ -635,7 +634,7 @@ pub fn trunc(args: &[ArrayRef]) -> Result { Int64Array, { compute_truncate64 } )) as ArrayRef), - _ => internal_err!("trunc function requires a scalar or array for precision"), + _ => exec_err!("trunc function requires a scalar or array for precision"), }, DataType::Float32 => match precision { ColumnarValue::Scalar(Int64(Some(0))) => Ok(Arc::new( @@ -650,9 +649,9 @@ pub fn trunc(args: &[ArrayRef]) -> Result { Int64Array, { compute_truncate32 } )) as ArrayRef), - _ => internal_err!("trunc function requires a scalar or array for precision"), + _ => exec_err!("trunc function requires a scalar or array for precision"), }, - other => internal_err!("Unsupported data type {other:?} for function trunc"), + other => exec_err!("Unsupported data type {other:?} for function trunc"), } } @@ -668,13 +667,14 @@ fn compute_truncate64(x: f64, y: i64) -> f64 { #[cfg(test)] mod tests { - - use super::*; use arrow::array::{Float64Array, NullArray}; + use datafusion_common::cast::{ as_boolean_array, as_float32_array, as_float64_array, as_int64_array, }; + use super::*; + #[test] fn test_random_expression() { let args = vec![ColumnarValue::Array(Arc::new(NullArray::new(1)))]; diff --git a/datafusion/physical-expr/src/regex_expressions.rs b/datafusion/physical-expr/src/regex_expressions.rs index b1334854ba0b..846e5801af1c 100644 --- a/datafusion/physical-expr/src/regex_expressions.rs +++ b/datafusion/physical-expr/src/regex_expressions.rs @@ -21,19 +21,18 @@ //! 
Regex expressions +use std::sync::{Arc, OnceLock}; + use arrow::array::{ new_null_array, Array, ArrayDataBuilder, ArrayRef, BufferBuilder, GenericStringArray, OffsetSizeTrait, }; +use hashbrown::HashMap; +use regex::Regex; use datafusion_common::{arrow_datafusion_err, exec_err, plan_err}; -use datafusion_common::{ - cast::as_generic_string_array, internal_err, DataFusionError, Result, -}; +use datafusion_common::{cast::as_generic_string_array, DataFusionError, Result}; use datafusion_expr::{ColumnarValue, ScalarFunctionImplementation}; -use hashbrown::HashMap; -use regex::Regex; -use std::sync::{Arc, OnceLock}; use crate::functions::{ make_scalar_function_inner, make_scalar_function_with_hints, Hint, @@ -188,7 +187,7 @@ pub fn regexp_match(args: &[ArrayRef]) -> Result { arrow_string::regexp::regexp_match(values, regex, Some(flags)) .map_err(|e| arrow_datafusion_err!(e)) } - other => internal_err!( + other => exec_err!( "regexp_match was called with {other} arguments. It requires at least 2 and at most 3." ), } @@ -341,7 +340,7 @@ pub fn regexp_replace(args: &[ArrayRef]) -> Result Ok(Arc::new(result) as ArrayRef) } - other => internal_err!( + other => exec_err!( "regexp_replace was called with {other} arguments. It requires at least 3 and at most 4." ), } @@ -374,7 +373,7 @@ fn _regexp_replace_static_pattern_replace( 3 => None, 4 => Some(fetch_string_arg!(&args[3], "flags", T, _regexp_replace_early_abort)), other => { - return internal_err!( + return exec_err!( "regexp_replace was called with {other} arguments. It requires at least 3 and at most 4." ) } diff --git a/datafusion/physical-expr/src/string_expressions.rs b/datafusion/physical-expr/src/string_expressions.rs index 34a436ebe3cd..6a4a29763e4b 100644 --- a/datafusion/physical-expr/src/string_expressions.rs +++ b/datafusion/physical-expr/src/string_expressions.rs @@ -21,6 +21,12 @@ //! 
String expressions +use std::sync::Arc; +use std::{ + fmt::{Display, Formatter}, + iter, +}; + use arrow::{ array::{ Array, ArrayRef, GenericStringArray, Int32Array, Int64Array, OffsetSizeTrait, @@ -28,6 +34,8 @@ use arrow::{ }, datatypes::{ArrowNativeType, ArrowPrimitiveType, DataType}, }; +use uuid::Uuid; + use datafusion_common::utils::datafusion_strsim; use datafusion_common::{ cast::{ @@ -35,14 +43,8 @@ use datafusion_common::{ }, exec_err, ScalarValue, }; -use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_common::{DataFusionError, Result}; use datafusion_expr::ColumnarValue; -use std::sync::Arc; -use std::{ - fmt::{Display, Formatter}, - iter, -}; -use uuid::Uuid; /// applies a unary expression to `args[0]` that is expected to be downcastable to /// a `GenericStringArray` and returns a `GenericStringArray` (which may have a different offset) @@ -62,7 +64,7 @@ where F: Fn(&'a str) -> R, { if args.len() != 1 { - return internal_err!( + return exec_err!( "{:?} args were supplied but {} takes exactly one argument", args.len(), name @@ -102,7 +104,7 @@ where &[a.as_ref()], op, name )?))) } - other => internal_err!("Unsupported data type {other:?} for function {name}"), + other => exec_err!("Unsupported data type {other:?} for function {name}"), }, ColumnarValue::Scalar(scalar) => match scalar { ScalarValue::Utf8(a) => { @@ -113,7 +115,7 @@ where let result = a.as_ref().map(|x| (op)(x).as_ref().to_string()); Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(result))) } - other => internal_err!("Unsupported data type {other:?} for function {name}"), + other => exec_err!("Unsupported data type {other:?} for function {name}"), }, } } @@ -170,7 +172,7 @@ pub fn chr(args: &[ArrayRef]) -> Result { pub fn concat(args: &[ColumnarValue]) -> Result { // do not accept 0 arguments. if args.is_empty() { - return internal_err!( + return exec_err!( "concat was called with {} arguments. It requires at least 1.", args.len() ); @@ -236,7 +238,7 @@ pub fn concat_ws(args: &[ArrayRef]) -> Result { // do not accept 0 or 1 arguments. if args.len() < 2 { - return internal_err!( + return exec_err!( "concat_ws was called with {} arguments. It requires at least 2.", args.len() ); @@ -333,7 +335,7 @@ pub fn instr(args: &[ArrayRef]) -> Result { Ok(Arc::new(result) as ArrayRef) } other => { - internal_err!( + exec_err!( "instr was called with {other} datatype arguments. It requires Utf8 or LargeUtf8." ) } @@ -410,7 +412,7 @@ fn general_trim( Ok(Arc::new(result) as ArrayRef) } other => { - internal_err!( + exec_err!( "{trim_type} was called with {other} arguments. It requires at least 1 and at most 2." ) } @@ -541,7 +543,7 @@ where } else if let Some(value_isize) = value.to_isize() { Ok(Some(format!("{value_isize:x}"))) } else { - internal_err!("Unsupported data type {integer:?} for function to_hex") + exec_err!("Unsupported data type {integer:?} for function to_hex") } } else { Ok(None) @@ -563,7 +565,7 @@ pub fn upper(args: &[ColumnarValue]) -> Result { pub fn uuid(args: &[ColumnarValue]) -> Result { let len: usize = match &args[0] { ColumnarValue::Array(array) => array.len(), - _ => return internal_err!("Expect uuid function to take no param"), + _ => return exec_err!("Expect uuid function to take no param"), }; let values = iter::repeat_with(|| Uuid::new_v4().to_string()).take(len); @@ -654,9 +656,7 @@ pub fn overlay(args: &[ArrayRef]) -> Result { Ok(Arc::new(result) as ArrayRef) } other => { - internal_err!( - "overlay was called with {other} arguments. It requires 3 or 4." 
- ) + exec_err!("overlay was called with {other} arguments. It requires 3 or 4.") } } } @@ -665,10 +665,10 @@ pub fn overlay(args: &[ArrayRef]) -> Result { /// LEVENSHTEIN('kitten', 'sitting') = 3 pub fn levenshtein(args: &[ArrayRef]) -> Result { if args.len() != 2 { - return Err(DataFusionError::Internal(format!( + return exec_err!( "levenshtein function requires two arguments, got {}", args.len() - ))); + ); } let str1_array = as_generic_string_array::(&args[0])?; let str2_array = as_generic_string_array::(&args[1])?; @@ -700,7 +700,7 @@ pub fn levenshtein(args: &[ArrayRef]) -> Result { Ok(Arc::new(result) as ArrayRef) } other => { - internal_err!( + exec_err!( "levenshtein was called with {other} datatype arguments. It requires Utf8 or LargeUtf8." ) } @@ -709,12 +709,13 @@ pub fn levenshtein(args: &[ArrayRef]) -> Result { #[cfg(test)] mod tests { - - use crate::string_expressions; use arrow::{array::Int32Array, datatypes::Int32Type}; use arrow_array::Int64Array; + use datafusion_common::cast::as_int32_array; + use crate::string_expressions; + use super::*; #[test] diff --git a/datafusion/physical-expr/src/unicode_expressions.rs b/datafusion/physical-expr/src/unicode_expressions.rs index 240efe4223c3..3209a6176fad 100644 --- a/datafusion/physical-expr/src/unicode_expressions.rs +++ b/datafusion/physical-expr/src/unicode_expressions.rs @@ -21,18 +21,20 @@ //! Unicode expressions +use std::cmp::{max, Ordering}; +use std::sync::Arc; + use arrow::{ array::{ArrayRef, GenericStringArray, OffsetSizeTrait, PrimitiveArray}, datatypes::{ArrowNativeType, ArrowPrimitiveType}, }; +use hashbrown::HashMap; +use unicode_segmentation::UnicodeSegmentation; + use datafusion_common::{ cast::{as_generic_string_array, as_int64_array}, - exec_err, internal_err, DataFusionError, Result, + exec_err, DataFusionError, Result, }; -use hashbrown::HashMap; -use std::cmp::{max, Ordering}; -use std::sync::Arc; -use unicode_segmentation::UnicodeSegmentation; /// Returns number of characters in the string. /// character_length('josé') = 4 @@ -312,7 +314,7 @@ pub fn rpad(args: &[ArrayRef]) -> Result { Ok(Arc::new(result) as ArrayRef) } - other => internal_err!( + other => exec_err!( "rpad was called with {other} arguments. It requires at least 2 and at most 3." ), } @@ -407,7 +409,7 @@ pub fn substr(args: &[ArrayRef]) -> Result { Ok(Arc::new(result) as ArrayRef) } other => { - internal_err!("substr was called with {other} arguments. It requires 2 or 3.") + exec_err!("substr was called with {other} arguments. It requires 2 or 3.") } } } @@ -463,7 +465,7 @@ pub fn translate(args: &[ArrayRef]) -> Result { /// SUBSTRING_INDEX('www.apache.org', '.', -1) = org pub fn substr_index(args: &[ArrayRef]) -> Result { if args.len() != 3 { - return internal_err!( + return exec_err!( "substr_index was called with {} arguments. It requires 3.", args.len() ); @@ -528,7 +530,7 @@ where T::Native: OffsetSizeTrait, { if args.len() != 2 { - return internal_err!( + return exec_err!( "find_in_set was called with {} arguments. 
It requires 2.", args.len() ); From eced5bc002b1a0442f55e43751973292f88d17d9 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Tue, 27 Feb 2024 19:04:09 +0300 Subject: [PATCH 29/45] Address reviews --- .../examples/custom_datasource.rs | 12 ++-- .../datasource/physical_plan/arrow_file.rs | 14 ++-- .../core/src/datasource/physical_plan/avro.rs | 14 ++-- .../core/src/datasource/physical_plan/csv.rs | 14 ++-- .../core/src/datasource/physical_plan/json.rs | 14 ++-- .../datasource/physical_plan/parquet/mod.rs | 16 ++--- .../enforce_distribution.rs | 12 ++-- .../physical_optimizer/output_requirements.rs | 12 ++-- datafusion/core/src/physical_planner.rs | 12 ++-- datafusion/core/src/test/mod.rs | 12 ++-- datafusion/core/src/test_util/mod.rs | 14 ++-- datafusion/core/tests/custom_sources.rs | 12 ++-- .../provider_filter_pushdown.rs | 12 ++-- .../tests/custom_sources_cases/statistics.rs | 12 ++-- .../tests/user_defined/user_defined_plan.rs | 12 ++-- .../physical-plan/src/aggregates/mod.rs | 24 +++---- datafusion/physical-plan/src/analyze.rs | 14 ++-- .../physical-plan/src/coalesce_batches.rs | 12 ++-- .../physical-plan/src/coalesce_partitions.rs | 12 ++-- datafusion/physical-plan/src/display.rs | 4 +- datafusion/physical-plan/src/empty.rs | 12 ++-- datafusion/physical-plan/src/explain.rs | 12 ++-- datafusion/physical-plan/src/filter.rs | 15 +++-- datafusion/physical-plan/src/insert.rs | 10 +-- .../physical-plan/src/joins/cross_join.rs | 18 ++--- .../physical-plan/src/joins/hash_join.rs | 18 ++--- .../src/joins/nested_loop_join.rs | 19 +++--- .../src/joins/sort_merge_join.rs | 19 +++--- .../src/joins/symmetric_hash_join.rs | 19 +++--- datafusion/physical-plan/src/lib.rs | 20 +++--- datafusion/physical-plan/src/limit.rs | 22 +++--- datafusion/physical-plan/src/memory.rs | 14 ++-- .../physical-plan/src/placeholder_row.rs | 16 ++--- datafusion/physical-plan/src/projection.rs | 16 ++--- .../physical-plan/src/recursive_query.rs | 12 ++-- .../physical-plan/src/repartition/mod.rs | 17 +++-- .../physical-plan/src/sorts/partial_sort.rs | 14 ++-- datafusion/physical-plan/src/sorts/sort.rs | 15 ++--- .../src/sorts/sort_preserving_merge.rs | 12 ++-- datafusion/physical-plan/src/streaming.rs | 14 ++-- datafusion/physical-plan/src/test/exec.rs | 67 ++++++++++--------- datafusion/physical-plan/src/union.rs | 30 ++++----- datafusion/physical-plan/src/unnest.rs | 14 ++-- datafusion/physical-plan/src/values.rs | 12 ++-- .../src/windows/bounded_window_agg_exec.rs | 14 ++-- .../src/windows/window_agg_exec.rs | 14 ++-- datafusion/physical-plan/src/work_table.rs | 14 ++-- 47 files changed, 369 insertions(+), 370 deletions(-) diff --git a/datafusion-examples/examples/custom_datasource.rs b/datafusion-examples/examples/custom_datasource.rs index 1ce3ced0e1c4..0b7e3d4c6442 100644 --- a/datafusion-examples/examples/custom_datasource.rs +++ b/datafusion-examples/examples/custom_datasource.rs @@ -31,7 +31,7 @@ use datafusion::execution::context::{SessionState, TaskContext}; use datafusion::physical_plan::memory::MemoryStream; use datafusion::physical_plan::{ project_schema, DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, - Partitioning, PlanPropertiesCache, SendableRecordBatchStream, + Partitioning, PlanProperties, SendableRecordBatchStream, }; use datafusion::prelude::*; use datafusion_expr::{Expr, LogicalPlanBuilder}; @@ -190,7 +190,7 @@ impl TableProvider for CustomDataSource { struct CustomExec { db: CustomDataSource, projected_schema: SchemaRef, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl 
CustomExec { @@ -200,7 +200,7 @@ impl CustomExec { db: CustomDataSource, ) -> Self { let projected_schema = project_schema(&schema, projections).unwrap(); - let cache = Self::create_cache(projected_schema.clone()); + let cache = Self::compute_properties(projected_schema.clone()); Self { db, projected_schema, @@ -209,9 +209,9 @@ impl CustomExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + fn compute_properties(schema: SchemaRef) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Partitioning::UnknownPartitioning(1), ExecutionMode::Bounded, @@ -230,7 +230,7 @@ impl ExecutionPlan for CustomExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index 8eebc2b68f8b..8f010f1dcbf8 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -37,7 +37,7 @@ use datafusion_common::config::ConfigOptions; use datafusion_common::Statistics; use datafusion_execution::TaskContext; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering}; -use datafusion_physical_plan::{ExecutionMode, PlanPropertiesCache}; +use datafusion_physical_plan::{ExecutionMode, PlanProperties}; use futures::StreamExt; use itertools::Itertools; @@ -53,7 +53,7 @@ pub struct ArrowExec { projected_output_ordering: Vec, /// Execution metrics metrics: ExecutionPlanMetricsSet, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl ArrowExec { @@ -61,7 +61,7 @@ impl ArrowExec { pub fn new(base_config: FileScanConfig) -> Self { let (projected_schema, projected_statistics, projected_output_ordering) = base_config.project(); - let cache = Self::create_cache( + let cache = Self::compute_properties( projected_schema.clone(), &projected_output_ordering, &base_config, @@ -85,16 +85,16 @@ impl ArrowExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
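In isolation, the pattern these hunks converge on (compute `PlanProperties` once in the constructor, then hand out a reference from `properties()`) looks like the following sketch. `MyScanExec` is invented for illustration, and the import paths assume the crate layout at this commit:

```rust
use std::sync::Arc;

use arrow_schema::SchemaRef;
use datafusion_physical_expr::EquivalenceProperties;
use datafusion_physical_plan::{ExecutionMode, Partitioning, PlanProperties};

/// Hypothetical leaf operator illustrating the renamed API.
struct MyScanExec {
    schema: SchemaRef,
    cache: PlanProperties,
}

impl MyScanExec {
    fn new(schema: SchemaRef) -> Self {
        // Compute the properties once, at construction time; `properties()`
        // then returns a reference instead of recomputing on every call.
        let cache = PlanProperties::new(
            EquivalenceProperties::new(Arc::clone(&schema)), // no known orderings
            Partitioning::UnknownPartitioning(1),            // single output partition
            ExecutionMode::Bounded,                          // finite input
        );
        Self { schema, cache }
    }

    fn properties(&self) -> &PlanProperties {
        &self.cache
    }
}
```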
- fn create_cache( + fn compute_properties( schema: SchemaRef, projected_output_ordering: &[LexOrdering], file_scan_config: &FileScanConfig, - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Equivalence Properties let eq_properties = EquivalenceProperties::new_with_orderings(schema, projected_output_ordering); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Self::output_partitioning_helper(file_scan_config), // Output Partitioning ExecutionMode::Bounded, // Execution Mode @@ -126,7 +126,7 @@ impl ExecutionPlan for ArrowExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index 9d65a0ce089a..2b913d862576 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -25,7 +25,7 @@ use crate::error::Result; use crate::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; use crate::physical_plan::{ DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, Partitioning, - PlanPropertiesCache, SendableRecordBatchStream, Statistics, + PlanProperties, SendableRecordBatchStream, Statistics, }; use arrow::datatypes::SchemaRef; @@ -42,7 +42,7 @@ pub struct AvroExec { projected_output_ordering: Vec, /// Execution metrics metrics: ExecutionPlanMetricsSet, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl AvroExec { @@ -50,7 +50,7 @@ impl AvroExec { pub fn new(base_config: FileScanConfig) -> Self { let (projected_schema, projected_statistics, projected_output_ordering) = base_config.project(); - let cache = Self::create_cache( + let cache = Self::compute_properties( projected_schema.clone(), &projected_output_ordering, &base_config, @@ -70,16 +70,16 @@ impl AvroExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache( + fn compute_properties( schema: SchemaRef, orderings: &[LexOrdering], file_scan_config: &FileScanConfig, - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Equivalence Properties let eq_properties = EquivalenceProperties::new_with_orderings(schema, orderings); let n_partitions = file_scan_config.file_groups.len(); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Partitioning::UnknownPartitioning(n_partitions), // Output Partitioning ExecutionMode::Bounded, // Execution Mode @@ -103,7 +103,7 @@ impl ExecutionPlan for AvroExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index 964f40b8e002..a509121a82c8 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -33,7 +33,7 @@ use crate::error::{DataFusionError, Result}; use crate::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; use crate::physical_plan::{ DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, Partitioning, - PlanPropertiesCache, SendableRecordBatchStream, Statistics, + PlanProperties, SendableRecordBatchStream, Statistics, }; use arrow::csv; @@ -61,7 +61,7 @@ pub struct CsvExec { metrics: ExecutionPlanMetricsSet, /// Compression type of the file associated with CsvExec pub file_compression_type: FileCompressionType, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl CsvExec { @@ -76,7 +76,7 @@ impl CsvExec { ) -> Self { let (projected_schema, projected_statistics, projected_output_ordering) = base_config.project(); - let cache = Self::create_cache( + let cache = Self::compute_properties( projected_schema, &projected_output_ordering, &base_config, @@ -122,15 +122,15 @@ impl CsvExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache( + fn compute_properties( schema: SchemaRef, orderings: &[LexOrdering], file_scan_config: &FileScanConfig, - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Equivalence Properties let eq_properties = EquivalenceProperties::new_with_orderings(schema, orderings); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Self::output_partitioning_helper(file_scan_config), // Output Partitioning ExecutionMode::Bounded, // Execution Mode @@ -164,7 +164,7 @@ impl ExecutionPlan for CsvExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index b27bcdaa917c..7b0e84c4410b 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -33,7 +33,7 @@ use crate::error::{DataFusionError, Result}; use crate::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; use crate::physical_plan::{ DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, Partitioning, - PlanPropertiesCache, SendableRecordBatchStream, Statistics, + PlanProperties, SendableRecordBatchStream, Statistics, }; use arrow::json::ReaderBuilder; @@ -55,7 +55,7 @@ pub struct NdJsonExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, file_compression_type: FileCompressionType, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl NdJsonExec { @@ -66,7 +66,7 @@ impl NdJsonExec { ) -> Self { let (projected_schema, projected_statistics, projected_output_ordering) = base_config.project(); - let cache = Self::create_cache( + let cache = Self::compute_properties( projected_schema, &projected_output_ordering, &base_config, @@ -90,15 +90,15 @@ impl NdJsonExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache( + fn compute_properties( schema: SchemaRef, orderings: &[LexOrdering], file_scan_config: &FileScanConfig, - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Equivalence Properties let eq_properties = EquivalenceProperties::new_with_orderings(schema, orderings); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Self::output_partitioning_helper(file_scan_config), // Output Partitioning ExecutionMode::Bounded, // Execution Mode @@ -129,7 +129,7 @@ impl ExecutionPlan for NdJsonExec { fn as_any(&self) -> &dyn Any { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs index e2ad1980d422..300ced3a7657 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs @@ -38,8 +38,8 @@ use crate::{ physical_optimizer::pruning::PruningPredicate, physical_plan::{ metrics::{ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}, - DisplayFormatType, ExecutionMode, ExecutionPlan, Partitioning, - PlanPropertiesCache, SendableRecordBatchStream, Statistics, + DisplayFormatType, ExecutionMode, ExecutionPlan, Partitioning, PlanProperties, + SendableRecordBatchStream, Statistics, }, }; @@ -100,7 +100,7 @@ pub struct ParquetExec { metadata_size_hint: Option, /// Optional user defined parquet file reader factory parquet_file_reader_factory: Option>, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl ParquetExec { @@ -148,7 +148,7 @@ impl ParquetExec { let (projected_schema, projected_statistics, projected_output_ordering) = base_config.project(); - let cache = Self::create_cache( + let cache = Self::compute_properties( projected_schema, &projected_output_ordering, &base_config, @@ -267,15 +267,15 @@ impl ParquetExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache( + fn compute_properties( schema: SchemaRef, orderings: &[LexOrdering], file_config: &FileScanConfig, - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Equivalence Properties let eq_properties = EquivalenceProperties::new_with_orderings(schema, orderings); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Self::output_partitioning_helper(file_config), // Output Partitioning ExecutionMode::Bounded, // Execution Mode @@ -335,7 +335,7 @@ impl ExecutionPlan for ParquetExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index 86a490278b0e..c7ffc7838b36 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -1331,7 +1331,7 @@ pub(crate) mod tests { expressions, expressions::binary, expressions::lit, expressions::Column, LexOrdering, PhysicalExpr, PhysicalSortExpr, PhysicalSortRequirement, }; - use datafusion_physical_plan::PlanPropertiesCache; + use datafusion_physical_plan::PlanProperties; /// Models operators like BoundedWindowExec that require an input /// ordering but is easy to construct @@ -1339,7 +1339,7 @@ pub(crate) mod tests { struct SortRequiredExec { input: Arc, expr: LexOrdering, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl SortRequiredExec { @@ -1352,7 +1352,7 @@ pub(crate) mod tests { input: Arc, requirement: Vec, ) -> Self { - let cache = Self::create_cache(&input); + let cache = Self::compute_properties(&input); Self { input, expr: requirement, @@ -1361,8 +1361,8 @@ pub(crate) mod tests { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache(input: &Arc) -> PlanPropertiesCache { - PlanPropertiesCache::new( + fn compute_properties(input: &Arc) -> PlanProperties { + PlanProperties::new( input.equivalence_properties().clone(), // Equivalence Properties input.output_partitioning().clone(), // Output Partitioning input.execution_mode(), // Execution Mode @@ -1389,7 +1389,7 @@ pub(crate) mod tests { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/core/src/physical_optimizer/output_requirements.rs b/datafusion/core/src/physical_optimizer/output_requirements.rs index 129ae538808f..992a6e7f82c0 100644 --- a/datafusion/core/src/physical_optimizer/output_requirements.rs +++ b/datafusion/core/src/physical_optimizer/output_requirements.rs @@ -33,7 +33,7 @@ use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{Result, Statistics}; use datafusion_physical_expr::{Distribution, LexRequirement, PhysicalSortRequirement}; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; -use datafusion_physical_plan::PlanPropertiesCache; +use datafusion_physical_plan::PlanProperties; /// This rule either adds or removes [`OutputRequirements`]s to/from the physical /// plan according to its `mode` attribute, which is set by the constructors @@ -90,7 +90,7 @@ pub(crate) struct OutputRequirementExec { input: Arc, order_requirement: Option, dist_requirement: Distribution, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl OutputRequirementExec { @@ -99,7 +99,7 @@ impl OutputRequirementExec { requirements: Option, dist_requirement: Distribution, ) -> Self { - let cache = Self::create_cache(&input); + let cache = Self::compute_properties(&input); Self { input, order_requirement: requirements, @@ -113,8 +113,8 @@ impl OutputRequirementExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache(input: &Arc) -> PlanPropertiesCache { - PlanPropertiesCache::new( + fn compute_properties(input: &Arc) -> PlanProperties { + PlanProperties::new( input.equivalence_properties().clone(), // Equivalence Properties input.output_partitioning().clone(), // Output Partitioning input.execution_mode(), // Execution Mode @@ -137,7 +137,7 @@ impl ExecutionPlan for OutputRequirementExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 8049c3940a1a..7e858953ac8f 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -1995,7 +1995,7 @@ mod tests { use crate::datasource::MemTable; use crate::physical_plan::{ expressions, DisplayAs, DisplayFormatType, ExecutionMode, Partitioning, - PlanPropertiesCache, SendableRecordBatchStream, + PlanProperties, SendableRecordBatchStream, }; use crate::physical_planner::PhysicalPlanner; use crate::prelude::{SessionConfig, SessionContext}; @@ -2575,19 +2575,19 @@ mod tests { #[derive(Debug)] struct NoOpExecutionPlan { - cache: PlanPropertiesCache, + cache: PlanProperties, } impl NoOpExecutionPlan { fn new(schema: SchemaRef) -> Self { - let cache = Self::create_cache(schema.clone()); + let cache = Self::compute_properties(schema.clone()); Self { cache } } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + fn compute_properties(schema: SchemaRef) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, // Output Partitioning Partitioning::UnknownPartitioning(1), @@ -2613,7 +2613,7 @@ mod tests { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index e5d8f6ebda32..0042554f6c73 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -45,7 +45,7 @@ use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_expr::{EquivalenceProperties, Partitioning, PhysicalSortExpr}; use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec}; use datafusion_physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionMode, PlanPropertiesCache, + DisplayAs, DisplayFormatType, ExecutionMode, PlanProperties, }; #[cfg(feature = "compression")] @@ -367,7 +367,7 @@ pub fn csv_exec_ordered( pub struct StatisticsExec { stats: Statistics, schema: Arc, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl StatisticsExec { @@ -376,7 +376,7 @@ impl StatisticsExec { stats.column_statistics.len(), schema.fields().len(), "if defined, the column statistics vector length should be the number of fields" ); - let cache = Self::create_cache(Arc::new(schema.clone())); + let cache = Self::compute_properties(Arc::new(schema.clone())); Self { stats, schema: Arc::new(schema), @@ -385,9 +385,9 @@ impl StatisticsExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + fn compute_properties(schema: SchemaRef) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, // Output Partitioning Partitioning::UnknownPartitioning(2), @@ -421,7 +421,7 @@ impl ExecutionPlan for StatisticsExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/core/src/test_util/mod.rs b/datafusion/core/src/test_util/mod.rs index 55a30b07d893..3244ad49d1c6 100644 --- a/datafusion/core/src/test_util/mod.rs +++ b/datafusion/core/src/test_util/mod.rs @@ -38,7 +38,7 @@ use crate::execution::context::{SessionState, TaskContext}; use crate::logical_expr::{LogicalPlanBuilder, UNNAMED_TABLE}; use crate::physical_plan::{ DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, Partitioning, - PlanPropertiesCache, RecordBatchStream, SendableRecordBatchStream, + PlanProperties, RecordBatchStream, SendableRecordBatchStream, }; use crate::prelude::{CsvReadOptions, SessionContext}; @@ -227,7 +227,7 @@ impl TableProvider for TestTableProvider { pub struct UnboundedExec { batch_produce: Option, batch: RecordBatch, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl UnboundedExec { /// Create new exec that clones the given record batch to its output. @@ -238,7 +238,7 @@ impl UnboundedExec { batch: RecordBatch, partitions: usize, ) -> Self { - let cache = Self::create_cache(batch.schema(), batch_produce, partitions); + let cache = Self::compute_properties(batch.schema(), batch_produce, partitions); Self { batch_produce, batch, @@ -247,18 +247,18 @@ impl UnboundedExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache( + fn compute_properties( schema: SchemaRef, batch_produce: Option, n_partitions: usize, - ) -> PlanPropertiesCache { + ) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); let mode = if batch_produce.is_none() { ExecutionMode::Unbounded } else { ExecutionMode::Bounded }; - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Partitioning::UnknownPartitioning(n_partitions), mode, @@ -289,7 +289,7 @@ impl ExecutionPlan for UnboundedExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/core/tests/custom_sources.rs b/datafusion/core/tests/custom_sources.rs index f62a3f723ad7..aa3f35e29541 100644 --- a/datafusion/core/tests/custom_sources.rs +++ b/datafusion/core/tests/custom_sources.rs @@ -40,7 +40,7 @@ use datafusion_common::project_schema; use datafusion_common::stats::Precision; use datafusion_physical_expr::EquivalenceProperties; use datafusion_physical_plan::placeholder_row::PlaceholderRowExec; -use datafusion_physical_plan::{ExecutionMode, PlanPropertiesCache}; +use datafusion_physical_plan::{ExecutionMode, PlanProperties}; use async_trait::async_trait; use futures::stream::Stream; @@ -74,7 +74,7 @@ struct CustomTableProvider; #[derive(Debug, Clone)] struct CustomExecutionPlan { projection: Option>, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl CustomExecutionPlan { @@ -82,14 +82,14 @@ impl CustomExecutionPlan { let schema = TEST_CUSTOM_SCHEMA_REF!(); let schema = project_schema(&schema, projection.as_ref()).expect("projected schema"); - let cache = Self::create_cache(schema); + let cache = Self::compute_properties(schema); Self { projection, cache } } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + fn compute_properties(schema: SchemaRef) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, // Output Partitioning Partitioning::UnknownPartitioning(1), @@ -144,7 +144,7 @@ impl ExecutionPlan for CustomExecutionPlan { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs index dec2deb10cbb..9f6c44d4603f 100644 --- a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs +++ b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs @@ -28,7 +28,7 @@ use datafusion::logical_expr::{Expr, TableProviderFilterPushDown}; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::{ DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, Partitioning, - PlanPropertiesCache, SendableRecordBatchStream, Statistics, + PlanProperties, SendableRecordBatchStream, Statistics, }; use datafusion::prelude::*; use datafusion::scalar::ScalarValue; @@ -58,19 +58,19 @@ fn create_batch(value: i32, num_rows: usize) -> Result { #[derive(Debug)] struct CustomPlan { batches: Vec, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl CustomPlan { fn new(schema: SchemaRef, batches: Vec) -> Self { - let cache = Self::create_cache(schema); + let cache = Self::compute_properties(schema); Self { batches, cache } } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
-    fn create_cache(schema: SchemaRef) -> PlanPropertiesCache {
+    fn compute_properties(schema: SchemaRef) -> PlanProperties {
         let eq_properties = EquivalenceProperties::new(schema);
-        PlanPropertiesCache::new(
+        PlanProperties::new(
             eq_properties,
             Partitioning::UnknownPartitioning(1),
             ExecutionMode::Bounded,
@@ -97,7 +97,7 @@ impl ExecutionPlan for CustomPlan {
         self
     }

-    fn cache(&self) -> &PlanPropertiesCache {
+    fn properties(&self) -> &PlanProperties {
         &self.cache
     }

diff --git a/datafusion/core/tests/custom_sources_cases/statistics.rs b/datafusion/core/tests/custom_sources_cases/statistics.rs
index e98781aae9bf..85ac47dc97fc 100644
--- a/datafusion/core/tests/custom_sources_cases/statistics.rs
+++ b/datafusion/core/tests/custom_sources_cases/statistics.rs
@@ -27,7 +27,7 @@ use datafusion::{
     logical_expr::Expr,
     physical_plan::{
         ColumnStatistics, DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan,
-        Partitioning, PlanPropertiesCache, SendableRecordBatchStream, Statistics,
+        Partitioning, PlanProperties, SendableRecordBatchStream, Statistics,
     },
     prelude::SessionContext,
     scalar::ScalarValue,
@@ -43,7 +43,7 @@ use async_trait::async_trait;
 struct StatisticsValidation {
     stats: Statistics,
     schema: Arc<Schema>,
-    cache: PlanPropertiesCache,
+    cache: PlanProperties,
 }

 impl StatisticsValidation {
@@ -53,7 +53,7 @@
             schema.fields().len(),
             "the column statistics vector length should be the number of fields"
         );
-        let cache = Self::create_cache(schema.clone());
+        let cache = Self::compute_properties(schema.clone());
         Self {
             stats,
             schema,
@@ -62,10 +62,10 @@
     }

     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
-    fn create_cache(schema: SchemaRef) -> PlanPropertiesCache {
+    fn compute_properties(schema: SchemaRef) -> PlanProperties {
         let eq_properties = EquivalenceProperties::new(schema);

-        PlanPropertiesCache::new(
+        PlanProperties::new(
             eq_properties,
             Partitioning::UnknownPartitioning(2),
             ExecutionMode::Bounded,
@@ -149,7 +149,7 @@ impl ExecutionPlan for StatisticsValidation {
         self
     }

-    fn cache(&self) -> &PlanPropertiesCache {
+    fn properties(&self) -> &PlanProperties {
         &self.cache
     }

diff --git a/datafusion/core/tests/user_defined/user_defined_plan.rs b/datafusion/core/tests/user_defined/user_defined_plan.rs
index 5f01334a4757..2c12e108bb47 100644
--- a/datafusion/core/tests/user_defined/user_defined_plan.rs
+++ b/datafusion/core/tests/user_defined/user_defined_plan.rs
@@ -84,7 +84,7 @@ use datafusion::{
     physical_expr::EquivalenceProperties,
     physical_plan::{
         DisplayAs, DisplayFormatType, Distribution, ExecutionMode, ExecutionPlan,
-        Partitioning, PlanPropertiesCache, RecordBatchStream, SendableRecordBatchStream,
+        Partitioning, PlanProperties, RecordBatchStream, SendableRecordBatchStream,
         Statistics,
     },
     physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner},
@@ -412,20 +412,20 @@ struct TopKExec {
     input: Arc<dyn ExecutionPlan>,
     /// The maximum number of values
     k: usize,
-    cache: PlanPropertiesCache,
+    cache: PlanProperties,
 }

 impl TopKExec {
     fn new(input: Arc<dyn ExecutionPlan>, k: usize) -> Self {
-        let cache = Self::create_cache(input.schema());
+        let cache = Self::compute_properties(input.schema());
         Self { input, k, cache }
     }

     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
- fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + fn compute_properties(schema: SchemaRef) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Partitioning::UnknownPartitioning(1), ExecutionMode::Bounded, @@ -460,7 +460,7 @@ impl ExecutionPlan for TopKExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 35e42b8a4d36..98d41cca6764 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -20,7 +20,7 @@ use std::any::Any; use std::sync::Arc; -use super::{DisplayAs, ExecutionMode, PlanPropertiesCache}; +use super::{DisplayAs, ExecutionMode, PlanProperties}; use crate::aggregates::{ no_grouping::AggregateStream, row_hash::GroupedHashAggregateStream, topk_stream::GroupedTopKAggregateStream, @@ -265,7 +265,7 @@ pub struct AggregateExec { required_input_ordering: Option, /// Describes how the input is ordered relative to the group by columns input_order_mode: InputOrderMode, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl AggregateExec { @@ -362,7 +362,7 @@ impl AggregateExec { let required_input_ordering = (!new_requirement.is_empty()).then_some(new_requirement); - let cache = Self::create_cache( + let cache = Self::compute_properties( &input, schema.clone(), &projection_mapping, @@ -507,13 +507,13 @@ impl AggregateExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache( + fn compute_properties( input: &Arc, schema: SchemaRef, projection_mapping: &ProjectionMapping, mode: &AggregateMode, input_order_mode: &InputOrderMode, - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Construct equivalence properties: let eq_properties = input .equivalence_properties() @@ -550,7 +550,7 @@ impl AggregateExec { exec_mode = ExecutionMode::PipelineBreaking; } - PlanPropertiesCache::new(eq_properties, output_partitioning, exec_mode) + PlanProperties::new(eq_properties, output_partitioning, exec_mode) } pub fn input_order_mode(&self) -> &InputOrderMode { @@ -641,7 +641,7 @@ impl ExecutionPlan for AggregateExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } @@ -1620,20 +1620,20 @@ mod tests { struct TestYieldingExec { /// True if this exec should yield back to runtime the first time it is polled pub yield_first: bool, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl TestYieldingExec { fn new(yield_first: bool) -> Self { let schema = some_data().0; - let cache = Self::create_cache(schema); + let cache = Self::compute_properties(schema); Self { yield_first, cache } } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
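Condensed out of the `AggregateExec` hunk above: streaming aggregation of an unbounded input is only possible when the input already arrives ordered on the group-by keys; otherwise the operator would buffer forever before emitting anything. A hypothetical restatement (`aggregate_mode` and its boolean flag are invented here; the real code keys off `InputOrderMode`):

```rust
use datafusion_physical_plan::ExecutionMode;

/// Sketch of the execution-mode rule for aggregates: an unbounded input that
/// is not ordered on the grouping keys turns the aggregate into a pipeline
/// breaker, since nothing can be emitted until the input ends.
fn aggregate_mode(input_mode: ExecutionMode, ordered_on_group_keys: bool) -> ExecutionMode {
    if input_mode.is_unbounded() && !ordered_on_group_keys {
        ExecutionMode::PipelineBreaking
    } else {
        input_mode
    }
}
```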
- fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + fn compute_properties(schema: SchemaRef) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, // Output Partitioning Partitioning::UnknownPartitioning(1), @@ -1662,7 +1662,7 @@ mod tests { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/analyze.rs b/datafusion/physical-plan/src/analyze.rs index 94dd6ff3bd28..fed4b97d2afb 100644 --- a/datafusion/physical-plan/src/analyze.rs +++ b/datafusion/physical-plan/src/analyze.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use std::{any::Any, time::Instant}; use super::stream::{RecordBatchReceiverStream, RecordBatchStreamAdapter}; -use super::{DisplayAs, Distribution, PlanPropertiesCache, SendableRecordBatchStream}; +use super::{DisplayAs, Distribution, PlanProperties, SendableRecordBatchStream}; use crate::display::DisplayableExecutionPlan; use crate::{DisplayFormatType, ExecutionPlan, Partitioning}; @@ -45,7 +45,7 @@ pub struct AnalyzeExec { pub(crate) input: Arc, /// The output schema for RecordBatches of this exec node schema: SchemaRef, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl AnalyzeExec { @@ -56,7 +56,7 @@ impl AnalyzeExec { input: Arc, schema: SchemaRef, ) -> Self { - let cache = Self::create_cache(&input, schema.clone()); + let cache = Self::compute_properties(&input, schema.clone()); AnalyzeExec { verbose, show_statistics, @@ -82,14 +82,14 @@ impl AnalyzeExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache( + fn compute_properties( input: &Arc, schema: SchemaRef, - ) -> PlanPropertiesCache { + ) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); let output_partitioning = Partitioning::UnknownPartitioning(1); let exec_mode = input.execution_mode(); - PlanPropertiesCache::new(eq_properties, output_partitioning, exec_mode) + PlanProperties::new(eq_properties, output_partitioning, exec_mode) } } @@ -113,7 +113,7 @@ impl ExecutionPlan for AnalyzeExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs index bce48698a558..055f16288f95 100644 --- a/datafusion/physical-plan/src/coalesce_batches.rs +++ b/datafusion/physical-plan/src/coalesce_batches.rs @@ -24,7 +24,7 @@ use std::sync::Arc; use std::task::{Context, Poll}; use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; -use super::{DisplayAs, PlanPropertiesCache, Statistics}; +use super::{DisplayAs, PlanProperties, Statistics}; use crate::{ DisplayFormatType, ExecutionPlan, RecordBatchStream, SendableRecordBatchStream, }; @@ -48,13 +48,13 @@ pub struct CoalesceBatchesExec { target_batch_size: usize, /// Execution metrics metrics: ExecutionPlanMetricsSet, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl CoalesceBatchesExec { /// Create a new CoalesceBatchesExec pub fn new(input: Arc, target_batch_size: usize) -> Self { - let cache = Self::create_cache(&input); + let cache = Self::compute_properties(&input); Self { input, target_batch_size, @@ -74,10 +74,10 @@ impl CoalesceBatchesExec { } /// This function creates the cache object that stores the plan properties such as schema, 
equivalence properties, ordering, partitioning, etc. - fn create_cache(input: &Arc) -> PlanPropertiesCache { + fn compute_properties(input: &Arc) -> PlanProperties { // The coalesce batches operator does not make any changes to the // partitioning of its input. - PlanPropertiesCache::new( + PlanProperties::new( input.equivalence_properties().clone(), // Equivalence Properties input.output_partitioning().clone(), // Output Partitioning input.execution_mode(), // Execution Mode @@ -109,7 +109,7 @@ impl ExecutionPlan for CoalesceBatchesExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/coalesce_partitions.rs b/datafusion/physical-plan/src/coalesce_partitions.rs index ad1094cee0e1..7037445164a3 100644 --- a/datafusion/physical-plan/src/coalesce_partitions.rs +++ b/datafusion/physical-plan/src/coalesce_partitions.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; use super::stream::{ObservedStream, RecordBatchReceiverStream}; -use super::{DisplayAs, PlanPropertiesCache, SendableRecordBatchStream, Statistics}; +use super::{DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics}; use crate::{DisplayFormatType, ExecutionPlan, Partitioning}; @@ -38,13 +38,13 @@ pub struct CoalescePartitionsExec { input: Arc, /// Execution metrics metrics: ExecutionPlanMetricsSet, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl CoalescePartitionsExec { /// Create a new CoalescePartitionsExec pub fn new(input: Arc) -> Self { - let cache = Self::create_cache(&input); + let cache = Self::compute_properties(&input); CoalescePartitionsExec { input, metrics: ExecutionPlanMetricsSet::new(), @@ -58,12 +58,12 @@ impl CoalescePartitionsExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache(input: &Arc) -> PlanPropertiesCache { + fn compute_properties(input: &Arc) -> PlanProperties { // Coalescing partitions loses existing orderings: let mut eq_properties = input.equivalence_properties().clone(); eq_properties.clear_orderings(); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, // Equivalence Properties Partitioning::UnknownPartitioning(1), // Output Partitioning input.execution_mode(), // Execution Mode @@ -91,7 +91,7 @@ impl ExecutionPlan for CoalescePartitionsExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/display.rs b/datafusion/physical-plan/src/display.rs index d4afca6a1acf..38c23331983e 100644 --- a/datafusion/physical-plan/src/display.rs +++ b/datafusion/physical-plan/src/display.rs @@ -466,7 +466,7 @@ mod tests { use std::sync::Arc; use super::DisplayableExecutionPlan; - use crate::{DisplayAs, ExecutionPlan, PlanPropertiesCache}; + use crate::{DisplayAs, ExecutionPlan, PlanProperties}; use datafusion_common::{DataFusionError, Result, Statistics}; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; @@ -493,7 +493,7 @@ mod tests { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { unimplemented!() } diff --git a/datafusion/physical-plan/src/empty.rs b/datafusion/physical-plan/src/empty.rs index 0705c4b4eca7..b6708515ec6b 100644 --- a/datafusion/physical-plan/src/empty.rs +++ b/datafusion/physical-plan/src/empty.rs @@ -21,7 +21,7 @@ use std::any::Any; use std::sync::Arc; use super::{ - common, DisplayAs, ExecutionMode, PlanPropertiesCache, SendableRecordBatchStream, + common, DisplayAs, ExecutionMode, PlanProperties, SendableRecordBatchStream, Statistics, }; use crate::{memory::MemoryStream, DisplayFormatType, ExecutionPlan, Partitioning}; @@ -41,13 +41,13 @@ pub struct EmptyExec { schema: SchemaRef, /// Number of partitions partitions: usize, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl EmptyExec { /// Create a new EmptyExec pub fn new(schema: SchemaRef) -> Self { - let cache = Self::create_cache(schema.clone(), 1); + let cache = Self::compute_properties(schema.clone(), 1); EmptyExec { schema, partitions: 1, @@ -73,10 +73,10 @@ impl EmptyExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
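The `CoalescePartitionsExec` change above makes a useful contrast with `CoalesceBatchesExec`: the batch coalescer passes its input's properties through untouched, while merging partitions must drop orderings. Restated as a standalone sketch (import paths assumed):

```rust
use std::sync::Arc;

use datafusion_physical_plan::{ExecutionPlan, Partitioning, PlanProperties};

/// Property computation for an operator that merges all input partitions
/// into one, mirroring the CoalescePartitionsExec hunk above.
fn merge_partitions_properties(input: &Arc<dyn ExecutionPlan>) -> PlanProperties {
    // Interleaving rows from several partitions destroys any per-partition
    // sort order, so existing orderings are cleared.
    let mut eq_properties = input.equivalence_properties().clone();
    eq_properties.clear_orderings();
    PlanProperties::new(
        eq_properties,
        Partitioning::UnknownPartitioning(1), // everything lands in one partition
        input.execution_mode(),               // boundedness is inherited
    )
}
```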
- fn create_cache(schema: SchemaRef, n_partitions: usize) -> PlanPropertiesCache { + fn compute_properties(schema: SchemaRef, n_partitions: usize) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); let output_partitioning = Self::output_partitioning_helper(n_partitions); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, // Output Partitioning output_partitioning, @@ -106,7 +106,7 @@ impl ExecutionPlan for EmptyExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/explain.rs b/datafusion/physical-plan/src/explain.rs index 200ba0bd07c5..f63b1df29da5 100644 --- a/datafusion/physical-plan/src/explain.rs +++ b/datafusion/physical-plan/src/explain.rs @@ -20,7 +20,7 @@ use std::any::Any; use std::sync::Arc; -use super::{DisplayAs, ExecutionMode, PlanPropertiesCache, SendableRecordBatchStream}; +use super::{DisplayAs, ExecutionMode, PlanProperties, SendableRecordBatchStream}; use crate::stream::RecordBatchStreamAdapter; use crate::{DisplayFormatType, ExecutionPlan, Partitioning}; @@ -43,7 +43,7 @@ pub struct ExplainExec { stringified_plans: Vec, /// control which plans to print verbose: bool, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl ExplainExec { @@ -53,7 +53,7 @@ impl ExplainExec { stringified_plans: Vec, verbose: bool, ) -> Self { - let cache = Self::create_cache(schema.clone()); + let cache = Self::compute_properties(schema.clone()); ExplainExec { schema, stringified_plans, @@ -73,9 +73,9 @@ impl ExplainExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + fn compute_properties(schema: SchemaRef) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Partitioning::UnknownPartitioning(1), ExecutionMode::Bounded, @@ -103,7 +103,7 @@ impl ExecutionPlan for ExplainExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 86502039c8ba..95c09f541cc2 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -24,7 +24,7 @@ use std::sync::Arc; use std::task::{Context, Poll}; use super::{ - ColumnStatistics, DisplayAs, PlanPropertiesCache, RecordBatchStream, + ColumnStatistics, DisplayAs, PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, }; use crate::{ @@ -62,7 +62,7 @@ pub struct FilterExec { metrics: ExecutionPlanMetricsSet, /// Selectivity for statistics. 0 = no rows, 100 all rows default_selectivity: u8, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl FilterExec { @@ -74,7 +74,8 @@ impl FilterExec { match predicate.data_type(input.schema().as_ref())? { DataType::Boolean => { let default_selectivity = 20; - let cache = Self::create_cache(&input, &predicate, default_selectivity)?; + let cache = + Self::compute_properties(&input, &predicate, default_selectivity)?; Ok(Self { predicate, input: input.clone(), @@ -159,11 +160,11 @@ impl FilterExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache( + fn compute_properties( input: &Arc, predicate: &Arc, default_selectivity: u8, - ) -> Result { + ) -> Result { // Combine the equal predicates with the input equivalence properties // to construct the equivalence properties: let stats = Self::statistics_helper(input, predicate, default_selectivity)?; @@ -182,7 +183,7 @@ impl FilterExec { .map(|column| Arc::new(column) as _); eq_properties = eq_properties.add_constants(constants); - Ok(PlanPropertiesCache::new( + Ok(PlanProperties::new( eq_properties, input.output_partitioning().clone(), // Output Partitioning input.execution_mode(), // Execution Mode @@ -210,7 +211,7 @@ impl ExecutionPlan for FilterExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/insert.rs b/datafusion/physical-plan/src/insert.rs index 472c65f25b30..fd0bec108e03 100644 --- a/datafusion/physical-plan/src/insert.rs +++ b/datafusion/physical-plan/src/insert.rs @@ -23,7 +23,7 @@ use std::fmt::Debug; use std::sync::Arc; use super::{ - DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanPropertiesCache, + DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, SendableRecordBatchStream, }; use crate::metrics::MetricsSet; @@ -88,7 +88,7 @@ pub struct FileSinkExec { count_schema: SchemaRef, /// Optional required sort order for output data. sort_order: Option>, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl fmt::Debug for FileSinkExec { @@ -180,9 +180,9 @@ impl FileSinkExec { fn create_schema( input: &Arc, schema: SchemaRef, - ) -> PlanPropertiesCache { + ) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Partitioning::UnknownPartitioning(1), input.execution_mode(), @@ -211,7 +211,7 @@ impl ExecutionPlan for FileSinkExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs index 3f6b6ba5e2b6..f73f3d36cdb4 100644 --- a/datafusion/physical-plan/src/joins/cross_join.rs +++ b/datafusion/physical-plan/src/joins/cross_join.rs @@ -27,8 +27,8 @@ use crate::coalesce_batches::concat_batches; use crate::coalesce_partitions::CoalescePartitionsExec; use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet}; use crate::{ - exec_mode_flatten, ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, - ExecutionMode, ExecutionPlan, PlanPropertiesCache, RecordBatchStream, + execution_mode_from_children, ColumnStatistics, DisplayAs, DisplayFormatType, + Distribution, ExecutionMode, ExecutionPlan, PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, }; @@ -61,7 +61,7 @@ pub struct CrossJoinExec { left_fut: OnceAsync, /// Execution plan metrics metrics: ExecutionPlanMetricsSet, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl CrossJoinExec { @@ -77,7 +77,7 @@ impl CrossJoinExec { }; let schema = Arc::new(Schema::new(all_columns)); - let cache = Self::create_cache(&left, &right, schema.clone()); + let cache = Self::compute_properties(&left, &right, schema.clone()); CrossJoinExec { left, right, @@ -99,11 +99,11 @@ impl CrossJoinExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
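The `FilterExec` version above is the one place where `compute_properties` is fallible, because it derives extra information from the predicate: a column pinned to a single value (for example `a = 5`) is a constant in the filter's output. A minimal sketch of that step, with `pinned_columns` standing in for the columns the real code extracts through its statistics machinery:

```rust
use std::sync::Arc;

use datafusion_physical_expr::{expressions::Column, EquivalenceProperties, PhysicalExpr};

/// Register columns that the predicate pins to a single value as constants,
/// so downstream operators can, e.g., drop them from required sort orders.
fn with_filter_constants(
    eq_properties: EquivalenceProperties,
    pinned_columns: Vec<Column>,
) -> EquivalenceProperties {
    let constants = pinned_columns
        .into_iter()
        .map(|column| Arc::new(column) as Arc<dyn PhysicalExpr>);
    eq_properties.add_constants(constants)
}
```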
- fn create_cache( + fn compute_properties( left: &Arc, right: &Arc, schema: SchemaRef, - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Calculate equivalence properties // TODO: Check equivalence properties of cross join, it may preserve // ordering in some cases. @@ -126,13 +126,13 @@ impl CrossJoinExec { ); // Determine the execution mode: - let mut mode = exec_mode_flatten([left, right]); + let mut mode = execution_mode_from_children([left, right]); if mode.is_unbounded() { // If any of the inputs is unbounded, cross join breaks the pipeline. mode = ExecutionMode::PipelineBreaking; } - PlanPropertiesCache::new(eq_properties, output_partitioning, mode) + PlanProperties::new(eq_properties, output_partitioning, mode) } } @@ -197,7 +197,7 @@ impl ExecutionPlan for CrossJoinExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs index 4b010e8c60c6..2fdb2a17ebe8 100644 --- a/datafusion/physical-plan/src/joins/hash_join.rs +++ b/datafusion/physical-plan/src/joins/hash_join.rs @@ -29,7 +29,7 @@ use super::{ }; use crate::{ coalesce_partitions::CoalescePartitionsExec, - exec_mode_flatten, handle_state, + execution_mode_from_children, handle_state, hash_utils::create_hashes, joins::utils::{ adjust_indices_by_join_type, adjust_right_output_partitioning, @@ -41,7 +41,7 @@ use crate::{ }, metrics::{ExecutionPlanMetricsSet, MetricsSet}, DisplayAs, DisplayFormatType, Distribution, ExecutionMode, ExecutionPlan, - Partitioning, PlanPropertiesCache, RecordBatchStream, SendableRecordBatchStream, + Partitioning, PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, }; @@ -297,7 +297,7 @@ pub struct HashJoinExec { /// matched and thus will not appear in the output. pub null_equals_null: bool, /// Cache holding plan properties like equivalences, output partitioning etc. - cache: PlanPropertiesCache, + cache: PlanProperties, } impl HashJoinExec { @@ -327,7 +327,7 @@ impl HashJoinExec { let random_state = RandomState::with_seeds(0, 0, 0, 0); - let cache = Self::create_cache( + let cache = Self::compute_properties( &left, &right, Arc::new(schema.clone()), @@ -406,14 +406,14 @@ impl HashJoinExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache( + fn compute_properties( left: &Arc, right: &Arc, schema: SchemaRef, join_type: JoinType, on: JoinOnRef, mode: PartitionMode, - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Calculate equivalence properties: let eq_properties = join_equivalence_properties( left.equivalence_properties().clone(), @@ -470,10 +470,10 @@ impl HashJoinExec { let mode = if pipeline_breaking { ExecutionMode::PipelineBreaking } else { - exec_mode_flatten([left, right]) + execution_mode_from_children([left, right]) }; - PlanPropertiesCache::new(eq_properties, output_partitioning, mode) + PlanProperties::new(eq_properties, output_partitioning, mode) } } @@ -506,7 +506,7 @@ impl ExecutionPlan for HashJoinExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index bbfc4c12f548..5d2175d4a820 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -34,8 +34,9 @@ use crate::joins::utils::{ }; use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet}; use crate::{ - exec_mode_flatten, DisplayAs, DisplayFormatType, Distribution, ExecutionMode, - ExecutionPlan, PlanPropertiesCache, RecordBatchStream, SendableRecordBatchStream, + execution_mode_from_children, DisplayAs, DisplayFormatType, Distribution, + ExecutionMode, ExecutionPlan, PlanProperties, RecordBatchStream, + SendableRecordBatchStream, }; use arrow::array::{ @@ -93,7 +94,7 @@ pub struct NestedLoopJoinExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, /// Cache holding plan properties like equivalences, output partitioning etc. - cache: PlanPropertiesCache, + cache: PlanProperties, } impl NestedLoopJoinExec { @@ -110,7 +111,7 @@ impl NestedLoopJoinExec { let (schema, column_indices) = build_join_schema(&left_schema, &right_schema, join_type); let schema = Arc::new(schema); - let cache = Self::create_cache(&left, &right, schema.clone(), *join_type); + let cache = Self::compute_properties(&left, &right, schema.clone(), *join_type); Ok(NestedLoopJoinExec { left, right, @@ -145,12 +146,12 @@ impl NestedLoopJoinExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache( + fn compute_properties( left: &Arc, right: &Arc, schema: SchemaRef, join_type: JoinType, - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Calculate equivalence properties: let eq_properties = join_equivalence_properties( left.equivalence_properties().clone(), @@ -176,12 +177,12 @@ impl NestedLoopJoinExec { }; // Determine execution mode: - let mut mode = exec_mode_flatten([left, right]); + let mut mode = execution_mode_from_children([left, right]); if mode.is_unbounded() { mode = ExecutionMode::PipelineBreaking; } - PlanPropertiesCache::new(eq_properties, output_partitioning, mode) + PlanProperties::new(eq_properties, output_partitioning, mode) } } @@ -208,7 +209,7 @@ impl ExecutionPlan for NestedLoopJoinExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs index 20bae468cc4a..bde831b731ba 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs @@ -37,8 +37,8 @@ use crate::joins::utils::{ }; use crate::metrics::{ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}; use crate::{ - exec_mode_flatten, metrics, DisplayAs, DisplayFormatType, Distribution, - ExecutionPlan, PhysicalExpr, PlanPropertiesCache, RecordBatchStream, + execution_mode_from_children, metrics, DisplayAs, DisplayFormatType, Distribution, + ExecutionPlan, PhysicalExpr, PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, }; @@ -84,7 +84,7 @@ pub struct SortMergeJoinExec { /// If null_equals_null is true, null == null else null != null pub null_equals_null: bool, /// Cache holding plan properties like equivalences, output partitioning etc. - cache: PlanPropertiesCache, + cache: PlanProperties, } impl SortMergeJoinExec { @@ -137,7 +137,8 @@ impl SortMergeJoinExec { let schema = Arc::new(build_join_schema(&left_schema, &right_schema, &join_type).0); - let cache = Self::create_cache(&left, &right, schema.clone(), join_type, &on); + let cache = + Self::compute_properties(&left, &right, schema.clone(), join_type, &on); Ok(Self { left, right, @@ -201,13 +202,13 @@ impl SortMergeJoinExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache( + fn compute_properties( left: &Arc, right: &Arc, schema: SchemaRef, join_type: JoinType, join_on: JoinOnRef, - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Calculate equivalence properties: let eq_properties = join_equivalence_properties( left.equivalence_properties().clone(), @@ -229,9 +230,9 @@ impl SortMergeJoinExec { ); // Determine execution mode: - let mode = exec_mode_flatten([left, right]); + let mode = execution_mode_from_children([left, right]); - PlanPropertiesCache::new(eq_properties, output_partitioning, mode) + PlanProperties::new(eq_properties, output_partitioning, mode) } } @@ -265,7 +266,7 @@ impl ExecutionPlan for SortMergeJoinExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs index 3eff026a176f..77871a8b5483 100644 --- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs +++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs @@ -46,11 +46,11 @@ use crate::joins::utils::{ JoinHashMapType, JoinOn, JoinOnRef, StatefulStreamResult, }; use crate::{ - exec_mode_flatten, + execution_mode_from_children, expressions::PhysicalSortExpr, joins::StreamJoinPartitionMode, metrics::{ExecutionPlanMetricsSet, MetricsSet}, - DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, PlanPropertiesCache, + DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, }; @@ -192,7 +192,7 @@ pub struct SymmetricHashJoinExec { /// Partition Mode mode: StreamJoinPartitionMode, /// Cache holding plan properties like equivalences, output partitioning etc. - cache: PlanPropertiesCache, + cache: PlanProperties, } impl SymmetricHashJoinExec { @@ -234,7 +234,8 @@ impl SymmetricHashJoinExec { // Initialize the random state for the join operation: let random_state = RandomState::with_seeds(0, 0, 0, 0); let schema = Arc::new(schema); - let cache = Self::create_cache(&left, &right, schema.clone(), *join_type, &on); + let cache = + Self::compute_properties(&left, &right, schema.clone(), *join_type, &on); Ok(SymmetricHashJoinExec { left, right, @@ -253,13 +254,13 @@ impl SymmetricHashJoinExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache( + fn compute_properties( left: &Arc, right: &Arc, schema: SchemaRef, join_type: JoinType, join_on: JoinOnRef, - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Calculate equivalence properties: let eq_properties = join_equivalence_properties( left.equivalence_properties().clone(), @@ -282,9 +283,9 @@ impl SymmetricHashJoinExec { ); // Determine execution mode: - let mode = exec_mode_flatten([left, right]); + let mode = execution_mode_from_children([left, right]); - PlanPropertiesCache::new(eq_properties, output_partitioning, mode) + PlanProperties::new(eq_properties, output_partitioning, mode) } /// left stream @@ -390,7 +391,7 @@ impl ExecutionPlan for SymmetricHashJoinExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index f90bbf061d38..3145c999d6b5 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -122,22 +122,22 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { /// Get the schema for this execution plan fn schema(&self) -> SchemaRef { - self.cache().schema().clone() + self.properties().schema().clone() } - fn cache(&self) -> &PlanPropertiesCache; + fn properties(&self) -> &PlanProperties; /// Specifies how the output of this `ExecutionPlan` is split into /// partitions. fn output_partitioning(&self) -> &Partitioning { - &self.cache().partitioning + &self.properties().partitioning } /// Specifies whether this plan generates an infinite stream of records. /// If the plan does not support pipelining, but its input(s) are /// infinite, returns an error to indicate this. fn execution_mode(&self) -> ExecutionMode { - self.cache().exec_mode + self.properties().exec_mode } /// If the output of this `ExecutionPlan` within each partition is sorted, @@ -151,7 +151,7 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { /// It is safe to return `None` here if your `ExecutionPlan` does not /// have any particular output order here fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { - self.cache().output_ordering.as_deref() + self.properties().output_ordering.as_deref() } /// Specifies the data distribution requirements for all the @@ -230,7 +230,7 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { /// See also [`Self::maintains_input_order`] and [`Self::output_ordering`] /// for related concepts. fn equivalence_properties(&self) -> &EquivalenceProperties { - &self.cache().eq_properties + &self.properties().eq_properties } /// Get a list of children `ExecutionPlan`s that act as inputs to this plan. @@ -482,7 +482,7 @@ impl ExecutionMode { } /// Conservatively "combines" execution modes of a given collection of operators. -fn exec_mode_flatten<'a>( +fn execution_mode_from_children<'a>( children: impl IntoIterator>, ) -> ExecutionMode { let mut result = ExecutionMode::Bounded; @@ -506,12 +506,12 @@ fn exec_mode_flatten<'a>( result } -/// Represents a cache for plan properties used in query optimization. +/// Stores the plan properties used in query optimization. /// /// This struct holds various properties useful for the query planning, which are used /// during optimization and execution phases. #[derive(Debug, Clone)] -pub struct PlanPropertiesCache { +pub struct PlanProperties { /// Stores the [`EquivalenceProperties`] of the [`ExecutionPlan`]. 
pub eq_properties: EquivalenceProperties, /// Stores the output [`Partitioning`] of the [`ExecutionPlan`]. @@ -523,7 +523,7 @@ pub struct PlanPropertiesCache { output_ordering: Option, } -impl PlanPropertiesCache { +impl PlanProperties { /// Construct a new `PlanPropertiesCache` from the pub fn new( eq_properties: EquivalenceProperties, diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index e678360dd471..a0b49d4ef136 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -24,7 +24,7 @@ use std::task::{Context, Poll}; use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; use super::{ - DisplayAs, ExecutionMode, PlanPropertiesCache, RecordBatchStream, + DisplayAs, ExecutionMode, PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, }; use crate::{DisplayFormatType, Distribution, ExecutionPlan, Partitioning}; @@ -51,13 +51,13 @@ pub struct GlobalLimitExec { fetch: Option, /// Execution metrics metrics: ExecutionPlanMetricsSet, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl GlobalLimitExec { /// Create a new GlobalLimitExec pub fn new(input: Arc, skip: usize, fetch: Option) -> Self { - let cache = Self::create_cache(&input); + let cache = Self::compute_properties(&input); GlobalLimitExec { input, skip, @@ -83,8 +83,8 @@ impl GlobalLimitExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache(input: &Arc) -> PlanPropertiesCache { - PlanPropertiesCache::new( + fn compute_properties(input: &Arc) -> PlanProperties { + PlanProperties::new( input.equivalence_properties().clone(), // Equivalence Properties Partitioning::UnknownPartitioning(1), // Output Partitioning ExecutionMode::Bounded, // Execution Mode @@ -117,7 +117,7 @@ impl ExecutionPlan for GlobalLimitExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } @@ -268,13 +268,13 @@ pub struct LocalLimitExec { fetch: usize, /// Execution metrics metrics: ExecutionPlanMetricsSet, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl LocalLimitExec { /// Create a new LocalLimitExec partition pub fn new(input: Arc, fetch: usize) -> Self { - let cache = Self::create_cache(&input); + let cache = Self::compute_properties(&input); Self { input, fetch, @@ -294,8 +294,8 @@ impl LocalLimitExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
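With the struct renamed, the convenience accessors on the plan are thin views over the cached `PlanProperties`, as the `lib.rs` hunk above shows. A short usage sketch (`plan` stands for any `Arc<dyn ExecutionPlan>`; the trait import is what call sites need once the accessors move to `ExecutionPlanProperties` later in this series):

```rust
use std::sync::Arc;

use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties};

fn describe(plan: &Arc<dyn ExecutionPlan>) {
    let _schema = plan.schema();              // properties().schema().clone()
    let _parts = plan.output_partitioning();  // &properties().partitioning
    let _mode = plan.execution_mode();        // properties().exec_mode
    let _order = plan.output_ordering();      // cached output ordering, if any
}
```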
- fn create_cache(input: &Arc) -> PlanPropertiesCache { - PlanPropertiesCache::new( + fn compute_properties(input: &Arc) -> PlanProperties { + PlanProperties::new( input.equivalence_properties().clone(), // Equivalence Properties input.output_partitioning().clone(), // Output Partitioning ExecutionMode::Bounded, // Execution Mode @@ -323,7 +323,7 @@ impl ExecutionPlan for LocalLimitExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/memory.rs b/datafusion/physical-plan/src/memory.rs index 8bd4db0bd418..23699295e121 100644 --- a/datafusion/physical-plan/src/memory.rs +++ b/datafusion/physical-plan/src/memory.rs @@ -25,7 +25,7 @@ use std::task::{Context, Poll}; use super::expressions::PhysicalSortExpr; use super::{ common, DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, Partitioning, - PlanPropertiesCache, RecordBatchStream, SendableRecordBatchStream, Statistics, + PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, }; use arrow::datatypes::SchemaRef; @@ -48,7 +48,7 @@ pub struct MemoryExec { projection: Option>, // Sort information: one or more equivalent orderings sort_information: Vec, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl fmt::Debug for MemoryExec { @@ -101,7 +101,7 @@ impl ExecutionPlan for MemoryExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } @@ -153,7 +153,7 @@ impl MemoryExec { projection: Option>, ) -> Result { let projected_schema = project_schema(&schema, projection.as_ref())?; - let cache = Self::create_cache(projected_schema.clone(), &[], partitions); + let cache = Self::compute_properties(projected_schema.clone(), &[], partitions); Ok(Self { partitions: partitions.to_vec(), schema, @@ -205,13 +205,13 @@ impl MemoryExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache( + fn compute_properties( schema: SchemaRef, orderings: &[LexOrdering], partitions: &[Vec], - ) -> PlanPropertiesCache { + ) -> PlanProperties { let eq_properties = EquivalenceProperties::new_with_orderings(schema, orderings); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, // Equivalence Properties Partitioning::UnknownPartitioning(partitions.len()), // Output Partitioning ExecutionMode::Bounded, // Execution Mode diff --git a/datafusion/physical-plan/src/placeholder_row.rs b/datafusion/physical-plan/src/placeholder_row.rs index 3280522e152c..3fc9a512e79e 100644 --- a/datafusion/physical-plan/src/placeholder_row.rs +++ b/datafusion/physical-plan/src/placeholder_row.rs @@ -21,7 +21,7 @@ use std::any::Any; use std::sync::Arc; use super::{ - common, DisplayAs, ExecutionMode, PlanPropertiesCache, SendableRecordBatchStream, + common, DisplayAs, ExecutionMode, PlanProperties, SendableRecordBatchStream, Statistics, }; use crate::{memory::MemoryStream, DisplayFormatType, ExecutionPlan, Partitioning}; @@ -43,14 +43,14 @@ pub struct PlaceholderRowExec { schema: SchemaRef, /// Number of partitions partitions: usize, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl PlaceholderRowExec { /// Create a new PlaceholderRowExec pub fn new(schema: SchemaRef) -> Self { let partitions = 1; - let cache = Self::create_cache(schema.clone(), partitions); + let cache = Self::compute_properties(schema.clone(), partitions); PlaceholderRowExec { schema, partitions, @@ -95,16 +95,12 @@ impl PlaceholderRowExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache(schema: SchemaRef, n_partitions: usize) -> PlanPropertiesCache { + fn compute_properties(schema: SchemaRef, n_partitions: usize) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); // Get output partitioning: let output_partitioning = Self::output_partitioning_helper(n_partitions); - PlanPropertiesCache::new( - eq_properties, - output_partitioning, - ExecutionMode::Bounded, - ) + PlanProperties::new(eq_properties, output_partitioning, ExecutionMode::Bounded) } } @@ -128,7 +124,7 @@ impl ExecutionPlan for PlaceholderRowExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs index 2ed8095f256c..402feeaf80ba 100644 --- a/datafusion/physical-plan/src/projection.rs +++ b/datafusion/physical-plan/src/projection.rs @@ -29,8 +29,7 @@ use std::task::{Context, Poll}; use super::expressions::Column; use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; use super::{ - DisplayAs, PlanPropertiesCache, RecordBatchStream, SendableRecordBatchStream, - Statistics, + DisplayAs, PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, }; use crate::{ ColumnStatistics, DisplayFormatType, ExecutionPlan, Partitioning, PhysicalExpr, @@ -59,7 +58,7 @@ pub struct ProjectionExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, /// Cache holding plan properties like equivalences, output partitioning etc. 
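Leaf operators such as `MemoryExec` and `PlaceholderRowExec` have no input plan to inherit from, so they seed the equivalences from their own schema and any known orderings, as in the hunks above. A sketch of that shape (a simplified free function, not the actual method; import paths as used in this patch series):

```rust
use arrow::datatypes::SchemaRef;
use datafusion_physical_expr::{EquivalenceProperties, LexOrdering};
use datafusion_physical_plan::{ExecutionMode, Partitioning, PlanProperties};

/// Properties for a leaf serving `n_partitions` of in-memory data.
fn leaf_properties(
    schema: SchemaRef,
    orderings: &[LexOrdering],
    n_partitions: usize,
) -> PlanProperties {
    PlanProperties::new(
        EquivalenceProperties::new_with_orderings(schema, orderings),
        Partitioning::UnknownPartitioning(n_partitions),
        ExecutionMode::Bounded, // in-memory data is always bounded
    )
}
```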
- cache: PlanPropertiesCache, + cache: PlanProperties, } impl ProjectionExec { @@ -93,7 +92,8 @@ impl ProjectionExec { // construct a map from the input expressions to the output expression of the Projection let projection_mapping = ProjectionMapping::try_new(&expr, &input_schema)?; - let cache = Self::create_cache(&input, &projection_mapping, schema.clone())?; + let cache = + Self::compute_properties(&input, &projection_mapping, schema.clone())?; Ok(Self { expr, schema, @@ -114,11 +114,11 @@ impl ProjectionExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache( + fn compute_properties( input: &Arc, projection_mapping: &ProjectionMapping, schema: SchemaRef, - ) -> Result { + ) -> Result { // Calculate equivalence properties: let mut input_eq_properties = input.equivalence_properties().clone(); input_eq_properties.substitute_oeq_class(projection_mapping)?; @@ -143,7 +143,7 @@ impl ProjectionExec { input_partition.clone() }; - Ok(PlanPropertiesCache::new( + Ok(PlanProperties::new( eq_properties, output_partitioning, input.execution_mode(), @@ -184,7 +184,7 @@ impl ExecutionPlan for ProjectionExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/recursive_query.rs b/datafusion/physical-plan/src/recursive_query.rs index fd0d506e2ce4..9786b1cbf6fd 100644 --- a/datafusion/physical-plan/src/recursive_query.rs +++ b/datafusion/physical-plan/src/recursive_query.rs @@ -24,7 +24,7 @@ use std::task::{Context, Poll}; use super::{ metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}, work_table::{WorkTable, WorkTableExec}, - PlanPropertiesCache, RecordBatchStream, SendableRecordBatchStream, Statistics, + PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, }; use crate::{DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan}; @@ -67,7 +67,7 @@ pub struct RecursiveQueryExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, /// Cache holding plan properties like equivalences, output partitioning etc. - cache: PlanPropertiesCache, + cache: PlanProperties, } impl RecursiveQueryExec { @@ -82,7 +82,7 @@ impl RecursiveQueryExec { let work_table = Arc::new(WorkTable::new()); // Use the same work table for both the WorkTableExec and the recursive term let recursive_term = assign_work_table(recursive_term, work_table.clone())?; - let cache = Self::create_cache(static_term.schema()); + let cache = Self::compute_properties(static_term.schema()); Ok(RecursiveQueryExec { name, static_term, @@ -95,10 +95,10 @@ impl RecursiveQueryExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + fn compute_properties(schema: SchemaRef) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Partitioning::UnknownPartitioning(1), ExecutionMode::Bounded, @@ -111,7 +111,7 @@ impl ExecutionPlan for RecursiveQueryExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index b9489bd12e64..1b92a0b03e49 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -34,9 +34,7 @@ use crate::repartition::distributor_channels::{ channels, partition_aware_channels, DistributionReceiver, DistributionSender, }; use crate::sorts::streaming_merge; -use crate::{ - DisplayFormatType, ExecutionPlan, Partitioning, PlanPropertiesCache, Statistics, -}; +use crate::{DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, Statistics}; use arrow::array::{ArrayRef, UInt64Builder}; use arrow::datatypes::SchemaRef; @@ -305,7 +303,7 @@ pub struct RepartitionExec { /// `SortPreservingRepartitionExec`, false means `RepartitionExec`. preserve_order: bool, /// Cache holding plan properties like equivalences, output partitioning etc. - cache: PlanPropertiesCache, + cache: PlanProperties, } #[derive(Debug, Clone)] @@ -411,7 +409,7 @@ impl ExecutionPlan for RepartitionExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } @@ -598,7 +596,8 @@ impl RepartitionExec { partitioning: Partitioning, ) -> Result { let preserve_order = false; - let cache = Self::create_cache(&input, partitioning.clone(), preserve_order); + let cache = + Self::compute_properties(&input, partitioning.clone(), preserve_order); Ok(RepartitionExec { input, partitioning, @@ -634,15 +633,15 @@ impl RepartitionExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache( + fn compute_properties( input: &Arc, partitioning: Partitioning, preserve_order: bool, - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Equivalence Properties let eq_properties = Self::eq_properties_helper(input, preserve_order); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, // Equivalence Properties partitioning, // Output Partitioning input.execution_mode(), // Execution Mode diff --git a/datafusion/physical-plan/src/sorts/partial_sort.rs b/datafusion/physical-plan/src/sorts/partial_sort.rs index 095245a706ea..01e592d29f7b 100644 --- a/datafusion/physical-plan/src/sorts/partial_sort.rs +++ b/datafusion/physical-plan/src/sorts/partial_sort.rs @@ -62,7 +62,7 @@ use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; use crate::sorts::sort::sort_batch; use crate::{ DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, - PlanPropertiesCache, SendableRecordBatchStream, Statistics, + PlanProperties, SendableRecordBatchStream, Statistics, }; use arrow::compute::concat_batches; @@ -94,7 +94,7 @@ pub struct PartialSortExec { /// Fetch highest/lowest n results fetch: Option, /// Cache holding plan properties like equivalences, output partitioning etc. 
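For `RepartitionExec` (hunk above) the interesting component is the partitioning itself: the requested scheme is stored verbatim in the cache while equivalences and execution mode come from the input. A hedged sketch that checks this through the public accessors (the `assert_eq!` assumes `PlanProperties::output_partitioning()` as used elsewhere in this series):

```rust
use std::sync::Arc;

use datafusion_common::Result;
use datafusion_physical_plan::repartition::RepartitionExec;
use datafusion_physical_plan::{ExecutionPlan, Partitioning};

fn round_robin(
    input: Arc<dyn ExecutionPlan>,
    n: usize,
) -> Result<Arc<dyn ExecutionPlan>> {
    let exec = RepartitionExec::try_new(input, Partitioning::RoundRobinBatch(n))?;
    // The cached partitioning is exactly the requested scheme:
    assert_eq!(exec.properties().output_partitioning().partition_count(), n);
    Ok(Arc::new(exec))
}
```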
- cache: PlanPropertiesCache, + cache: PlanProperties, } impl PartialSortExec { @@ -106,7 +106,7 @@ impl PartialSortExec { ) -> Self { assert!(common_prefix_length > 0); let preserve_partitioning = false; - let cache = Self::create_cache(&input, expr.clone(), preserve_partitioning); + let cache = Self::compute_properties(&input, expr.clone(), preserve_partitioning); Self { input, expr, @@ -181,11 +181,11 @@ impl PartialSortExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache( + fn compute_properties( input: &Arc, sort_exprs: LexOrdering, preserve_partitioning: bool, - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Calculate equivalence properties; i.e. reset the ordering equivalence // class with the new ordering: let eq_properties = input @@ -200,7 +200,7 @@ impl PartialSortExec { // Determine execution mode: let mode = input.execution_mode(); - PlanPropertiesCache::new(eq_properties, output_partitioning, mode) + PlanProperties::new(eq_properties, output_partitioning, mode) } } @@ -230,7 +230,7 @@ impl ExecutionPlan for PartialSortExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 713ff86a5072..7109d730f1dd 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -37,8 +37,7 @@ use crate::stream::{RecordBatchReceiverStream, RecordBatchStreamAdapter}; use crate::topk::TopK; use crate::{ DisplayAs, DisplayFormatType, Distribution, EmptyRecordBatchStream, ExecutionMode, - ExecutionPlan, Partitioning, PlanPropertiesCache, SendableRecordBatchStream, - Statistics, + ExecutionPlan, Partitioning, PlanProperties, SendableRecordBatchStream, Statistics, }; use arrow::compute::{concat_batches, lexsort_to_indices, take}; @@ -678,7 +677,7 @@ pub struct SortExec { /// Fetch highest/lowest n results fetch: Option, /// Cache holding plan properties like equivalences, output partitioning etc. - cache: PlanPropertiesCache, + cache: PlanProperties, } impl SortExec { @@ -696,7 +695,7 @@ impl SortExec { /// sorted output partition. pub fn new(expr: Vec, input: Arc) -> Self { let preserve_partitioning = false; - let cache = Self::create_cache(&input, expr.clone(), preserve_partitioning); + let cache = Self::compute_properties(&input, expr.clone(), preserve_partitioning); Self { expr, input, @@ -787,11 +786,11 @@ impl SortExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache( + fn compute_properties( input: &Arc, sort_exprs: LexOrdering, preserve_partitioning: bool, - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Calculate equivalence properties; i.e. 
reset the ordering equivalence // class with the new ordering: let eq_properties = input @@ -811,7 +810,7 @@ impl SortExec { ExecutionMode::Bounded => ExecutionMode::Bounded, }; - PlanPropertiesCache::new(eq_properties, output_partitioning, mode) + PlanProperties::new(eq_properties, output_partitioning, mode) } } @@ -840,7 +839,7 @@ impl ExecutionPlan for SortExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index 16bdecd0f384..862146e10549 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -26,7 +26,7 @@ use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; use crate::sorts::streaming_merge; use crate::{ DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, - PlanPropertiesCache, SendableRecordBatchStream, Statistics, + PlanProperties, SendableRecordBatchStream, Statistics, }; use datafusion_common::{internal_err, DataFusionError, Result}; @@ -74,13 +74,13 @@ pub struct SortPreservingMergeExec { /// Optional number of rows to fetch. Stops producing rows after this fetch fetch: Option, /// Cache holding plan properties like equivalences, output partitioning etc. - cache: PlanPropertiesCache, + cache: PlanProperties, } impl SortPreservingMergeExec { /// Create a new sort execution plan pub fn new(expr: Vec, input: Arc) -> Self { - let cache = Self::create_cache(&input); + let cache = Self::compute_properties(&input); Self { input, expr, @@ -111,8 +111,8 @@ impl SortPreservingMergeExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
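The `SortExec` and `PartialSortExec` hunks above show the other common shape: a sort resets the ordering equivalence class to its own sort expressions and, because a full sort must consume all of its input, it cannot stream unbounded input. A sketch of that computation; the partitioning branch and the non-`Bounded` match arm are elided in the hunks above and reconstructed here as assumptions:

```rust
use std::sync::Arc;

use datafusion_physical_expr::LexOrdering;
use datafusion_physical_plan::{
    ExecutionMode, ExecutionPlan, ExecutionPlanProperties, Partitioning, PlanProperties,
};

fn sort_properties(
    input: &Arc<dyn ExecutionPlan>,
    sort_exprs: LexOrdering,
    preserve_partitioning: bool,
) -> PlanProperties {
    // Reset the ordering equivalence class to the new sort order:
    let eq_properties = input
        .equivalence_properties()
        .clone()
        .with_reorder(sort_exprs);
    // A partitioned sort keeps the input partitioning; otherwise one output:
    let output_partitioning = if preserve_partitioning {
        input.output_partitioning().clone()
    } else {
        Partitioning::UnknownPartitioning(1)
    };
    // An unbounded input makes a full sort pipeline-breaking:
    let mode = match input.execution_mode() {
        ExecutionMode::Bounded => ExecutionMode::Bounded,
        _ => ExecutionMode::PipelineBreaking,
    };
    PlanProperties::new(eq_properties, output_partitioning, mode)
}
```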
- fn create_cache(input: &Arc) -> PlanPropertiesCache { - PlanPropertiesCache::new( + fn compute_properties(input: &Arc) -> PlanProperties { + PlanProperties::new( input.equivalence_properties().clone(), // Equivalence Properties Partitioning::UnknownPartitioning(1), // Output Partitioning input.execution_mode(), // Execution Mode @@ -149,7 +149,7 @@ impl ExecutionPlan for SortPreservingMergeExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/streaming.rs b/datafusion/physical-plan/src/streaming.rs index 90e8600d78e5..123588c34dff 100644 --- a/datafusion/physical-plan/src/streaming.rs +++ b/datafusion/physical-plan/src/streaming.rs @@ -20,7 +20,7 @@ use std::any::Any; use std::sync::Arc; -use super::{DisplayAs, DisplayFormatType, ExecutionMode, PlanPropertiesCache}; +use super::{DisplayAs, DisplayFormatType, ExecutionMode, PlanProperties}; use crate::display::{display_orderings, ProjectSchemaDisplay}; use crate::stream::RecordBatchStreamAdapter; use crate::{ExecutionPlan, Partitioning, SendableRecordBatchStream}; @@ -58,7 +58,7 @@ pub struct StreamingTableExec { projected_schema: SchemaRef, projected_output_ordering: Vec, infinite: bool, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl StreamingTableExec { @@ -87,7 +87,7 @@ impl StreamingTableExec { }; let projected_output_ordering = projected_output_ordering.into_iter().collect::>(); - let cache = Self::create_cache( + let cache = Self::compute_properties( projected_schema.clone(), &projected_output_ordering, &partitions, @@ -128,12 +128,12 @@ impl StreamingTableExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache( + fn compute_properties( schema: SchemaRef, orderings: &[LexOrdering], partitions: &[Arc], is_infinite: bool, - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Calculate equivalence properties: let eq_properties = EquivalenceProperties::new_with_orderings(schema, orderings); @@ -147,7 +147,7 @@ impl StreamingTableExec { ExecutionMode::Bounded }; - PlanPropertiesCache::new(eq_properties, output_partitioning, mode) + PlanProperties::new(eq_properties, output_partitioning, mode) } } @@ -195,7 +195,7 @@ impl ExecutionPlan for StreamingTableExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/test/exec.rs b/datafusion/physical-plan/src/test/exec.rs index a70e05809923..23df3753e817 100644 --- a/datafusion/physical-plan/src/test/exec.rs +++ b/datafusion/physical-plan/src/test/exec.rs @@ -27,7 +27,7 @@ use std::{ use crate::stream::{RecordBatchReceiverStream, RecordBatchStreamAdapter}; use crate::{ common, DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, Partitioning, - PlanPropertiesCache, RecordBatchStream, SendableRecordBatchStream, Statistics, + PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, }; use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; @@ -121,7 +121,7 @@ pub struct MockExec { /// if true (the default), sends data using a separate task to to ensure the /// batches are not available without this stream yielding first use_task: bool, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl MockExec { @@ -133,7 +133,7 @@ impl MockExec { /// ensure any poll loops are correct. 
This behavior can be /// changed with `with_use_task` pub fn new(data: Vec>, schema: SchemaRef) -> Self { - let cache = Self::create_cache(schema.clone()); + let cache = Self::compute_properties(schema.clone()); Self { data, schema, @@ -151,10 +151,10 @@ impl MockExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + fn compute_properties(schema: SchemaRef) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Partitioning::UnknownPartitioning(1), ExecutionMode::Bounded, @@ -181,7 +181,7 @@ impl ExecutionPlan for MockExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } @@ -282,7 +282,7 @@ pub struct BarrierExec { /// all streams wait on this barrier to produce barrier: Arc, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl BarrierExec { @@ -290,7 +290,7 @@ impl BarrierExec { pub fn new(data: Vec>, schema: SchemaRef) -> Self { // wait for all streams and the input let barrier = Arc::new(Barrier::new(data.len() + 1)); - let cache = Self::create_cache(schema.clone(), &data); + let cache = Self::compute_properties(schema.clone(), &data); Self { data, schema, @@ -307,9 +307,12 @@ impl BarrierExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache(schema: SchemaRef, data: &[Vec]) -> PlanPropertiesCache { + fn compute_properties( + schema: SchemaRef, + data: &[Vec], + ) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Partitioning::UnknownPartitioning(data.len()), ExecutionMode::Bounded, @@ -336,7 +339,7 @@ impl ExecutionPlan for BarrierExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } @@ -394,7 +397,7 @@ impl ExecutionPlan for BarrierExec { /// A mock execution plan that errors on a call to execute #[derive(Debug)] pub struct ErrorExec { - cache: PlanPropertiesCache, + cache: PlanProperties, } impl Default for ErrorExec { @@ -410,15 +413,15 @@ impl ErrorExec { DataType::Int64, true, )])); - let cache = Self::create_cache(schema.clone()); + let cache = Self::compute_properties(schema.clone()); Self { cache } } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + fn compute_properties(schema: SchemaRef) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Partitioning::UnknownPartitioning(1), ExecutionMode::Bounded, @@ -445,7 +448,7 @@ impl ExecutionPlan for ErrorExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } @@ -475,7 +478,7 @@ impl ExecutionPlan for ErrorExec { pub struct StatisticsExec { stats: Statistics, schema: Arc, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl StatisticsExec { pub fn new(stats: Statistics, schema: Schema) -> Self { @@ -484,7 +487,7 @@ impl StatisticsExec { .column_statistics.len(), schema.fields().len(), "if defined, the column statistics vector length should be the number of fields" ); - let cache = Self::create_cache(Arc::new(schema.clone())); + let cache = Self::compute_properties(Arc::new(schema.clone())); Self { stats, schema: Arc::new(schema), @@ -493,10 +496,10 @@ impl StatisticsExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + fn compute_properties(schema: SchemaRef) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Partitioning::UnknownPartitioning(2), ExecutionMode::Bounded, @@ -528,7 +531,7 @@ impl ExecutionPlan for StatisticsExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } @@ -566,13 +569,13 @@ pub struct BlockingExec { /// Ref-counting helper to check if the plan and the produced stream are still in memory. refs: Arc<()>, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl BlockingExec { /// Create new [`BlockingExec`] with a give schema and number of partitions. pub fn new(schema: SchemaRef, n_partitions: usize) -> Self { - let cache = Self::create_cache(schema.clone(), n_partitions); + let cache = Self::compute_properties(schema.clone(), n_partitions); Self { schema, refs: Default::default(), @@ -590,10 +593,10 @@ impl BlockingExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache(schema: SchemaRef, n_partitions: usize) -> PlanPropertiesCache { + fn compute_properties(schema: SchemaRef, n_partitions: usize) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Partitioning::UnknownPartitioning(n_partitions), ExecutionMode::Bounded, @@ -620,7 +623,7 @@ impl ExecutionPlan for BlockingExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } @@ -704,7 +707,7 @@ pub struct PanicExec { /// Number of output partitions. Each partition will produce this /// many empty output record batches prior to panicing batches_until_panics: Vec, - cache: PlanPropertiesCache, + cache: PlanProperties, } impl PanicExec { @@ -712,7 +715,7 @@ impl PanicExec { /// partitions, which will each panic immediately. 
pub fn new(schema: SchemaRef, n_partitions: usize) -> Self { let batches_until_panics = vec![0; n_partitions]; - let cache = Self::create_cache(schema.clone(), &batches_until_panics); + let cache = Self::compute_properties(schema.clone(), &batches_until_panics); Self { schema, batches_until_panics, @@ -727,14 +730,14 @@ impl PanicExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache( + fn compute_properties( schema: SchemaRef, batches_until_panics: &[usize], - ) -> PlanPropertiesCache { + ) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); let num_partitions = batches_until_panics.len(); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Partitioning::UnknownPartitioning(num_partitions), ExecutionMode::Bounded, @@ -761,7 +764,7 @@ impl ExecutionPlan for PanicExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index 4d5377a9bdcc..a533f2249588 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -27,10 +27,10 @@ use std::task::{Context, Poll}; use std::{any::Any, sync::Arc}; use super::{ - exec_mode_flatten, + execution_mode_from_children, metrics::{ExecutionPlanMetricsSet, MetricsSet}, ColumnStatistics, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, - PlanPropertiesCache, RecordBatchStream, SendableRecordBatchStream, Statistics, + PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, }; use crate::metrics::BaselineMetrics; use crate::stream::ObservedStream; @@ -91,14 +91,14 @@ pub struct UnionExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, /// Cache holding plan properties like equivalences, output partitioning etc. - cache: PlanPropertiesCache, + cache: PlanProperties, } impl UnionExec { /// Create a new UnionExec pub fn new(inputs: Vec>) -> Self { let schema = union_schema(&inputs); - let cache = Self::create_cache(&inputs, schema); + let cache = Self::compute_properties(&inputs, schema); UnionExec { inputs, metrics: ExecutionPlanMetricsSet::new(), @@ -112,10 +112,10 @@ impl UnionExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache( + fn compute_properties( inputs: &[Arc], schema: SchemaRef, - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Calculate equivalence properties: // TODO: In some cases, we should be able to preserve some equivalence // classes and constants. Add support for such cases. @@ -161,9 +161,9 @@ impl UnionExec { let output_partitioning = Partitioning::UnknownPartitioning(num_partitions); // Determine execution mode: - let mode = exec_mode_flatten(inputs.iter()); + let mode = execution_mode_from_children(inputs.iter()); - PlanPropertiesCache::new(eq_properties, output_partitioning, mode) + PlanProperties::new(eq_properties, output_partitioning, mode) } } @@ -187,7 +187,7 @@ impl ExecutionPlan for UnionExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } @@ -317,7 +317,7 @@ pub struct InterleaveExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, /// Cache holding plan properties like equivalences, output partitioning etc. 
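`UnionExec` (hunk above) derives its mode from all of its children via the renamed `execution_mode_from_children`. That helper is crate-private and its loop body is elided here, so the following is a simplified restatement of its conservative semantics rather than the exact implementation:

```rust
use datafusion_physical_plan::ExecutionMode;

/// Simplified restatement of the rule: PipelineBreaking dominates, then
/// Unbounded; the result is Bounded only if every child is Bounded.
fn combine_modes(modes: impl IntoIterator<Item = ExecutionMode>) -> ExecutionMode {
    let mut result = ExecutionMode::Bounded;
    for mode in modes {
        result = match (mode, result) {
            (ExecutionMode::PipelineBreaking, _)
            | (_, ExecutionMode::PipelineBreaking) => ExecutionMode::PipelineBreaking,
            (ExecutionMode::Unbounded, _) | (_, ExecutionMode::Unbounded) => {
                ExecutionMode::Unbounded
            }
            _ => ExecutionMode::Bounded,
        };
    }
    result
}
```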
- cache: PlanPropertiesCache, + cache: PlanProperties, } impl InterleaveExec { @@ -328,7 +328,7 @@ impl InterleaveExec { "Not all InterleaveExec children have a consistent hash partitioning" ); } - let cache = Self::create_cache(&inputs); + let cache = Self::compute_properties(&inputs); Ok(InterleaveExec { inputs, metrics: ExecutionPlanMetricsSet::new(), @@ -342,15 +342,15 @@ impl InterleaveExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache(inputs: &[Arc]) -> PlanPropertiesCache { + fn compute_properties(inputs: &[Arc]) -> PlanProperties { let schema = union_schema(inputs); let eq_properties = EquivalenceProperties::new(schema); // Get output partitioning: let output_partitioning = inputs[0].output_partitioning().clone(); // Determine execution mode: - let mode = exec_mode_flatten(inputs.iter()); + let mode = execution_mode_from_children(inputs.iter()); - PlanPropertiesCache::new(eq_properties, output_partitioning, mode) + PlanProperties::new(eq_properties, output_partitioning, mode) } } @@ -374,7 +374,7 @@ impl ExecutionPlan for InterleaveExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index d727091fd1c3..72fcbfd2ffb9 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -20,7 +20,7 @@ use std::{any::Any, sync::Arc}; use super::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}; -use super::{DisplayAs, PlanPropertiesCache}; +use super::{DisplayAs, PlanProperties}; use crate::{ expressions::Column, DisplayFormatType, Distribution, ExecutionPlan, PhysicalExpr, RecordBatchStream, SendableRecordBatchStream, @@ -60,7 +60,7 @@ pub struct UnnestExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, /// Cache holding plan properties like equivalences, output partitioning etc. - cache: PlanPropertiesCache, + cache: PlanProperties, } impl UnnestExec { @@ -71,7 +71,7 @@ impl UnnestExec { schema: SchemaRef, options: UnnestOptions, ) -> Self { - let cache = Self::create_cache(&input, schema.clone()); + let cache = Self::compute_properties(&input, schema.clone()); UnnestExec { input, schema, @@ -83,13 +83,13 @@ impl UnnestExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache( + fn compute_properties( input: &Arc, schema: SchemaRef, - ) -> PlanPropertiesCache { + ) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, input.output_partitioning().clone(), input.execution_mode(), @@ -116,7 +116,7 @@ impl ExecutionPlan for UnnestExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/values.rs b/datafusion/physical-plan/src/values.rs index f31272879279..1e535b43d7c0 100644 --- a/datafusion/physical-plan/src/values.rs +++ b/datafusion/physical-plan/src/values.rs @@ -21,7 +21,7 @@ use std::any::Any; use std::sync::Arc; use super::{ - common, DisplayAs, ExecutionMode, PlanPropertiesCache, SendableRecordBatchStream, + common, DisplayAs, ExecutionMode, PlanProperties, SendableRecordBatchStream, Statistics, }; use crate::{ @@ -43,7 +43,7 @@ pub struct ValuesExec { /// The data data: Vec, /// Cache holding plan properties like equivalences, output partitioning etc. - cache: PlanPropertiesCache, + cache: PlanProperties, } impl ValuesExec { @@ -114,7 +114,7 @@ impl ValuesExec { } } - let cache = Self::create_cache(schema.clone()); + let cache = Self::compute_properties(schema.clone()); Ok(ValuesExec { schema, data: batches, @@ -128,10 +128,10 @@ impl ValuesExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + fn compute_properties(schema: SchemaRef) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Partitioning::UnknownPartitioning(1), ExecutionMode::Bounded, @@ -159,7 +159,7 @@ impl ExecutionPlan for ValuesExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index d7579cdc041d..eb4a27341785 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -35,7 +35,7 @@ use crate::windows::{ }; use crate::{ ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, - InputOrderMode, PlanPropertiesCache, RecordBatchStream, SendableRecordBatchStream, + InputOrderMode, PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, WindowExpr, }; @@ -90,7 +90,7 @@ pub struct BoundedWindowAggExec { // See `get_ordered_partition_by_indices` for more details. ordered_partition_by_indices: Vec, /// Cache holding plan properties like equivalences, output partitioning etc. - cache: PlanPropertiesCache, + cache: PlanProperties, } impl BoundedWindowAggExec { @@ -121,7 +121,7 @@ impl BoundedWindowAggExec { vec![] } }; - let cache = Self::create_cache(&input, &schema, &window_expr); + let cache = Self::compute_properties(&input, &schema, &window_expr); Ok(Self { input, window_expr, @@ -183,11 +183,11 @@ impl BoundedWindowAggExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
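The `UnnestExec` hunk above illustrates the schema-changing variant of the pattern: equivalences restart from the output schema, while partitioning and mode still pass through from the input. A sketch under the same assumptions (a free function standing in for the actual method):

```rust
use std::sync::Arc;

use arrow::datatypes::SchemaRef;
use datafusion_physical_expr::EquivalenceProperties;
use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties, PlanProperties};

fn schema_changing_properties(
    input: &Arc<dyn ExecutionPlan>,
    output_schema: SchemaRef,
) -> PlanProperties {
    PlanProperties::new(
        // Equivalences restart from the *output* schema...
        EquivalenceProperties::new(output_schema),
        // ...while partitioning and mode pass through from the input:
        input.output_partitioning().clone(),
        input.execution_mode(),
    )
}
```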
- fn create_cache( + fn compute_properties( input: &Arc, schema: &SchemaRef, window_expr: &[Arc], - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Calculate equivalence properties: let eq_properties = window_equivalence_properties(schema, input, window_expr); @@ -197,7 +197,7 @@ impl BoundedWindowAggExec { let output_partitioning = input.output_partitioning().clone(); // Construct properties cache - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, // Equivalence Properties output_partitioning, // Output Partitioning input.execution_mode(), // Execution Mode @@ -240,7 +240,7 @@ impl ExecutionPlan for BoundedWindowAggExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/windows/window_agg_exec.rs b/datafusion/physical-plan/src/windows/window_agg_exec.rs index f143d228f381..efc84526c9ad 100644 --- a/datafusion/physical-plan/src/windows/window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/window_agg_exec.rs @@ -31,7 +31,7 @@ use crate::windows::{ }; use crate::{ ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, ExecutionMode, - ExecutionPlan, PhysicalExpr, PlanPropertiesCache, RecordBatchStream, + ExecutionPlan, PhysicalExpr, PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, WindowExpr, }; @@ -65,7 +65,7 @@ pub struct WindowAggExec { // see `get_ordered_partition_by_indices` for more details. ordered_partition_by_indices: Vec, /// Cache holding plan properties like equivalences, output partitioning etc. - cache: PlanPropertiesCache, + cache: PlanProperties, } impl WindowAggExec { @@ -80,7 +80,7 @@ impl WindowAggExec { let ordered_partition_by_indices = get_ordered_partition_by_indices(window_expr[0].partition_by(), &input); - let cache = Self::create_cache(schema.clone(), &input, &window_expr); + let cache = Self::compute_properties(schema.clone(), &input, &window_expr); Ok(Self { input, window_expr, @@ -117,11 +117,11 @@ impl WindowAggExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. 
- fn create_cache( + fn compute_properties( schema: SchemaRef, input: &Arc, window_expr: &[Arc], - ) -> PlanPropertiesCache { + ) -> PlanProperties { // Calculate equivalence properties: let eq_properties = window_equivalence_properties(&schema, input, window_expr); @@ -139,7 +139,7 @@ impl WindowAggExec { }; // Construct properties cache: - PlanPropertiesCache::new(eq_properties, output_partitioning, mode) + PlanProperties::new(eq_properties, output_partitioning, mode) } } @@ -177,7 +177,7 @@ impl ExecutionPlan for WindowAggExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } diff --git a/datafusion/physical-plan/src/work_table.rs b/datafusion/physical-plan/src/work_table.rs index 44a42a4fcf92..9acabf2447eb 100644 --- a/datafusion/physical-plan/src/work_table.rs +++ b/datafusion/physical-plan/src/work_table.rs @@ -25,9 +25,7 @@ use super::{ SendableRecordBatchStream, Statistics, }; use crate::memory::MemoryStream; -use crate::{ - DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, PlanPropertiesCache, -}; +use crate::{DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, PlanProperties}; use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; @@ -85,13 +83,13 @@ pub struct WorkTableExec { /// Execution metrics metrics: ExecutionPlanMetricsSet, /// Cache holding plan properties like equivalences, output partitioning etc. - cache: PlanPropertiesCache, + cache: PlanProperties, } impl WorkTableExec { /// Create a new execution plan for a worktable exec. pub fn new(name: String, schema: SchemaRef) -> Self { - let cache = Self::create_cache(schema.clone()); + let cache = Self::compute_properties(schema.clone()); Self { name, schema, @@ -112,10 +110,10 @@ impl WorkTableExec { } /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. - fn create_cache(schema: SchemaRef) -> PlanPropertiesCache { + fn compute_properties(schema: SchemaRef) -> PlanProperties { let eq_properties = EquivalenceProperties::new(schema); - PlanPropertiesCache::new( + PlanProperties::new( eq_properties, Partitioning::UnknownPartitioning(1), ExecutionMode::Bounded, @@ -142,7 +140,7 @@ impl ExecutionPlan for WorkTableExec { self } - fn cache(&self) -> &PlanPropertiesCache { + fn properties(&self) -> &PlanProperties { &self.cache } From a0128449970fa4cefa659b110ea5de8123ed5f8b Mon Sep 17 00:00:00 2001 From: Mustafa Akur <106137913+mustafasrepo@users.noreply.github.com> Date: Tue, 27 Feb 2024 19:09:54 +0300 Subject: [PATCH 30/45] Update datafusion/physical-plan/src/lib.rs Co-authored-by: Andrew Lamb --- datafusion/physical-plan/src/lib.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index d18b8f238859..cebe3a8837ad 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -460,6 +460,9 @@ pub enum ExecutionMode { /// Represents the mode where generated stream is unbounded, e.g. infinite. /// Even though the operator generates an unbounded stream of results, it /// works with bounded memory and execution can still continue successfully. + /// + /// The stream that results from calling `execute` on an `ExecutionPlan` that is `Unbounded` + /// will never be done (return `None`), except in case of error. 
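The contract documented above matters to consumers: a stream produced by an `Unbounded` plan only terminates on error. A sketch of a driver loop written against that contract (`futures::StreamExt` supplies `next`; the batch-consuming step is a placeholder, not part of the patch):

```rust
use std::sync::Arc;

use datafusion_common::Result;
use datafusion_execution::TaskContext;
use datafusion_physical_plan::ExecutionPlan;
use futures::StreamExt;

/// Drive one partition of an unbounded plan; `None` from `next()` is not
/// expected here, so the loop ends only on error or external cancellation.
async fn drive(plan: Arc<dyn ExecutionPlan>, ctx: Arc<TaskContext>) -> Result<()> {
    let mut stream = plan.execute(0, ctx)?;
    while let Some(batch) = stream.next().await {
        let _batch = batch?; // an Err terminates the stream
        // ... consume the batch here (placeholder) ...
    }
    Ok(())
}
```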
Unbounded, /// Represents the mode where some of the operator's input stream(s) are /// unbounded; however, the operator cannot generate streaming results from From e4a994772b17d0e3f30e467ce1a685533062c7e7 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Tue, 27 Feb 2024 19:13:08 +0300 Subject: [PATCH 31/45] Update comments --- datafusion/physical-plan/src/lib.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index cebe3a8837ad..e21a96cee2ff 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -461,7 +461,7 @@ pub enum ExecutionMode { /// Even though the operator generates an unbounded stream of results, it /// works with bounded memory and execution can still continue successfully. /// - /// The stream that results from calling `execute` on an `ExecutionPlan` that is `Unbounded` + /// The stream that results from calling `execute` on an `ExecutionPlan` that is `Unbounded` /// will never be done (return `None`), except in case of error. Unbounded, /// Represents the mode where some of the operator's input stream(s) are @@ -511,8 +511,8 @@ fn execution_mode_from_children<'a>( /// Stores the plan properties used in query optimization. /// -/// This struct holds various properties useful for the query planning, which are used -/// during optimization and execution phases. +/// These properties are in a single structure to permit this information to be computed +/// once and then those cached results used multiple times without recomputation (aka a cache) #[derive(Debug, Clone)] pub struct PlanProperties { /// Stores the [`EquivalenceProperties`] of the [`ExecutionPlan`]. From a940a46a6677c6f9639272f12164ba8a71b2b7c4 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Tue, 27 Feb 2024 19:44:12 +0300 Subject: [PATCH 32/45] Move properties to another trait. 
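This commit splits the property accessors out of `ExecutionPlan` into a separate extension trait, so call sites now import `ExecutionPlanProperties` alongside `ExecutionPlan`; that import churn makes up most of the diffstat below. A sketch of the call-site shape after the move, with the method set inferred from the call sites in this patch:

```rust
use std::sync::Arc;

// The extension trait rides along with ExecutionPlan at call sites:
use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties};

fn is_single_partition(plan: &Arc<dyn ExecutionPlan>) -> bool {
    plan.output_partitioning().partition_count() == 1
}

fn is_sorted(plan: &Arc<dyn ExecutionPlan>) -> bool {
    plan.output_ordering().is_some()
}
```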
--- datafusion/core/src/dataframe/mod.rs | 2 +- .../core/src/datasource/listing/table.rs | 1 + datafusion/core/src/datasource/memory.rs | 5 +- .../datasource/physical_plan/arrow_file.rs | 2 +- .../core/src/datasource/physical_plan/csv.rs | 3 +- .../core/src/datasource/physical_plan/json.rs | 3 +- .../datasource/physical_plan/parquet/mod.rs | 12 +- .../enforce_distribution.rs | 2 + .../src/physical_optimizer/enforce_sorting.rs | 5 +- .../src/physical_optimizer/join_selection.rs | 2 + .../limited_distinct_aggregation.rs | 1 + .../physical_optimizer/output_requirements.rs | 4 +- .../physical_optimizer/pipeline_checker.rs | 1 + .../physical_optimizer/projection_pushdown.rs | 1 + .../replace_with_order_preserving_variants.rs | 1 + .../src/physical_optimizer/sort_pushdown.rs | 3 +- .../physical_optimizer/topk_aggregation.rs | 3 +- .../core/src/physical_optimizer/utils.rs | 1 + datafusion/core/src/physical_planner.rs | 1 + .../physical-plan/src/aggregates/mod.rs | 4 +- .../physical-plan/src/aggregates/row_hash.rs | 3 +- datafusion/physical-plan/src/analyze.rs | 5 +- .../physical-plan/src/coalesce_batches.rs | 2 +- .../physical-plan/src/coalesce_partitions.rs | 5 +- datafusion/physical-plan/src/common.rs | 2 +- datafusion/physical-plan/src/filter.rs | 4 +- datafusion/physical-plan/src/insert.rs | 4 +- .../physical-plan/src/joins/cross_join.rs | 1 + .../physical-plan/src/joins/hash_join.rs | 1 + .../src/joins/nested_loop_join.rs | 4 +- .../src/joins/sort_merge_join.rs | 12 +- .../src/joins/symmetric_hash_join.rs | 4 +- datafusion/physical-plan/src/lib.rs | 134 +++++++++++------- datafusion/physical-plan/src/limit.rs | 4 +- datafusion/physical-plan/src/projection.rs | 3 +- .../physical-plan/src/repartition/mod.rs | 4 +- .../physical-plan/src/sorts/partial_sort.rs | 4 +- datafusion/physical-plan/src/sorts/sort.rs | 3 +- .../src/sorts/sort_preserving_merge.rs | 4 +- datafusion/physical-plan/src/union.rs | 7 +- datafusion/physical-plan/src/unnest.rs | 2 +- .../src/windows/bounded_window_agg_exec.rs | 4 +- datafusion/physical-plan/src/windows/mod.rs | 2 +- .../src/windows/window_agg_exec.rs | 4 +- 44 files changed, 170 insertions(+), 109 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 7898b71e23f6..1372570179fe 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -1517,7 +1517,7 @@ mod tests { WindowFunctionDefinition, }; use datafusion_physical_expr::expressions::Column; - use datafusion_physical_plan::get_plan_string; + use datafusion_physical_plan::{get_plan_string, ExecutionPlanProperties}; // Get string representation of the plan async fn assert_physical_plan(df: &DataFrame, expected: Vec<&str>) { diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 56e64f556c12..a1f3d14aacca 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -920,6 +920,7 @@ mod tests { use datafusion_common::{assert_contains, GetExt, ScalarValue}; use datafusion_expr::{BinaryExpr, LogicalPlanBuilder, Operator}; use datafusion_physical_expr::PhysicalSortExpr; + use datafusion_physical_plan::ExecutionPlanProperties; use tempfile::TempDir; #[tokio::test] diff --git a/datafusion/core/src/datasource/memory.rs b/datafusion/core/src/datasource/memory.rs index 901e74dfc218..b4a51be264eb 100644 --- a/datafusion/core/src/datasource/memory.rs +++ b/datafusion/core/src/datasource/memory.rs @@ -32,6 +32,7 @@ use 
datafusion_common::{ not_impl_err, plan_err, Constraints, DFSchema, DataFusionError, SchemaExt, }; use datafusion_execution::TaskContext; +use datafusion_physical_plan::ExecutionPlanProperties; use parking_lot::Mutex; use tokio::sync::RwLock; use tokio::task::JoinSet; @@ -161,10 +162,10 @@ impl MemTable { let exec = MemoryExec::try_new(&data, schema.clone(), None)?; if let Some(num_partitions) = output_partitions { - let exec = RepartitionExec::try_new( + let exec = Arc::new(RepartitionExec::try_new( Arc::new(exec), Partitioning::RoundRobinBatch(num_partitions), - )?; + )?) as Arc; // execute and collect results let mut output_partitions = vec![]; diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index 8f010f1dcbf8..817f330097ac 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -152,7 +152,7 @@ impl ExecutionPlan for ArrowExec { let repartitioned_file_groups_option = FileGroupPartitioner::new() .with_target_partitions(target_partitions) .with_repartition_file_min_size(repartition_file_min_size) - .with_preserve_order_within_groups(self.output_ordering().is_some()) + .with_preserve_order_within_groups(self.cache.output_ordering().is_some()) .repartition_file_groups(&self.base_config.file_groups); if let Some(repartitioned_file_groups) = repartitioned_file_groups_option { diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index a509121a82c8..3066d77acf45 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -43,6 +43,7 @@ use datafusion_execution::TaskContext; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering}; use bytes::{Buf, Bytes}; +use datafusion_physical_plan::ExecutionPlanProperties; use futures::{ready, StreamExt, TryStreamExt}; use object_store::{GetOptions, GetResultPayload, ObjectStore}; use tokio::io::AsyncWriteExt; @@ -197,7 +198,7 @@ impl ExecutionPlan for CsvExec { let repartitioned_file_groups_option = FileGroupPartitioner::new() .with_target_partitions(target_partitions) - .with_preserve_order_within_groups(self.output_ordering().is_some()) + .with_preserve_order_within_groups(self.cache.output_ordering().is_some()) .with_repartition_file_min_size(repartition_file_min_size) .repartition_file_groups(&self.base_config.file_groups); diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index 7b0e84c4410b..c471035bd286 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -42,6 +42,7 @@ use datafusion_execution::TaskContext; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering}; use bytes::{Buf, Bytes}; +use datafusion_physical_plan::ExecutionPlanProperties; use futures::{ready, StreamExt, TryStreamExt}; use object_store::{self, GetOptions, GetResultPayload, ObjectStore}; use tokio::io::AsyncWriteExt; @@ -150,7 +151,7 @@ impl ExecutionPlan for NdJsonExec { config: &datafusion_common::config::ConfigOptions, ) -> Result>> { let repartition_file_min_size = config.optimizer.repartition_file_min_size; - let preserve_order_within_groups = self.output_ordering().is_some(); + let preserve_order_within_groups = self.cache.output_ordering().is_some(); let file_groups = &self.base_config.file_groups; let 
repartitioned_file_groups_option = FileGroupPartitioner::new() diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs index 997bdae762b8..e6962040e8ac 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs @@ -48,6 +48,7 @@ use arrow::error::ArrowError; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering, PhysicalExpr}; use bytes::Bytes; +use datafusion_physical_plan::ExecutionPlanProperties; use futures::future::BoxFuture; use futures::{StreamExt, TryStreamExt}; use itertools::Itertools; @@ -362,7 +363,7 @@ impl ExecutionPlan for ParquetExec { let repartitioned_file_groups_option = FileGroupPartitioner::new() .with_target_partitions(target_partitions) .with_repartition_file_min_size(repartition_file_min_size) - .with_preserve_order_within_groups(self.output_ordering().is_some()) + .with_preserve_order_within_groups(self.cache.output_ordering().is_some()) .repartition_file_groups(&self.base_config.file_groups); let mut new_plan = self.clone(); @@ -1557,7 +1558,7 @@ mod tests { expected_row_num: Option, file_schema: SchemaRef, ) -> Result<()> { - let parquet_exec = ParquetExec::new( + let parquet_exec = Arc::new(ParquetExec::new( FileScanConfig { object_store_url: ObjectStoreUrl::local_filesystem(), file_groups, @@ -1570,7 +1571,7 @@ mod tests { }, None, None, - ); + )) as Arc; assert_eq!(parquet_exec.output_partitioning().partition_count(), 1); let results = parquet_exec.execute(0, state.task_ctx())?.next().await; @@ -1685,7 +1686,10 @@ mod tests { None, None, ); - assert_eq!(parquet_exec.output_partitioning().partition_count(), 1); + assert_eq!( + parquet_exec.cache.output_partitioning().partition_count(), + 1 + ); assert_eq!(parquet_exec.schema().as_ref(), &expected_schema); let mut results = parquet_exec.execute(0, task_ctx)?; diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index c7ffc7838b36..c08aadc33c74 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -56,6 +56,7 @@ use datafusion_physical_expr::{ use datafusion_physical_plan::sorts::sort::SortExec; use datafusion_physical_plan::windows::{get_best_fitting_window, BoundedWindowAggExec}; +use datafusion_physical_plan::ExecutionPlanProperties; use itertools::izip; /// The `EnforceDistribution` rule ensures that distribution requirements are @@ -1404,6 +1405,7 @@ pub(crate) mod tests { // model that it requires the output ordering of its input fn required_input_ordering(&self) -> Vec>> { vec![self + .cache .output_ordering() .map(PhysicalSortRequirement::from_sort_exprs)] } diff --git a/datafusion/core/src/physical_optimizer/enforce_sorting.rs b/datafusion/core/src/physical_optimizer/enforce_sorting.rs index b459c86518b6..25280261c0a0 100644 --- a/datafusion/core/src/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/src/physical_optimizer/enforce_sorting.rs @@ -65,6 +65,7 @@ use datafusion_physical_expr::{PhysicalSortExpr, PhysicalSortRequirement}; use datafusion_physical_plan::repartition::RepartitionExec; use datafusion_physical_plan::sorts::partial_sort::PartialSortExec; +use datafusion_physical_plan::ExecutionPlanProperties; use itertools::izip; /// This rule inspects [`SortExec`]'s in the given physical plan and removes the @@ -390,7 +391,7 @@ fn 
analyze_immediate_sort_removal( // If this sort is unnecessary, we should remove it: if sort_input .equivalence_properties() - .ordering_satisfy(sort_exec.output_ordering().unwrap_or(&[])) + .ordering_satisfy(node.plan.output_ordering().unwrap_or(&[])) { node.plan = if !sort_exec.preserve_partitioning() && sort_input.output_partitioning().partition_count() > 1 @@ -573,7 +574,7 @@ fn remove_corresponding_sort_from_sub_plan( { node.plan = Arc::new(RepartitionExec::try_new( node.children[0].plan.clone(), - repartition.output_partitioning().clone(), + repartition.properties().output_partitioning().clone(), )?) as _; } }; diff --git a/datafusion/core/src/physical_optimizer/join_selection.rs b/datafusion/core/src/physical_optimizer/join_selection.rs index f74732305372..349e33dae251 100644 --- a/datafusion/core/src/physical_optimizer/join_selection.rs +++ b/datafusion/core/src/physical_optimizer/join_selection.rs @@ -42,6 +42,7 @@ use datafusion_common::{internal_err, DataFusionError, JoinSide, JoinType}; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::sort_properties::SortProperties; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; +use datafusion_physical_plan::ExecutionPlanProperties; /// The [`JoinSelection`] rule tries to modify a given plan so that it can /// accommodate infinite sources and optimize joins in the plan according to @@ -1377,6 +1378,7 @@ mod hash_join_tests { use arrow::record_batch::RecordBatch; use datafusion_common::utils::DataPtr; use datafusion_common::JoinType; + use datafusion_physical_plan::ExecutionPlanProperties; use std::sync::Arc; struct TestCase { diff --git a/datafusion/core/src/physical_optimizer/limited_distinct_aggregation.rs b/datafusion/core/src/physical_optimizer/limited_distinct_aggregation.rs index 9855247151b8..036c938c1ca6 100644 --- a/datafusion/core/src/physical_optimizer/limited_distinct_aggregation.rs +++ b/datafusion/core/src/physical_optimizer/limited_distinct_aggregation.rs @@ -25,6 +25,7 @@ use crate::physical_plan::ExecutionPlan; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::Result; +use datafusion_physical_plan::ExecutionPlanProperties; use itertools::Itertools; use std::sync::Arc; diff --git a/datafusion/core/src/physical_optimizer/output_requirements.rs b/datafusion/core/src/physical_optimizer/output_requirements.rs index 992a6e7f82c0..da0697eb9aba 100644 --- a/datafusion/core/src/physical_optimizer/output_requirements.rs +++ b/datafusion/core/src/physical_optimizer/output_requirements.rs @@ -33,7 +33,7 @@ use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{Result, Statistics}; use datafusion_physical_expr::{Distribution, LexRequirement, PhysicalSortRequirement}; use datafusion_physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; -use datafusion_physical_plan::PlanProperties; +use datafusion_physical_plan::{ExecutionPlanProperties, PlanProperties}; /// This rule either adds or removes [`OutputRequirements`]s to/from the physical /// plan according to its `mode` attribute, which is set by the constructors @@ -242,7 +242,7 @@ fn require_top_ordering_helper( if children.len() != 1 { Ok((plan, false)) } else if let Some(sort_exec) = plan.as_any().downcast_ref::() { - let req_ordering = sort_exec.output_ordering().unwrap_or(&[]); + let req_ordering = plan.output_ordering().unwrap_or(&[]); let req_dist = sort_exec.required_input_distribution()[0].clone(); let reqs = 
PhysicalSortRequirement::from_sort_exprs(req_ordering); Ok(( diff --git a/datafusion/core/src/physical_optimizer/pipeline_checker.rs b/datafusion/core/src/physical_optimizer/pipeline_checker.rs index 9a7afcb6409d..dcf9ef6f48b7 100644 --- a/datafusion/core/src/physical_optimizer/pipeline_checker.rs +++ b/datafusion/core/src/physical_optimizer/pipeline_checker.rs @@ -31,6 +31,7 @@ use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{plan_err, DataFusionError}; use datafusion_physical_expr::intervals::utils::{check_support, is_datatype_supported}; use datafusion_physical_plan::joins::SymmetricHashJoinExec; +use datafusion_physical_plan::ExecutionPlanProperties; /// The PipelineChecker rule rejects non-runnable query plans that use /// pipeline-breaking operators on infinite input(s). diff --git a/datafusion/core/src/physical_optimizer/projection_pushdown.rs b/datafusion/core/src/physical_optimizer/projection_pushdown.rs index 79d22374f9c2..1b9bf17bffa4 100644 --- a/datafusion/core/src/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/projection_pushdown.rs @@ -53,6 +53,7 @@ use datafusion_physical_expr::{ use datafusion_physical_plan::streaming::StreamingTableExec; use datafusion_physical_plan::union::UnionExec; +use datafusion_physical_plan::ExecutionPlanProperties; use itertools::Itertools; /// This rule inspects [`ProjectionExec`]'s in the given physical plan and tries to diff --git a/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs b/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs index 8825feb45e98..be4b7d13bc7e 100644 --- a/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs +++ b/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs @@ -32,6 +32,7 @@ use datafusion_common::tree_node::Transformed; use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion_physical_plan::tree_node::PlanContext; +use datafusion_physical_plan::ExecutionPlanProperties; use itertools::izip; /// For a given `plan`, this object carries the information one needs from its diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs index 0efa908cf5fc..a5b1f12147f3 100644 --- a/datafusion/core/src/physical_optimizer/sort_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/sort_pushdown.rs @@ -37,6 +37,7 @@ use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::{ LexRequirementRef, PhysicalSortExpr, PhysicalSortRequirement, }; +use datafusion_physical_plan::ExecutionPlanProperties; /// This is a "data class" we use within the [`EnforceSorting`] rule to push /// down [`SortExec`] in the plan. In some cases, we can reduce the total @@ -262,7 +263,7 @@ fn try_pushdown_requirements_to_join( &smj.maintains_input_order(), Some(probe_side), ); - let mut smj_eqs = smj.equivalence_properties().clone(); + let mut smj_eqs = smj.properties().equivalence_properties().clone(); // smj will have this ordering when its input changes. 
smj_eqs = smj_eqs.with_reorder(new_output_ordering.unwrap_or_default()); let should_pushdown = smj_eqs.ordering_satisfy_requirement(parent_required); diff --git a/datafusion/core/src/physical_optimizer/topk_aggregation.rs b/datafusion/core/src/physical_optimizer/topk_aggregation.rs index dd0261420304..7459deb1f72a 100644 --- a/datafusion/core/src/physical_optimizer/topk_aggregation.rs +++ b/datafusion/core/src/physical_optimizer/topk_aggregation.rs @@ -30,6 +30,7 @@ use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::Result; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::PhysicalSortExpr; +use datafusion_physical_plan::ExecutionPlanProperties; use itertools::Itertools; use std::sync::Arc; @@ -86,7 +87,7 @@ impl TopKAggregation { let children = sort.children(); let child = children.iter().exactly_one().ok()?; - let order = sort.output_ordering()?; + let order = plan.output_ordering()?; let order = order.iter().exactly_one().ok()?; let limit = sort.fetch()?; diff --git a/datafusion/core/src/physical_optimizer/utils.rs b/datafusion/core/src/physical_optimizer/utils.rs index 4f4b17345ef8..f4c2c3873f68 100644 --- a/datafusion/core/src/physical_optimizer/utils.rs +++ b/datafusion/core/src/physical_optimizer/utils.rs @@ -30,6 +30,7 @@ use crate::physical_plan::ExecutionPlan; use datafusion_physical_expr::{LexRequirement, PhysicalSortRequirement}; use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use datafusion_physical_plan::tree_node::PlanContext; +use datafusion_physical_plan::ExecutionPlanProperties; /// This utility function adds a `SortExec` above an operator according to the /// given ordering requirements while preserving the original partitioning. diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index debde6aa4571..b16cc77a43f2 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -96,6 +96,7 @@ use datafusion_physical_plan::placeholder_row::PlaceholderRowExec; use datafusion_sql::utils::window_expr_common_partition_keys; use async_trait::async_trait; +use datafusion_physical_plan::ExecutionPlanProperties; use futures::future::BoxFuture; use futures::{FutureExt, StreamExt, TryStreamExt}; use itertools::{multiunzip, Itertools}; diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 98d41cca6764..f96eacf45896 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -20,7 +20,7 @@ use std::any::Any; use std::sync::Arc; -use super::{DisplayAs, ExecutionMode, PlanProperties}; +use super::{DisplayAs, ExecutionMode, ExecutionPlanProperties, PlanProperties}; use crate::aggregates::{ no_grouping::AggregateStream, row_hash::GroupedHashAggregateStream, topk_stream::GroupedTopKAggregateStream, @@ -496,7 +496,7 @@ impl AggregateExec { return false; } // ensure there is no output ordering; can this rule be relaxed? 
- if self.output_ordering().is_some() { + if self.cache.output_ordering().is_some() { return false; } // ensure no ordering is required on the input diff --git a/datafusion/physical-plan/src/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs index f9db0a050cfc..8036012b7262 100644 --- a/datafusion/physical-plan/src/aggregates/row_hash.rs +++ b/datafusion/physical-plan/src/aggregates/row_hash.rs @@ -32,7 +32,7 @@ use crate::metrics::{BaselineMetrics, RecordOutput}; use crate::sorts::sort::{read_spill_as_stream, sort_batch}; use crate::sorts::streaming_merge; use crate::stream::RecordBatchStreamAdapter; -use crate::{aggregates, ExecutionPlan, PhysicalExpr}; +use crate::{aggregates, PhysicalExpr}; use crate::{RecordBatchStream, SendableRecordBatchStream}; use arrow::array::*; @@ -341,6 +341,7 @@ impl GroupedHashAggregateStream { .with_can_spill(true) .register(context.memory_pool()); let (ordering, _) = agg + .cache .equivalence_properties() .find_longest_permutation(&agg_group_by.output_exprs()); let group_ordering = GroupOrdering::try_new( diff --git a/datafusion/physical-plan/src/analyze.rs b/datafusion/physical-plan/src/analyze.rs index fed4b97d2afb..f771ac238887 100644 --- a/datafusion/physical-plan/src/analyze.rs +++ b/datafusion/physical-plan/src/analyze.rs @@ -21,7 +21,10 @@ use std::sync::Arc; use std::{any::Any, time::Instant}; use super::stream::{RecordBatchReceiverStream, RecordBatchStreamAdapter}; -use super::{DisplayAs, Distribution, PlanProperties, SendableRecordBatchStream}; +use super::{ + DisplayAs, Distribution, ExecutionPlanProperties, PlanProperties, + SendableRecordBatchStream, +}; use crate::display::DisplayableExecutionPlan; use crate::{DisplayFormatType, ExecutionPlan, Partitioning}; diff --git a/datafusion/physical-plan/src/coalesce_batches.rs b/datafusion/physical-plan/src/coalesce_batches.rs index 055f16288f95..0b9ecebbb1e8 100644 --- a/datafusion/physical-plan/src/coalesce_batches.rs +++ b/datafusion/physical-plan/src/coalesce_batches.rs @@ -24,7 +24,7 @@ use std::sync::Arc; use std::task::{Context, Poll}; use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; -use super::{DisplayAs, PlanProperties, Statistics}; +use super::{DisplayAs, ExecutionPlanProperties, PlanProperties, Statistics}; use crate::{ DisplayFormatType, ExecutionPlan, RecordBatchStream, SendableRecordBatchStream, }; diff --git a/datafusion/physical-plan/src/coalesce_partitions.rs b/datafusion/physical-plan/src/coalesce_partitions.rs index 7037445164a3..3c5b7e9c13fb 100644 --- a/datafusion/physical-plan/src/coalesce_partitions.rs +++ b/datafusion/physical-plan/src/coalesce_partitions.rs @@ -23,7 +23,10 @@ use std::sync::Arc; use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; use super::stream::{ObservedStream, RecordBatchReceiverStream}; -use super::{DisplayAs, PlanProperties, SendableRecordBatchStream, Statistics}; +use super::{ + DisplayAs, ExecutionPlanProperties, PlanProperties, SendableRecordBatchStream, + Statistics, +}; use crate::{DisplayFormatType, ExecutionPlan, Partitioning}; diff --git a/datafusion/physical-plan/src/common.rs b/datafusion/physical-plan/src/common.rs index 5172bc9b2a3c..003c60edd9a8 100644 --- a/datafusion/physical-plan/src/common.rs +++ b/datafusion/physical-plan/src/common.rs @@ -22,7 +22,7 @@ use std::fs::{metadata, File}; use std::path::{Path, PathBuf}; use std::sync::Arc; -use super::SendableRecordBatchStream; +use super::{ExecutionPlanProperties, SendableRecordBatchStream}; use 
crate::stream::RecordBatchReceiverStream; use crate::{ColumnStatistics, ExecutionPlan, Statistics}; diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index 95c09f541cc2..4155b00820f4 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -24,8 +24,8 @@ use std::sync::Arc; use std::task::{Context, Poll}; use super::{ - ColumnStatistics, DisplayAs, PlanProperties, RecordBatchStream, - SendableRecordBatchStream, Statistics, + ColumnStatistics, DisplayAs, ExecutionPlanProperties, PlanProperties, + RecordBatchStream, SendableRecordBatchStream, Statistics, }; use crate::{ metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}, diff --git a/datafusion/physical-plan/src/insert.rs b/datafusion/physical-plan/src/insert.rs index fd0bec108e03..7d44828ac243 100644 --- a/datafusion/physical-plan/src/insert.rs +++ b/datafusion/physical-plan/src/insert.rs @@ -23,8 +23,8 @@ use std::fmt::Debug; use std::sync::Arc; use super::{ - DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, - SendableRecordBatchStream, + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, Partitioning, + PlanProperties, SendableRecordBatchStream, }; use crate::metrics::MetricsSet; use crate::stream::RecordBatchStreamAdapter; diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs index f73f3d36cdb4..9f8dc0ce56b0 100644 --- a/datafusion/physical-plan/src/joins/cross_join.rs +++ b/datafusion/physical-plan/src/joins/cross_join.rs @@ -26,6 +26,7 @@ use super::utils::{ use crate::coalesce_batches::concat_batches; use crate::coalesce_partitions::CoalescePartitionsExec; use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet}; +use crate::ExecutionPlanProperties; use crate::{ execution_mode_from_children, ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, ExecutionMode, ExecutionPlan, PlanProperties, RecordBatchStream, diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs index 2fdb2a17ebe8..ee3438c6a363 100644 --- a/datafusion/physical-plan/src/joins/hash_join.rs +++ b/datafusion/physical-plan/src/joins/hash_join.rs @@ -27,6 +27,7 @@ use super::{ utils::{OnceAsync, OnceFut}, PartitionMode, }; +use crate::ExecutionPlanProperties; use crate::{ coalesce_partitions::CoalescePartitionsExec, execution_mode_from_children, handle_state, diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index 5d2175d4a820..6fe28c8b54f3 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -35,8 +35,8 @@ use crate::joins::utils::{ use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet}; use crate::{ execution_mode_from_children, DisplayAs, DisplayFormatType, Distribution, - ExecutionMode, ExecutionPlan, PlanProperties, RecordBatchStream, - SendableRecordBatchStream, + ExecutionMode, ExecutionPlan, ExecutionPlanProperties, PlanProperties, + RecordBatchStream, SendableRecordBatchStream, }; use arrow::array::{ diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs index bde831b731ba..7b70a2952b4c 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs @@ -38,8 +38,8 @@ use crate::joins::utils::{ use 
crate::metrics::{ExecutionPlanMetricsSet, MetricBuilder, MetricsSet};
 use crate::{
     execution_mode_from_children, metrics, DisplayAs, DisplayFormatType, Distribution,
-    ExecutionPlan, PhysicalExpr, PlanProperties, RecordBatchStream,
-    SendableRecordBatchStream, Statistics,
+    ExecutionPlan, ExecutionPlanProperties, PhysicalExpr, PlanProperties,
+    RecordBatchStream, SendableRecordBatchStream, Statistics,
 };
 
 use arrow::array::*;
@@ -189,16 +189,16 @@ impl SortMergeJoinExec {
         &self.on
     }
 
-    pub fn right(&self) -> &dyn ExecutionPlan {
-        self.right.as_ref()
+    pub fn right(&self) -> &Arc<dyn ExecutionPlan> {
+        &self.right
     }
 
     pub fn join_type(&self) -> JoinType {
         self.join_type
     }
 
-    pub fn left(&self) -> &dyn ExecutionPlan {
-        self.left.as_ref()
+    pub fn left(&self) -> &Arc<dyn ExecutionPlan> {
+        &self.left
     }
 
     /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs
index 77871a8b5483..6460b2b478ef 100644
--- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs
+++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs
@@ -50,8 +50,8 @@ use crate::{
     expressions::PhysicalSortExpr,
     joins::StreamJoinPartitionMode,
     metrics::{ExecutionPlanMetricsSet, MetricsSet},
-    DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, PlanProperties,
-    RecordBatchStream, SendableRecordBatchStream, Statistics,
+    DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties,
+    PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics,
 };
 
 use arrow::array::{
diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs
index e21a96cee2ff..e37f84fb0150 100644
--- a/datafusion/physical-plan/src/lib.rs
+++ b/datafusion/physical-plan/src/lib.rs
@@ -127,33 +127,6 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
     fn properties(&self) -> &PlanProperties;
 
-    /// Specifies how the output of this `ExecutionPlan` is split into
-    /// partitions.
-    fn output_partitioning(&self) -> &Partitioning {
-        &self.properties().partitioning
-    }
-
-    /// Specifies whether this plan generates an infinite stream of records.
-    /// If the plan does not support pipelining, but its input(s) are
-    /// infinite, returns an error to indicate this.
-    fn execution_mode(&self) -> ExecutionMode {
-        self.properties().exec_mode
-    }
-
-    /// If the output of this `ExecutionPlan` within each partition is sorted,
-    /// returns `Some(keys)` with the description of how it was sorted.
-    ///
-    /// For example, Sort, (obviously) produces sorted output as does
-    /// SortPreservingMergeStream. Less obviously `Projection`
-    /// produces sorted output if its input was sorted as it does not
-    /// reorder the input rows,
-    ///
-    /// It is safe to return `None` here if your `ExecutionPlan` does not
-    /// have any particular output order here
-    fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> {
-        self.properties().output_ordering.as_deref()
-    }
-
     /// Specifies the data distribution requirements for all the
     /// children for this `ExecutionPlan`, By default it's [[Distribution::UnspecifiedDistribution]] for each child,
     fn required_input_distribution(&self) -> Vec<Distribution> {
@@ -212,27 +185,6 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
             .collect()
     }
 
-    /// Get the [`EquivalenceProperties`] within the plan.
-    ///
-    /// Equivalence properties tell DataFusion what columns are known to be
-    /// equal, during various optimization passes. By default, this returns "no
-    /// known equivalences" which is always correct, but may cause DataFusion to
-    /// unnecessarily resort data.
-    ///
-    /// If this ExecutionPlan makes no changes to the schema of the rows flowing
-    /// through it or how columns within each row relate to each other, it
-    /// should return the equivalence properties of its input. For
-    /// example, since `FilterExec` may remove rows from its input, but does not
-    /// otherwise modify them, it preserves its input equivalence properties.
-    /// However, since `ProjectionExec` may calculate derived expressions, it
-    /// needs special handling.
-    ///
-    /// See also [`Self::maintains_input_order`] and [`Self::output_ordering`]
-    /// for related concepts.
-    fn equivalence_properties(&self) -> &EquivalenceProperties {
-        &self.properties().eq_properties
-    }
-
     /// Get a list of children `ExecutionPlan`s that act as inputs to this plan.
     /// The returned list will be empty for leaf nodes such as scans, will contain
     /// a single value for unary nodes, or two values for binary nodes (such as
@@ -450,6 +402,66 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
     }
 }
 
+pub trait ExecutionPlanProperties {
+    fn output_partitioning(&self) -> &Partitioning;
+
+    fn execution_mode(&self) -> ExecutionMode;
+
+    fn output_ordering(&self) -> Option<&[PhysicalSortExpr]>;
+
+    fn equivalence_properties(&self) -> &EquivalenceProperties;
+}
+
+impl ExecutionPlanProperties for Arc<dyn ExecutionPlan> {
+    /// Specifies how the output of this `ExecutionPlan` is split into
+    /// partitions.
+    fn output_partitioning(&self) -> &Partitioning {
+        self.properties().output_partitioning()
+    }
+
+    /// Specifies whether this plan generates an infinite stream of records.
+    /// If the plan does not support pipelining, but its input(s) are
+    /// infinite, returns an error to indicate this.
+    fn execution_mode(&self) -> ExecutionMode {
+        self.properties().execution_mode()
+    }
+
+    /// If the output of this `ExecutionPlan` within each partition is sorted,
+    /// returns `Some(keys)` with the description of how it was sorted.
+    ///
+    /// For example, Sort, (obviously) produces sorted output as does
+    /// SortPreservingMergeStream. Less obviously `Projection`
+    /// produces sorted output if its input was sorted as it does not
+    /// reorder the input rows,
+    ///
+    /// It is safe to return `None` here if your `ExecutionPlan` does not
+    /// have any particular output order here
+    fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> {
+        self.properties().output_ordering()
+    }
+
+    /// Get the [`EquivalenceProperties`] within the plan.
+    ///
+    /// Equivalence properties tell DataFusion what columns are known to be
+    /// equal, during various optimization passes. By default, this returns "no
+    /// known equivalences" which is always correct, but may cause DataFusion to
+    /// unnecessarily resort data.
+    ///
+    /// If this ExecutionPlan makes no changes to the schema of the rows flowing
+    /// through it or how columns within each row relate to each other, it
+    /// should return the equivalence properties of its input. For
+    /// example, since `FilterExec` may remove rows from its input, but does not
+    /// otherwise modify them, it preserves its input equivalence properties.
+    /// However, since `ProjectionExec` may calculate derived expressions, it
+    /// needs special handling.
+    ///
+    /// See also [`Self::maintains_input_order`] and [`Self::output_ordering`]
+    /// for related concepts.
+    fn equivalence_properties(&self) -> &EquivalenceProperties {
+        self.properties().equivalence_properties()
+    }
+}
+
 /// Describes the execution mode of an operator's resulting stream with respect
 /// to its size and behavior. There are three possible execution modes: `Bounded`,
 /// `Unbounded` and `PipelineBreaking`.
@@ -564,6 +576,22 @@ impl PlanProperties {
         self
     }
 
+    pub fn equivalence_properties(&self) -> &EquivalenceProperties {
+        &self.eq_properties
+    }
+
+    pub fn output_partitioning(&self) -> &Partitioning {
+        &self.partitioning
+    }
+
+    pub fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> {
+        self.output_ordering.as_deref()
+    }
+
+    pub fn execution_mode(&self) -> ExecutionMode {
+        self.exec_mode
+    }
+
     /// Get schema of the node.
     fn schema(&self) -> &SchemaRef {
         self.eq_properties.schema()
@@ -577,11 +605,8 @@ impl PlanProperties {
 /// 2. CoalescePartitionsExec for collapsing all of the partitions into one without ordering guarantee
 /// 3. SortPreservingMergeExec for collapsing all of the sorted partitions into one with ordering guarantee
 pub fn need_data_exchange(plan: Arc<dyn ExecutionPlan>) -> bool {
-    if let Some(repart) = plan.as_any().downcast_ref::<RepartitionExec>() {
-        !matches!(
-            repart.output_partitioning(),
-            Partitioning::RoundRobinBatch(_)
-        )
+    if let Some(_) = plan.as_any().downcast_ref::<RepartitionExec>() {
+        !matches!(plan.output_partitioning(), Partitioning::RoundRobinBatch(_))
     } else if let Some(coalesce) = plan.as_any().downcast_ref::<CoalescePartitionsExec>()
     {
         coalesce.input().output_partitioning().partition_count() > 1
@@ -652,7 +677,8 @@ pub fn execute_stream(
         1 => plan.execute(0, context),
         _ => {
             // merge into a single partition
-            let plan = CoalescePartitionsExec::new(plan.clone());
+            let plan = Arc::new(CoalescePartitionsExec::new(plan.clone()))
+                as Arc<dyn ExecutionPlan>;
             // CoalescePartitionsExec must produce a single partition
             assert_eq!(1, plan.output_partitioning().partition_count());
             plan.execute(0, context)
diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs
index 1c09ec88ae08..3520d45f9e6c 100644
--- a/datafusion/physical-plan/src/limit.rs
+++ b/datafusion/physical-plan/src/limit.rs
@@ -24,7 +24,7 @@ use std::task::{Context, Poll};
 
 use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
 use super::{
-    DisplayAs, ExecutionMode, PlanProperties, RecordBatchStream,
+    DisplayAs, ExecutionMode, ExecutionPlanProperties, PlanProperties, RecordBatchStream,
     SendableRecordBatchStream, Statistics,
 };
 use crate::{DisplayFormatType, Distribution, ExecutionPlan, Partitioning};
@@ -410,7 +410,7 @@ impl ExecutionPlan for LocalLimitExec {
             _ => Statistics {
                 // the result output row number will always be no greater than the limit number
                 num_rows: Precision::Inexact(
                    self.fetch * self.output_partitioning().partition_count(),
-                    self.fetch * self.output_partitioning().partition_count(),
+                    self.fetch * self.cache.output_partitioning().partition_count(),
                 ),
 
                 column_statistics: col_stats,
diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs
index 402feeaf80ba..8fe82e7de3eb 100644
--- a/datafusion/physical-plan/src/projection.rs
+++ b/datafusion/physical-plan/src/projection.rs
@@ -29,7 +29,8 @@ use std::task::{Context, Poll};
 use super::expressions::Column;
 use super::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
 use super::{
-    DisplayAs, PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics,
+    DisplayAs, ExecutionPlanProperties, PlanProperties, RecordBatchStream,
+
SendableRecordBatchStream, Statistics, }; use crate::{ ColumnStatistics, DisplayFormatType, ExecutionPlan, Partitioning, PhysicalExpr, diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 1cbc71471eb2..d1befb7c53c0 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -26,7 +26,9 @@ use std::{any::Any, vec}; use super::common::SharedMemoryReservation; use super::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}; -use super::{DisplayAs, RecordBatchStream, SendableRecordBatchStream}; +use super::{ + DisplayAs, ExecutionPlanProperties, RecordBatchStream, SendableRecordBatchStream, +}; use crate::common::{transpose, SpawnedTask}; use crate::hash_utils::create_hashes; use crate::metrics::BaselineMetrics; diff --git a/datafusion/physical-plan/src/sorts/partial_sort.rs b/datafusion/physical-plan/src/sorts/partial_sort.rs index 01e592d29f7b..500df6153fdb 100644 --- a/datafusion/physical-plan/src/sorts/partial_sort.rs +++ b/datafusion/physical-plan/src/sorts/partial_sort.rs @@ -61,8 +61,8 @@ use crate::expressions::PhysicalSortExpr; use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; use crate::sorts::sort::sort_batch; use crate::{ - DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, - PlanProperties, SendableRecordBatchStream, Statistics, + DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties, + Partitioning, PlanProperties, SendableRecordBatchStream, Statistics, }; use arrow::compute::concat_batches; diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index a96834cf0553..f46958663252 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -37,7 +37,8 @@ use crate::stream::{RecordBatchReceiverStream, RecordBatchStreamAdapter}; use crate::topk::TopK; use crate::{ DisplayAs, DisplayFormatType, Distribution, EmptyRecordBatchStream, ExecutionMode, - ExecutionPlan, Partitioning, PlanProperties, SendableRecordBatchStream, Statistics, + ExecutionPlan, ExecutionPlanProperties, Partitioning, PlanProperties, + SendableRecordBatchStream, Statistics, }; use arrow::compute::{concat_batches, lexsort_to_indices, take}; diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index 862146e10549..e1e197e205e2 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -25,8 +25,8 @@ use crate::expressions::PhysicalSortExpr; use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; use crate::sorts::streaming_merge; use crate::{ - DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, - PlanProperties, SendableRecordBatchStream, Statistics, + DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties, + Partitioning, PlanProperties, SendableRecordBatchStream, Statistics, }; use datafusion_common::{internal_err, DataFusionError, Result}; diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index a533f2249588..9f637361ff8f 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -29,8 +29,9 @@ use std::{any::Any, sync::Arc}; use super::{ execution_mode_from_children, metrics::{ExecutionPlanMetricsSet, 
MetricsSet}, - ColumnStatistics, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, - PlanProperties, RecordBatchStream, SendableRecordBatchStream, Statistics, + ColumnStatistics, DisplayAs, DisplayFormatType, ExecutionPlan, + ExecutionPlanProperties, Partitioning, PlanProperties, RecordBatchStream, + SendableRecordBatchStream, Statistics, }; use crate::metrics::BaselineMetrics; use crate::stream::ObservedStream; @@ -204,7 +205,7 @@ impl ExecutionPlan for UnionExec { // which is the "meet" of all input orderings. In this example, this // function will return vec![false, true, true], indicating that we // preserve the orderings for the 2nd and the 3rd children. - if let Some(output_ordering) = self.output_ordering() { + if let Some(output_ordering) = self.cache.output_ordering() { self.inputs() .iter() .map(|child| { diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index 72fcbfd2ffb9..776d98f5b6ea 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -20,7 +20,7 @@ use std::{any::Any, sync::Arc}; use super::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}; -use super::{DisplayAs, PlanProperties}; +use super::{DisplayAs, ExecutionPlanProperties, PlanProperties}; use crate::{ expressions::Column, DisplayFormatType, Distribution, ExecutionPlan, PhysicalExpr, RecordBatchStream, SendableRecordBatchStream, diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index eb4a27341785..c99ec599596d 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -35,8 +35,8 @@ use crate::windows::{ }; use crate::{ ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, - InputOrderMode, PlanProperties, RecordBatchStream, SendableRecordBatchStream, - Statistics, WindowExpr, + ExecutionPlanProperties, InputOrderMode, PlanProperties, RecordBatchStream, + SendableRecordBatchStream, Statistics, WindowExpr, }; use arrow::{ diff --git a/datafusion/physical-plan/src/windows/mod.rs b/datafusion/physical-plan/src/windows/mod.rs index 0e20c88619c2..c19694aef8b7 100644 --- a/datafusion/physical-plan/src/windows/mod.rs +++ b/datafusion/physical-plan/src/windows/mod.rs @@ -27,7 +27,7 @@ use crate::{ cume_dist, dense_rank, lag, lead, percent_rank, rank, Literal, NthValue, Ntile, PhysicalSortExpr, RowNumber, }, - udaf, ExecutionPlan, InputOrderMode, PhysicalExpr, + udaf, ExecutionPlan, ExecutionPlanProperties, InputOrderMode, PhysicalExpr, }; use arrow::datatypes::Schema; diff --git a/datafusion/physical-plan/src/windows/window_agg_exec.rs b/datafusion/physical-plan/src/windows/window_agg_exec.rs index efc84526c9ad..66d31ec94d1a 100644 --- a/datafusion/physical-plan/src/windows/window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/window_agg_exec.rs @@ -31,8 +31,8 @@ use crate::windows::{ }; use crate::{ ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, ExecutionMode, - ExecutionPlan, PhysicalExpr, PlanProperties, RecordBatchStream, - SendableRecordBatchStream, Statistics, WindowExpr, + ExecutionPlan, ExecutionPlanProperties, PhysicalExpr, PlanProperties, + RecordBatchStream, SendableRecordBatchStream, Statistics, WindowExpr, }; use arrow::array::ArrayRef; From ea30b93194cfdfe4148a45fc0e33549884ba81b1 Mon Sep 17 00:00:00 2001 From: Eddy Oyieko <67474838+mobley-trent@users.noreply.github.com> 
Date: Wed, 28 Feb 2024 01:37:46 +0300
Subject: [PATCH 33/45] feat : Support for deregistering user defined
 functions (#9239)

* Initial commit

* Updated mod.rs - Docstrings, Initial test

* Updated mod.rs - Fixed udf test

* Added udaf test, Updated udf test

* Added test for udwf

* Linting with rustfmt

* Update datafusion/core/src/execution/context/mod.rs

Co-authored-by: Andrew Lamb

* Moved tests to core/tests/user_defined

* fix fmt

---------

Co-authored-by: Andrew Lamb
---
 datafusion/core/src/execution/context/mod.rs | 33 +++++++++++++++++++
 .../user_defined/user_defined_aggregates.rs  | 23 +++++++++++++
 .../user_defined_scalar_functions.rs         | 16 +++++++++
 .../user_defined_window_functions.rs         | 15 +++++++++
 datafusion/execution/src/registry.rs         | 27 +++++++++++++++
 5 files changed, 114 insertions(+)

diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs
index 453a00a1a5cf..3aa4edfe3adc 100644
--- a/datafusion/core/src/execution/context/mod.rs
+++ b/datafusion/core/src/execution/context/mod.rs
@@ -849,6 +849,21 @@ impl SessionContext {
         self.state.write().register_udwf(Arc::new(f)).ok();
     }
 
+    /// Deregisters a UDF within this context.
+    pub fn deregister_udf(&self, name: &str) {
+        self.state.write().deregister_udf(name).ok();
+    }
+
+    /// Deregisters a UDAF within this context.
+    pub fn deregister_udaf(&self, name: &str) {
+        self.state.write().deregister_udaf(name).ok();
+    }
+
+    /// Deregisters a UDWF within this context.
+    pub fn deregister_udwf(&self, name: &str) {
+        self.state.write().deregister_udwf(name).ok();
+    }
+
     /// Creates a [`DataFrame`] for reading a data source.
     ///
     /// For more control such as reading multiple files, you can use
@@ -2026,6 +2041,24 @@ impl FunctionRegistry for SessionState {
     fn register_udwf(&mut self, udwf: Arc<WindowUDF>) -> Result<Option<Arc<WindowUDF>>> {
         Ok(self.window_functions.insert(udwf.name().into(), udwf))
     }
+
+    fn deregister_udf(&mut self, name: &str) -> Result<Option<Arc<ScalarUDF>>> {
+        let udf = self.scalar_functions.remove(name);
+        if let Some(udf) = &udf {
+            for alias in udf.aliases() {
+                self.scalar_functions.remove(alias);
+            }
+        }
+        Ok(udf)
+    }
+
+    fn deregister_udaf(&mut self, name: &str) -> Result<Option<Arc<AggregateUDF>>> {
+        Ok(self.aggregate_functions.remove(name))
+    }
+
+    fn deregister_udwf(&mut self, name: &str) -> Result<Option<Arc<WindowUDF>>> {
+        Ok(self.window_functions.remove(name))
+    }
 }
 
 impl OptimizerConfig for SessionState {
diff --git a/datafusion/core/tests/user_defined/user_defined_aggregates.rs b/datafusion/core/tests/user_defined/user_defined_aggregates.rs
index 0b29ad10d670..8daeefd236f7 100644
--- a/datafusion/core/tests/user_defined/user_defined_aggregates.rs
+++ b/datafusion/core/tests/user_defined/user_defined_aggregates.rs
@@ -255,6 +255,29 @@ async fn simple_udaf() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn deregister_udaf() -> Result<()> {
+    let ctx = SessionContext::new();
+    let my_avg = create_udaf(
+        "my_avg",
+        vec![DataType::Float64],
+        Arc::new(DataType::Float64),
+        Volatility::Immutable,
+        Arc::new(|_| Ok(Box::<AvgAccumulator>::default())),
+        Arc::new(vec![DataType::UInt64, DataType::Float64]),
+    );
+
+    ctx.register_udaf(my_avg.clone());
+
+    assert!(ctx.state().aggregate_functions().contains_key("my_avg"));
+
+    ctx.deregister_udaf("my_avg");
+
+    assert!(!ctx.state().aggregate_functions().contains_key("my_avg"));
+
+    Ok(())
+}
+
 #[tokio::test]
 async fn case_sensitive_identifiers_user_defined_aggregates() -> Result<()> {
     let ctx = SessionContext::new();
diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs
index 9812789740f7..a255498eb5f7 100644
--- a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs
+++ b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs
@@ -498,6 +498,22 @@ async fn test_user_defined_functions_zero_argument() -> Result<()> {
     Ok(())
 }
 
+#[tokio::test]
+async fn deregister_udf() -> Result<()> {
+    let random_normal_udf = ScalarUDF::from(RandomUDF::new());
+    let ctx = SessionContext::new();
+
+    ctx.register_udf(random_normal_udf.clone());
+
+    assert!(ctx.udfs().contains("random_udf"));
+
+    ctx.deregister_udf("random_udf");
+
+    assert!(!ctx.udfs().contains("random_udf"));
+
+    Ok(())
+}
+
 #[derive(Debug)]
 struct TakeUDF {
     signature: Signature,
diff --git a/datafusion/core/tests/user_defined/user_defined_window_functions.rs b/datafusion/core/tests/user_defined/user_defined_window_functions.rs
index 54eab4315a97..cfd74f8861e3 100644
--- a/datafusion/core/tests/user_defined/user_defined_window_functions.rs
+++ b/datafusion/core/tests/user_defined/user_defined_window_functions.rs
@@ -103,6 +103,21 @@ async fn test_udwf() {
     assert_eq!(test_state.evaluate_all_called(), 2);
 }
 
+#[tokio::test]
+async fn test_deregister_udwf() -> Result<()> {
+    let test_state = Arc::new(TestState::new());
+    let mut ctx = SessionContext::new();
+    OddCounter::register(&mut ctx, Arc::clone(&test_state));
+
+    assert!(ctx.state().window_functions().contains_key("odd_counter"));
+
+    ctx.deregister_udwf("odd_counter");
+
+    assert!(!ctx.state().window_functions().contains_key("odd_counter"));
+
+    Ok(())
+}
+
 /// Basic user defined window function with bounded window
 #[tokio::test]
 async fn test_udwf_bounded_window_ignores_frame() {
diff --git a/datafusion/execution/src/registry.rs b/datafusion/execution/src/registry.rs
index 4569967acb08..6e0a932f0bc5 100644
--- a/datafusion/execution/src/registry.rs
+++ b/datafusion/execution/src/registry.rs
@@ -66,6 +66,33 @@ pub trait FunctionRegistry {
     fn register_udwf(&mut self, _udaf: Arc<WindowUDF>) -> Result<Option<Arc<WindowUDF>>> {
         not_impl_err!("Registering WindowUDF")
     }
+
+    /// Deregisters a [`ScalarUDF`], returning the implementation that was
+    /// deregistered.
+    ///
+    /// Returns an error (the default) if the function can not be deregistered,
+    /// for example if the registry is read only.
+    fn deregister_udf(&mut self, _name: &str) -> Result<Option<Arc<ScalarUDF>>> {
+        not_impl_err!("Deregistering ScalarUDF")
+    }
+
+    /// Deregisters a [`AggregateUDF`], returning the implementation that was
+    /// deregistered.
+    ///
+    /// Returns an error (the default) if the function can not be deregistered,
+    /// for example if the registry is read only.
+    fn deregister_udaf(&mut self, _name: &str) -> Result<Option<Arc<AggregateUDF>>> {
+        not_impl_err!("Deregistering AggregateUDF")
+    }
+
+    /// Deregisters a [`WindowUDF`], returning the implementation that was
+    /// deregistered.
+    ///
+    /// Returns an error (the default) if the function can not be deregistered,
+    /// for example if the registry is read only.
+    fn deregister_udwf(&mut self, _name: &str) -> Result<Option<Arc<WindowUDF>>> {
+        not_impl_err!("Deregistering WindowUDF")
+    }
 }
 
 /// Serializer and deserializer registry for extensions like [UserDefinedLogicalNode].
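The API above is deliberately symmetric with registration: `SessionContext::deregister_udf` takes a write lock on the `SessionState` and calls the new `FunctionRegistry::deregister_udf` trait method, whose `SessionState` implementation removes the function (and any of its aliases) from the registry and returns the removed implementation. What follows is a minimal, self-contained sketch of the resulting user-facing flow, assuming the `datafusion` crate at this revision; the UDF name `my_identity` and its trivial body are invented for illustration and are not part of the patch.

use std::sync::Arc;

use datafusion::arrow::datatypes::DataType;
use datafusion::error::Result;
use datafusion::logical_expr::{create_udf, ColumnarValue, Volatility};
use datafusion::prelude::SessionContext;

fn main() -> Result<()> {
    let ctx = SessionContext::new();

    // A trivial scalar UDF (hypothetical, for illustration only) that
    // echoes its first argument back unchanged.
    let my_identity = create_udf(
        "my_identity",
        vec![DataType::Int32],
        Arc::new(DataType::Int32),
        Volatility::Immutable,
        Arc::new(|args: &[ColumnarValue]| Ok(args[0].clone())),
    );

    ctx.register_udf(my_identity);
    assert!(ctx.udfs().contains("my_identity"));

    // New with this patch: remove the function from the registry again.
    ctx.deregister_udf("my_identity");
    assert!(!ctx.udfs().contains("my_identity"));

    Ok(())
}

Note that the context-level wrappers discard the inner `Result` with `.ok()`, mirroring `register_udf`; code that needs the removed `ScalarUDF` back can call `deregister_udf` on the `SessionState` directly.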
From 544b3d9e8f7b9cca0a649b425b334baf6cae1dbd Mon Sep 17 00:00:00 2001
From: junxiangMu <63799833+guojidan@users.noreply.github.com>
Date: Wed, 28 Feb 2024 06:43:34 +0800
Subject: [PATCH 34/45] fix return type (#9357)

---
 datafusion/functions/src/core/nvl.rs       | 6 +-----
 datafusion/sqllogictest/test_files/nvl.slt | 2 +-
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/datafusion/functions/src/core/nvl.rs b/datafusion/functions/src/core/nvl.rs
index 6d6ad1cdeb21..caf095ecbbbd 100644
--- a/datafusion/functions/src/core/nvl.rs
+++ b/datafusion/functions/src/core/nvl.rs
@@ -73,11 +73,7 @@ impl ScalarUDFImpl for NVLFunc {
     }
 
     fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        // NVL has two args and they might get coerced, get a preview of this
-        let coerced_types = datafusion_expr::type_coercion::functions::data_types(arg_types, &self.signature);
-        coerced_types.map(|typs| typs[0].clone())
-            .map_err(|e| e.context("Failed to coerce arguments for NVL")
-        )
+        Ok(arg_types[0].clone())
     }
 
     fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
diff --git a/datafusion/sqllogictest/test_files/nvl.slt b/datafusion/sqllogictest/test_files/nvl.slt
index 81e79e1eb5b0..c77214cc302a 100644
--- a/datafusion/sqllogictest/test_files/nvl.slt
+++ b/datafusion/sqllogictest/test_files/nvl.slt
@@ -114,7 +114,7 @@ SELECT NVL(1, 3);
 ----
 1
 
-query I
+query ?
 SELECT NVL(NULL, NULL);
 ----
 NULL

From 77e5c35e9164ad99683ac41c04089145ae46a482 Mon Sep 17 00:00:00 2001
From: Mehmet Ozan Kabak
Date: Tue, 27 Feb 2024 15:57:44 -0800
Subject: [PATCH 35/45] Final review

---
 .../src/physical_optimizer/enforce_distribution.rs | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs
index c7ffc7838b36..a6d5aa763253 100644
--- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs
+++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs
@@ -443,10 +443,10 @@ where
             join_plan.plan = join_constructor((new_join_on, new_sort_options))?;
         }
     }
-    let mut requirements = join_plan;
-    requirements.children[0].data = left_keys;
-    requirements.children[1].data = right_keys;
-    Ok(requirements)
+
+    join_plan.children[0].data = left_keys;
+    join_plan.children[1].data = right_keys;
+    Ok(join_plan)
 }
 
 fn reorder_aggregate_keys(
@@ -1299,8 +1299,7 @@ pub(crate) mod tests {
     use crate::datasource::file_format::file_compression_type::FileCompressionType;
     use crate::datasource::listing::PartitionedFile;
     use crate::datasource::object_store::ObjectStoreUrl;
-    use crate::datasource::physical_plan::ParquetExec;
-    use crate::datasource::physical_plan::{CsvExec, FileScanConfig};
+    use crate::datasource::physical_plan::{CsvExec, FileScanConfig, ParquetExec};
     use crate::physical_optimizer::enforce_sorting::EnforceSorting;
     use crate::physical_optimizer::output_requirements::OutputRequirements;
     use crate::physical_optimizer::test_utils::{

From 935ebcae1c869875d08e665710ed27e2f83e0108 Mon Sep 17 00:00:00 2001
From: SteveLauC
Date: Wed, 28 Feb 2024 08:44:39 +0800
Subject: [PATCH 36/45] refactor: move acos() to function crate (#9297)

---
 datafusion-cli/Cargo.lock                     | 193 +++++++++---------
 datafusion/expr/src/built_in_function.rs      |  10 +-
 datafusion/expr/src/expr_fn.rs                |   2 -
 datafusion/functions/src/math/acos.rs         | 110 ++++++++++
 datafusion/functions/src/math/mod.rs          |  19 +-
 datafusion/functions/src/math/nans.rs         |   2 +-
 .../optimizer/src/analyzer/type_coercion.rs   |   4 +-
datafusion/physical-expr/src/functions.rs | 1 - datafusion/proto/proto/datafusion.proto | 2 +- datafusion/proto/src/generated/pbjson.rs | 3 - datafusion/proto/src/generated/prost.rs | 4 +- .../proto/src/logical_plan/from_proto.rs | 20 +- datafusion/proto/src/logical_plan/to_proto.rs | 1 - .../tests/cases/roundtrip_physical_plan.rs | 14 +- 14 files changed, 240 insertions(+), 145 deletions(-) create mode 100644 datafusion/functions/src/math/acos.rs diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 69456446f52b..416df5d17f25 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -25,9 +25,9 @@ checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" [[package]] name = "ahash" -version = "0.8.8" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42cd52102d3df161c77a887b608d7a4897d7cc112886a9537b738a887a03aaff" +checksum = "8b79b82693f705137f8fb9b37871d99e4f9a7df12b917eed79c3d3954830a60b" dependencies = [ "cfg-if", "const-random", @@ -345,9 +345,9 @@ dependencies = [ [[package]] name = "assert_cmd" -version = "2.0.13" +version = "2.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00ad3f3a942eee60335ab4342358c161ee296829e0d16ff42fc1d6cb07815467" +checksum = "ed72493ac66d5804837f480ab3766c72bdfab91a65e565fc54fa9e42db0073a8" dependencies = [ "anstyle", "bstr", @@ -384,7 +384,7 @@ checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -791,9 +791,9 @@ dependencies = [ [[package]] name = "bstr" -version = "1.9.0" +version = "1.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" +checksum = "05efc5cfd9110c8416e471df0e96702d58690178e206e61b7173706673c93706" dependencies = [ "memchr", "regex-automata", @@ -802,9 +802,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.15.0" +version = "3.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d32a994c2b3ca201d9b263612a374263f05e7adde37c4707f693dcd375076d1f" +checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b" [[package]] name = "byteorder" @@ -851,11 +851,10 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.83" +version = "1.0.88" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +checksum = "02f341c093d19155a6e41631ce5971aac4e9a868262212153124c15fa22d1cdc" dependencies = [ - "jobserver", "libc", ] @@ -875,7 +874,7 @@ dependencies = [ "iana-time-zone", "num-traits", "serde", - "windows-targets 0.52.0", + "windows-targets 0.52.3", ] [[package]] @@ -1069,12 +1068,12 @@ dependencies = [ [[package]] name = "ctor" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30d2b3721e861707777e3195b0158f950ae6dc4a27e4d02ff9f67e3eb3de199e" +checksum = "ad291aa74992b9b7a7e88c38acbbf6ad7e107f1d90ee8775b7bc1fc3394f485c" dependencies = [ "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -1607,7 +1606,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -1624,9 +1623,9 @@ checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" [[package]] name = 
"futures-timer" -version = "3.0.2" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" +checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" [[package]] name = "futures-util" @@ -1700,9 +1699,9 @@ dependencies = [ [[package]] name = "half" -version = "2.3.1" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" +checksum = "b5eceaaeec696539ddaf7b333340f1af35a5aa87ae3e4f3ead0532f72affab2e" dependencies = [ "cfg-if", "crunchy", @@ -1751,9 +1750,9 @@ dependencies = [ [[package]] name = "hermit-abi" -version = "0.3.6" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd5256b483761cd23699d0da46cc6fd2ee3be420bbe6d020ae4a091e70b7e9fd" +checksum = "379dada1584ad501b383485dd706b8afb7a70fcbc7f4da7d780638a5a6124a60" [[package]] name = "hex" @@ -1952,15 +1951,6 @@ version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" -[[package]] -name = "jobserver" -version = "0.1.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab46a6e9526ddef3ae7f787c06f0f2600639ba80ea3eade3d8e670a2230f51d6" -dependencies = [ - "libc", -] - [[package]] name = "js-sys" version = "0.3.68" @@ -2304,7 +2294,7 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi 0.3.6", + "hermit-abi 0.3.8", "libc", ] @@ -2336,7 +2326,7 @@ dependencies = [ "quick-xml", "rand", "reqwest", - "ring 0.17.7", + "ring 0.17.8", "rustls-pemfile 2.1.0", "serde", "serde_json", @@ -2524,7 +2514,7 @@ checksum = "266c042b60c9c76b8d53061e52b2e0d1116abc57cefc8c5cd671619a56ac3690" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -2800,16 +2790,17 @@ dependencies = [ [[package]] name = "ring" -version = "0.17.7" +version = "0.17.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "688c63d65483050968b2a8937f7995f443e27041a0f7700aa59b0822aedebb74" +checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" dependencies = [ "cc", + "cfg-if", "getrandom", "libc", "spin 0.9.8", "untrusted 0.9.0", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -2891,7 +2882,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d5a6813c0759e4609cd494e8e725babae6a2ca7b62a5536a13daaec6fcb7ba" dependencies = [ "log", - "ring 0.17.7", + "ring 0.17.8", "rustls-webpki", "sct", ] @@ -2929,9 +2920,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "048a63e5b3ac996d78d402940b5fa47973d2d080c6c6fffa1d0f19c4445310b7" +checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8" [[package]] name = "rustls-webpki" @@ -2939,7 +2930,7 @@ version = "0.101.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" dependencies = [ - "ring 0.17.7", + "ring 0.17.8", "untrusted 0.9.0", ] @@ -2974,9 +2965,9 @@ dependencies = [ [[package]] name = "ryu" -version = "1.0.16" +version = "1.0.17" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" +checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" [[package]] name = "same-file" @@ -3008,7 +2999,7 @@ version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" dependencies = [ - "ring 0.17.7", + "ring 0.17.8", "untrusted 0.9.0", ] @@ -3037,9 +3028,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.21" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97ed7a9823b74f99c7742f5336af7be5ecd3eeafcb1507d1fa93347b1d589b0" +checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca" [[package]] name = "seq-macro" @@ -3049,29 +3040,29 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.196" +version = "1.0.197" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "870026e60fa08c69f064aa766c10f10b1d62db9ccd4d0abb206472bee0ce3b32" +checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.196" +version = "1.0.197" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33c85360c95e7d137454dc81d9a4ed2b8efd8fbe19cee57357b32b9771fccb67" +checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] name = "serde_json" -version = "1.0.113" +version = "1.0.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69801b70b1c3dac963ecb03a364ba0ceda9cf60c71cfe475e99864759c8b8a79" +checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0" dependencies = [ "itoa", "ryu", @@ -3161,12 +3152,12 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "socket2" -version = "0.5.5" +version = "0.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9" +checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" dependencies = [ "libc", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -3199,7 +3190,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -3245,7 +3236,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -3258,7 +3249,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -3280,9 +3271,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.49" +version = "2.0.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "915aea9e586f80826ee59f8453c1101f9d1c4b3964cd2460185ee8e299ada496" +checksum = "6ab617d94515e94ae53b8406c628598680aa0c9587474ecbe58188f7b345d66c" dependencies = [ "proc-macro2", "quote", @@ -3318,9 +3309,9 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.10.0" +version = "3.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a365e8cd18e44762ef95d87f284f4b5cd04107fec2ff3052bd6a3e6069669e67" +checksum = 
"85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" dependencies = [ "cfg-if", "fastrand 2.0.1", @@ -3345,9 +3336,9 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" [[package]] name = "textwrap" -version = "0.16.0" +version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" +checksum = "23d434d3f8967a09480fb04132ebe0a3e088c173e6d0ee7897abbdf4eab0f8b9" [[package]] name = "thiserror" @@ -3366,7 +3357,7 @@ checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -3461,7 +3452,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -3558,7 +3549,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -3603,7 +3594,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] @@ -3626,9 +3617,9 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-normalization" -version = "0.1.22" +version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" dependencies = [ "tinyvec", ] @@ -3757,7 +3748,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", "wasm-bindgen-shared", ] @@ -3791,7 +3782,7 @@ checksum = "642f325be6301eb8107a83d12a8ac6c1e1c54345a7ef1a9261962dfefda09e66" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -3831,7 +3822,7 @@ version = "0.22.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed63aea5ce73d0ff405984102c42de94fc55a6b75765d621c65262469b3c9b53" dependencies = [ - "ring 0.17.7", + "ring 0.17.8", "untrusted 0.9.0", ] @@ -3872,7 +3863,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.52.0", + "windows-targets 0.52.3", ] [[package]] @@ -3890,7 +3881,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.0", + "windows-targets 0.52.3", ] [[package]] @@ -3910,17 +3901,17 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.0" +version = "0.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +checksum = "d380ba1dc7187569a8a9e91ed34b8ccfc33123bbacb8c0aed2d1ad7f3ef2dc5f" dependencies = [ - "windows_aarch64_gnullvm 0.52.0", - "windows_aarch64_msvc 0.52.0", - "windows_i686_gnu 0.52.0", - "windows_i686_msvc 0.52.0", - "windows_x86_64_gnu 0.52.0", - "windows_x86_64_gnullvm 0.52.0", - "windows_x86_64_msvc 0.52.0", + "windows_aarch64_gnullvm 0.52.3", + "windows_aarch64_msvc 0.52.3", + "windows_i686_gnu 0.52.3", 
+ "windows_i686_msvc 0.52.3", + "windows_x86_64_gnu 0.52.3", + "windows_x86_64_gnullvm 0.52.3", + "windows_x86_64_msvc 0.52.3", ] [[package]] @@ -3931,9 +3922,9 @@ checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.0" +version = "0.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" +checksum = "68e5dcfb9413f53afd9c8f86e56a7b4d86d9a2fa26090ea2dc9e40fba56c6ec6" [[package]] name = "windows_aarch64_msvc" @@ -3943,9 +3934,9 @@ checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.52.0" +version = "0.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" +checksum = "8dab469ebbc45798319e69eebf92308e541ce46760b49b18c6b3fe5e8965b30f" [[package]] name = "windows_i686_gnu" @@ -3955,9 +3946,9 @@ checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.52.0" +version = "0.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" +checksum = "2a4e9b6a7cac734a8b4138a4e1044eac3404d8326b6c0f939276560687a033fb" [[package]] name = "windows_i686_msvc" @@ -3967,9 +3958,9 @@ checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.52.0" +version = "0.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" +checksum = "28b0ec9c422ca95ff34a78755cfa6ad4a51371da2a5ace67500cf7ca5f232c58" [[package]] name = "windows_x86_64_gnu" @@ -3979,9 +3970,9 @@ checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.52.0" +version = "0.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" +checksum = "704131571ba93e89d7cd43482277d6632589b18ecf4468f591fbae0a8b101614" [[package]] name = "windows_x86_64_gnullvm" @@ -3991,9 +3982,9 @@ checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.0" +version = "0.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" +checksum = "42079295511643151e98d61c38c0acc444e52dd42ab456f7ccfd5152e8ecf21c" [[package]] name = "windows_x86_64_msvc" @@ -4003,9 +3994,9 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.52.0" +version = "0.52.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" +checksum = "0770833d60a970638e989b3fa9fd2bb1aaadcf88963d1659fd7d9990196ed2d6" [[package]] name = "winreg" @@ -4049,7 +4040,7 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.49", + "syn 2.0.51", ] [[package]] diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index 
cf1e73f780ad..e04106595876 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -42,8 +42,6 @@ use strum_macros::EnumIter; #[derive(Debug, Clone, PartialEq, Eq, Hash, EnumIter, Copy)] pub enum BuiltinScalarFunction { // math functions - /// acos - Acos, /// asin Asin, /// atan @@ -362,7 +360,6 @@ impl BuiltinScalarFunction { pub fn volatility(&self) -> Volatility { match self { // Immutable scalar builtins - BuiltinScalarFunction::Acos => Volatility::Immutable, BuiltinScalarFunction::Asin => Volatility::Immutable, BuiltinScalarFunction::Atan => Volatility::Immutable, BuiltinScalarFunction::Atan2 => Volatility::Immutable, @@ -873,8 +870,7 @@ impl BuiltinScalarFunction { utf8_to_int_type(&input_expr_types[0], "levenshtein") } - BuiltinScalarFunction::Acos - | BuiltinScalarFunction::Asin + BuiltinScalarFunction::Asin | BuiltinScalarFunction::Atan | BuiltinScalarFunction::Acosh | BuiltinScalarFunction::Asinh @@ -1346,8 +1342,7 @@ impl BuiltinScalarFunction { vec![Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, LargeUtf8])], self.volatility(), ), - BuiltinScalarFunction::Acos - | BuiltinScalarFunction::Asin + BuiltinScalarFunction::Asin | BuiltinScalarFunction::Atan | BuiltinScalarFunction::Acosh | BuiltinScalarFunction::Asinh @@ -1438,7 +1433,6 @@ impl BuiltinScalarFunction { /// Returns all names that can be used to call this function pub fn aliases(&self) -> &'static [&'static str] { match self { - BuiltinScalarFunction::Acos => &["acos"], BuiltinScalarFunction::Acosh => &["acosh"], BuiltinScalarFunction::Asin => &["asin"], BuiltinScalarFunction::Asinh => &["asinh"], diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 55bd40a18900..67bf39050d58 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -531,7 +531,6 @@ scalar_expr!(Sinh, sinh, num, "hyperbolic sine"); scalar_expr!(Cosh, cosh, num, "hyperbolic cosine"); scalar_expr!(Tanh, tanh, num, "hyperbolic tangent"); scalar_expr!(Asin, asin, num, "inverse sine"); -scalar_expr!(Acos, acos, num, "inverse cosine"); scalar_expr!(Atan, atan, num, "inverse tangent"); scalar_expr!(Asinh, asinh, num, "inverse hyperbolic sine"); scalar_expr!(Acosh, acosh, num, "inverse hyperbolic cosine"); @@ -1339,7 +1338,6 @@ mod test { test_unary_scalar_expr!(Cosh, cosh); test_unary_scalar_expr!(Tanh, tanh); test_unary_scalar_expr!(Asin, asin); - test_unary_scalar_expr!(Acos, acos); test_unary_scalar_expr!(Atan, atan); test_unary_scalar_expr!(Asinh, asinh); test_unary_scalar_expr!(Acosh, acosh); diff --git a/datafusion/functions/src/math/acos.rs b/datafusion/functions/src/math/acos.rs new file mode 100644 index 000000000000..22dfd37a0159 --- /dev/null +++ b/datafusion/functions/src/math/acos.rs @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Math function: `acos()`. + +use arrow::array::{ArrayRef, Float32Array, Float64Array}; +use arrow::datatypes::DataType; +use datafusion_common::{exec_err, plan_datafusion_err, DataFusionError, Result}; +use datafusion_expr::ColumnarValue; +use datafusion_expr::{ + utils::generate_signature_error_msg, ScalarUDFImpl, Signature, Volatility, +}; +use std::any::Any; +use std::sync::Arc; + +#[derive(Debug)] +pub struct AcosFunc { + signature: Signature, +} + +impl AcosFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::uniform( + 1, + vec![Float64, Float32], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for AcosFunc { + fn as_any(&self) -> &dyn Any { + self + } + fn name(&self) -> &str { + "acos" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> { + if arg_types.len() != 1 { + return Err(plan_datafusion_err!( + "{}", + generate_signature_error_msg( + self.name(), + self.signature().clone(), + arg_types, + ) + )); + } + + let arg_type = &arg_types[0]; + + match arg_type { + DataType::Float64 => Ok(DataType::Float64), + DataType::Float32 => Ok(DataType::Float32), + + // For other types (possible values null/int), use Float64 + _ => Ok(DataType::Float64), + } + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { + let args = ColumnarValue::values_to_arrays(args)?; + + let arr: ArrayRef = match args[0].data_type() { + DataType::Float64 => Arc::new(make_function_scalar_inputs_return_type!( + &args[0], + self.name(), + Float64Array, + Float64Array, + { f64::acos } + )), + DataType::Float32 => Arc::new(make_function_scalar_inputs_return_type!( + &args[0], + self.name(), + Float32Array, + Float32Array, + { f32::acos } + )), + other => { + return exec_err!( + "Unsupported data type {other:?} for function {}", + self.name() + ) + } + }; + Ok(ColumnarValue::Array(arr)) + } +} diff --git a/datafusion/functions/src/math/mod.rs b/datafusion/functions/src/math/mod.rs index 9d13103ef23f..65752a0fe4d9 100644 --- a/datafusion/functions/src/math/mod.rs +++ b/datafusion/functions/src/math/mod.rs @@ -17,15 +17,26 @@ //! "math" DataFusion functions -mod nans; mod abs; +mod acos; +mod nans; // create UDFs make_udf_function!(nans::IsNanFunc, ISNAN, isnan); make_udf_function!(abs::AbsFunc, ABS, abs); +make_udf_function!(acos::AcosFunc, ACOS, acos); // Export the functions out of this package, both as expr_fn as well as a list of functions export_functions!( - (isnan, num, "returns true if a given number is +NaN or -NaN otherwise returns false"), - (abs, num, "returns the absolute value of a given number") -); \ No newline at end of file + ( + isnan, + num, + "returns true if a given number is +NaN or -NaN otherwise returns false" + ), + (abs, num, "returns the absolute value of a given number"), + ( + acos, + num, + "returns the arc cosine or inverse cosine of a number" + ) +); diff --git a/datafusion/functions/src/math/nans.rs b/datafusion/functions/src/math/nans.rs index c7868e6d5eca..96a50f9aa568 100644 --- a/datafusion/functions/src/math/nans.rs +++ b/datafusion/functions/src/math/nans.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Encoding expressions +//! Math function: `isnan()`. use arrow::datatypes::DataType; use datafusion_common::{exec_err, DataFusionError, Result};
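For illustration, a minimal sketch of how a UDF ported in this style can be exercised directly, assuming `AcosFunc` is in scope (the `acos` module is private, so external callers would normally go through `register_all` or the `expr_fn` API instead); `ScalarUDF::new_from_impl` and `ScalarUDF::invoke` are existing `datafusion-expr` APIs, and the argument value is illustrative only:

use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{ColumnarValue, ScalarUDF};

fn call_acos_directly() -> Result<ColumnarValue> {
    // Wrap the ScalarUDFImpl so it can be registered with a FunctionRegistry
    // or, as here, invoked directly.
    let acos = ScalarUDF::new_from_impl(AcosFunc::new());
    // One Float64 argument satisfies the uniform(1, vec![Float64, Float32]) signature.
    let args = vec![ColumnarValue::Scalar(ScalarValue::Float64(Some(0.5)))];
    // The invoke() implementation above converts scalar inputs to arrays, so the
    // result is a one-element Float64 array holding f64::acos(0.5).
    acos.invoke(&args)
}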
diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 8cdb4d7dbdf6..3a43e3cd7c20 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -889,14 +889,14 @@ mod test { // test that automatic argument type coercion for scalar functions work let empty = empty(); let lit_expr = lit(10i64); - let fun: BuiltinScalarFunction = BuiltinScalarFunction::Acos; + let fun: BuiltinScalarFunction = BuiltinScalarFunction::Floor; let scalar_function_expr = Expr::ScalarFunction(ScalarFunction::new(fun, vec![lit_expr])); let plan = LogicalPlan::Projection(Projection::try_new( vec![scalar_function_expr], empty, )?); - let expected = "Projection: acos(CAST(Int64(10) AS Float64))\n EmptyRelation"; + let expected = "Projection: floor(CAST(Int64(10) AS Float64))\n EmptyRelation"; assert_analyzed_plan_eq(Arc::new(TypeCoercion::new()), &plan, expected) } diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index c91b96d67a22..af079dbd2d12 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -261,7 +261,6 @@ pub fn create_physical_fun( ) -> Result<ScalarFunctionImplementation> { Ok(match fun { // math functions - BuiltinScalarFunction::Acos => Arc::new(math_expressions::acos), BuiltinScalarFunction::Asin => Arc::new(math_expressions::asin), BuiltinScalarFunction::Atan => Arc::new(math_expressions::atan), BuiltinScalarFunction::Acosh => Arc::new(math_expressions::acosh), diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index d91373f8f8d2..2d729ffc5b3e 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -548,7 +548,7 @@ enum ScalarFunction { // 0 was Abs before // The first enum value must be zero for open enums unknown = 0; - Acos = 1; + // 1 was Acos Asin = 2; Atan = 3; Ascii = 4; diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 964b8890184c..5f05b8546f68 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22322,7 +22322,6 @@ impl serde::Serialize for ScalarFunction { { let variant = match self { Self::Unknown => "unknown", - Self::Acos => "Acos", Self::Asin => "Asin", Self::Atan => "Atan", Self::Ascii => "Ascii", @@ -22465,7 +22464,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { { const FIELDS: &[&str] = &[ "unknown", - "Acos", "Asin", "Atan", "Ascii", @@ -22637,7 +22635,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { { match value { "unknown" => Ok(ScalarFunction::Unknown), - "Acos" => Ok(ScalarFunction::Acos), "Asin" => Ok(ScalarFunction::Asin), "Atan" => Ok(ScalarFunction::Atan), "Ascii" => Ok(ScalarFunction::Ascii), diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 292aef4402a2..252089d5c14d 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2636,7 +2636,7 @@ pub enum ScalarFunction { /// 0 was Abs before /// The first enum value must be zero for open enums Unknown = 0, - Acos = 1, + /// 1 was Acos Asin = 2, Atan = 3, Ascii = 4, @@ -2779,7 +2779,6 @@ impl ScalarFunction { pub fn as_str_name(&self) -> &'static str { match self { ScalarFunction::Unknown => "unknown", - ScalarFunction::Acos => "Acos", ScalarFunction::Asin => "Asin", ScalarFunction::Atan => "Atan", ScalarFunction::Ascii => "Ascii", @@ -2916,7 +2915,6 @@ impl ScalarFunction { pub fn from_str_name(value: &str) -> ::core::option::Option<Self> { match value { "unknown" => Some(Self::Unknown), - "Acos" => Some(Self::Acos), "Asin" => Some(Self::Asin), "Atan" => Some(Self::Atan), "Ascii" => Some(Self::Ascii), diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 69114fd74595..acfa043b88af 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -47,15 +47,15 @@ use datafusion_common::{ use datafusion_expr::expr::Unnest; use datafusion_expr::window_frame::{check_window_frame, regularize_window_order_by}; use datafusion_expr::{ - acos, acosh, array, array_append, array_concat, array_dims, array_distinct, - array_element, array_empty, array_except, array_has, array_has_all, array_has_any, - array_intersect, array_length, array_ndims, array_pop_back, array_pop_front, - array_position, array_positions, array_prepend, array_remove, array_remove_all, - array_remove_n, array_repeat, array_replace, array_replace_all, array_replace_n, - array_resize, array_slice, array_sort, array_union, arrow_typeof, ascii, asin, asinh, - atan, atan2, atanh, bit_length, btrim, cardinality, cbrt, ceil, character_length, - chr, coalesce, concat_expr, concat_ws_expr, cos, cosh, cot, current_date, - current_time, date_bin, date_part, date_trunc, degrees, digest, ends_with, exp, + acosh, array, array_append, array_concat, array_dims, array_distinct, array_element, + array_empty, array_except, array_has, array_has_all, array_has_any, array_intersect, + array_length, array_ndims, array_pop_back, array_pop_front, array_position, + array_positions, array_prepend, array_remove, array_remove_all, array_remove_n, + array_repeat, array_replace, array_replace_all, array_replace_n, array_resize, + array_slice, array_sort, array_union, arrow_typeof, ascii, asin, asinh, atan, atan2, + atanh, bit_length, btrim, cardinality, cbrt, ceil, character_length, chr, coalesce, + concat_expr, concat_ws_expr, cos, cosh, cot, current_date, current_time, date_bin, + date_part, date_trunc, degrees, digest, ends_with, exp, expr::{self, InList, Sort, WindowFunction}, factorial, find_in_set, flatten, floor, from_unixtime, gcd, gen_range, initcap, instr, iszero, lcm, left, levenshtein, ln, log, log10, log2, @@ -450,7 +450,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Tan => Self::Tan, ScalarFunction::Cot => Self::Cot, ScalarFunction::Asin => Self::Asin, - ScalarFunction::Acos => Self::Acos, ScalarFunction::Atan => Self::Atan, ScalarFunction::Sinh => Self::Sinh, ScalarFunction::Cosh => Self::Cosh, @@ -1362,7 +1361,6 @@ pub fn parse_expr( match scalar_function { ScalarFunction::Unknown => Err(proto_error("Unknown scalar function")), ScalarFunction::Asin => Ok(asin(parse_expr(&args[0], registry)?)), - ScalarFunction::Acos => Ok(acos(parse_expr(&args[0], registry)?)), ScalarFunction::Asinh => Ok(asinh(parse_expr(&args[0], registry)?)), ScalarFunction::Acosh => Ok(acosh(parse_expr(&args[0], registry)?)), ScalarFunction::Array => Ok(array( diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 9603df209ce4..d19830db98ce 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1432,7 +1432,6 @@ impl
TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Cosh => Self::Cosh, BuiltinScalarFunction::Tanh => Self::Tanh, BuiltinScalarFunction::Asin => Self::Asin, - BuiltinScalarFunction::Acos => Self::Acos, BuiltinScalarFunction::Atan => Self::Atan, BuiltinScalarFunction::Asinh => Self::Asinh, BuiltinScalarFunction::Acosh => Self::Acosh, diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index e44f1863891a..7df22e01469b 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -16,10 +16,6 @@ // under the License. use arrow::csv::WriterBuilder; -use std::ops::Deref; -use std::sync::Arc; -use std::vec; - use datafusion::arrow::array::ArrayRef; use datafusion::arrow::compute::kernels::sort::SortOptions; use datafusion::arrow::datatypes::{DataType, Field, Fields, IntervalUnit, Schema}; @@ -52,6 +48,7 @@ use datafusion::physical_plan::expressions::{ StringAgg, Sum, }; use datafusion::physical_plan::filter::FilterExec; +use datafusion::physical_plan::functions; use datafusion::physical_plan::insert::FileSinkExec; use datafusion::physical_plan::joins::{ HashJoinExec, NestedLoopJoinExec, PartitionMode, StreamJoinPartitionMode, @@ -66,7 +63,7 @@ use datafusion::physical_plan::windows::{ BuiltInWindowExpr, PlainAggregateWindowExpr, WindowAggExec, }; use datafusion::physical_plan::{ - functions, udaf, AggregateExpr, ExecutionPlan, Partitioning, PhysicalExpr, Statistics, + udaf, AggregateExpr, ExecutionPlan, Partitioning, PhysicalExpr, Statistics, }; use datafusion::prelude::SessionContext; use datafusion::scalar::ScalarValue; @@ -82,6 +79,9 @@ use datafusion_expr::{ }; use datafusion_proto::physical_plan::{AsExecutionPlan, DefaultPhysicalExtensionCodec}; use datafusion_proto::protobuf; +use std::ops::Deref; +use std::sync::Arc; +use std::vec; /// Perform a serde roundtrip and assert that the string representation of the before and after plans /// are identical. 
Note that this often isn't sufficient to guarantee that no information is @@ -600,10 +600,10 @@ fn roundtrip_builtin_scalar_function() -> Result<()> { let execution_props = ExecutionProps::new(); let fun_expr = - functions::create_physical_fun(&BuiltinScalarFunction::Acos, &execution_props)?; + functions::create_physical_fun(&BuiltinScalarFunction::Sin, &execution_props)?; let expr = ScalarFunctionExpr::new( - "acos", + "sin", fun_expr, vec![col("a", &schema)?], DataType::Float64, From fa8508e72fe7a9cbbbdd3f641205195e202366c8 Mon Sep 17 00:00:00 2001 From: SteveLauC Date: Wed, 28 Feb 2024 09:02:14 +0800 Subject: [PATCH 37/45] docs: put flatten in top fn list (#9376) --- docs/source/user-guide/sql/scalar_functions.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index d4eb5944ad09..41b5a354abc1 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -1956,6 +1956,7 @@ from_unixtime(expression) - [array_to_string](#array_to_string) - [cardinality](#cardinality) - [empty](#empty) +- [flatten](#flatten) - [generate_series](#generate_series) - [list_append](#list_append) - [list_sort](#list_sort) From 32d906fc9622af3a67b3828700272092fe0982a0 Mon Sep 17 00:00:00 2001 From: Clide S <109172241+Monkwire3@users.noreply.github.com> Date: Tue, 27 Feb 2024 21:10:15 -0500 Subject: [PATCH 38/45] Update list_to_string alias to point to array_to_string (#9374) --- docs/source/user-guide/sql/scalar_functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 41b5a354abc1..38da3fd74c26 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -3030,7 +3030,7 @@ _Alias of [array_slice](#array_slice)._ ### `list_to_string` -_Alias of [list_to_string](#list_to_string)._ +_Alias of [array_to_string](#array_to_string)._ ### `make_array` From 930ac87d547f38b07be7cc4f1b2b5f934ca4e9c5 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 28 Feb 2024 09:33:39 +0300 Subject: [PATCH 39/45] Resolve linter errors --- datafusion-cli/src/exec.rs | 2 +- datafusion/core/benches/sort.rs | 1 + datafusion/core/src/datasource/memory.rs | 6 ++--- .../datasource/physical_plan/arrow_file.rs | 4 +++- .../core/src/datasource/physical_plan/avro.rs | 24 ++++++++++++++++--- .../core/src/datasource/physical_plan/csv.rs | 6 +++-- .../core/src/datasource/physical_plan/json.rs | 4 ++-- .../datasource/physical_plan/parquet/mod.rs | 18 ++++++++++---- .../enforce_distribution.rs | 4 ++-- .../src/physical_optimizer/enforce_sorting.rs | 4 ++-- .../physical_optimizer/output_requirements.rs | 2 +- .../physical_optimizer/projection_pushdown.rs | 2 +- .../replace_with_order_preserving_variants.rs | 2 +- .../physical_optimizer/topk_aggregation.rs | 3 +-- datafusion/core/src/physical_planner.rs | 2 +- .../physical-plan/src/aggregates/mod.rs | 2 +- .../physical-plan/src/aggregates/row_hash.rs | 4 ++-- .../physical-plan/src/coalesce_partitions.rs | 5 +++- datafusion/physical-plan/src/display.rs | 4 ---- .../physical-plan/src/joins/test_utils.rs | 2 +- datafusion/physical-plan/src/lib.rs | 18 +++++++------- datafusion/physical-plan/src/limit.rs | 3 ++- datafusion/physical-plan/src/memory.rs | 9 ++++--- datafusion/physical-plan/src/stream.rs | 2 +- datafusion/physical-plan/src/union.rs | 12 +++++++--- 25 files changed, 92 
insertions(+), 53 deletions(-) diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs index 7e46f4a513fb..59581e91e857 100644 --- a/datafusion-cli/src/exec.rs +++ b/datafusion-cli/src/exec.rs @@ -36,7 +36,7 @@ use datafusion::datasource::listing::ListingTableUrl; use datafusion::error::{DataFusionError, Result}; use datafusion::logical_expr::dml::CopyTo; use datafusion::logical_expr::{CreateExternalTable, DdlStatement, LogicalPlan}; -use datafusion::physical_plan::{collect, execute_stream}; +use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties}; use datafusion::prelude::SessionContext; use datafusion::sql::parser::{DFParser, Statement}; use datafusion::sql::sqlparser::dialect::dialect_from_str; diff --git a/datafusion/core/benches/sort.rs b/datafusion/core/benches/sort.rs index fbb94d66db58..34b4a5ebf0dc 100644 --- a/datafusion/core/benches/sort.rs +++ b/datafusion/core/benches/sort.rs @@ -95,6 +95,7 @@ use rand::{Rng, SeedableRng}; use tokio::runtime::Runtime; use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; +use datafusion_physical_plan::ExecutionPlanProperties; /// Total number of streams to divide each input into /// models 8 partition plan (should it be 16??) diff --git a/datafusion/core/src/datasource/memory.rs b/datafusion/core/src/datasource/memory.rs index b4a51be264eb..1ac492a76040 100644 --- a/datafusion/core/src/datasource/memory.rs +++ b/datafusion/core/src/datasource/memory.rs @@ -162,14 +162,14 @@ impl MemTable { let exec = MemoryExec::try_new(&data, schema.clone(), None)?; if let Some(num_partitions) = output_partitions { - let exec = Arc::new(RepartitionExec::try_new( + let exec = RepartitionExec::try_new( Arc::new(exec), Partitioning::RoundRobinBatch(num_partitions), - )?) 
as Arc<dyn ExecutionPlan>; + )?; // execute and collect results let mut output_partitions = vec![]; - for i in 0..exec.output_partitioning().partition_count() { + for i in 0..exec.properties().output_partitioning().partition_count() { // execute this *output* partition and collect all batches let task_ctx = state.task_ctx(); let mut stream = exec.execute(i, task_ctx)?; diff --git a/datafusion/core/src/datasource/physical_plan/arrow_file.rs b/datafusion/core/src/datasource/physical_plan/arrow_file.rs index 817f330097ac..82774a6e831c 100644 --- a/datafusion/core/src/datasource/physical_plan/arrow_file.rs +++ b/datafusion/core/src/datasource/physical_plan/arrow_file.rs @@ -152,7 +152,9 @@ impl ExecutionPlan for ArrowExec { let repartitioned_file_groups_option = FileGroupPartitioner::new() .with_target_partitions(target_partitions) .with_repartition_file_min_size(repartition_file_min_size) - .with_preserve_order_within_groups(self.cache.output_ordering().is_some()) + .with_preserve_order_within_groups( + self.properties().output_ordering().is_some(), + ) .repartition_file_groups(&self.base_config.file_groups); if let Some(repartitioned_file_groups) = repartitioned_file_groups_option { diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index 2b913d862576..6e7dcf39069c 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -279,7 +279,13 @@ mod tests { table_partition_cols: vec![], output_ordering: vec![], }); - assert_eq!(avro_exec.output_partitioning().partition_count(), 1); + assert_eq!( + avro_exec + .properties() + .output_partitioning() + .partition_count(), + 1 + ); let mut results = avro_exec .execute(0, state.task_ctx()) .expect("plan execution failed"); @@ -350,7 +356,13 @@ mod tests { table_partition_cols: vec![], output_ordering: vec![], }); - assert_eq!(avro_exec.output_partitioning().partition_count(), 1); + assert_eq!( + avro_exec + .properties() + .output_partitioning() + .partition_count(), + 1 + ); let mut results = avro_exec .execute(0, state.task_ctx()) @@ -420,7 +432,13 @@ mod tests { table_partition_cols: vec![Field::new("date", DataType::Utf8, false)], output_ordering: vec![], }); - assert_eq!(avro_exec.output_partitioning().partition_count(), 1); + assert_eq!( + avro_exec + .properties() + .output_partitioning() + .partition_count(), + 1 + ); let mut results = avro_exec .execute(0, state.task_ctx()) diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index 3066d77acf45..05a83e8ac0b7 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -41,9 +41,9 @@ use arrow::datatypes::SchemaRef; use datafusion_common::config::ConfigOptions; use datafusion_execution::TaskContext; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering}; +use datafusion_physical_plan::ExecutionPlanProperties; use bytes::{Buf, Bytes}; -use datafusion_physical_plan::ExecutionPlanProperties; use futures::{ready, StreamExt, TryStreamExt}; use object_store::{GetOptions, GetResultPayload, ObjectStore}; use tokio::io::AsyncWriteExt; @@ -198,7 +198,9 @@ impl ExecutionPlan for CsvExec { let repartitioned_file_groups_option = FileGroupPartitioner::new() .with_target_partitions(target_partitions) - .with_preserve_order_within_groups(self.cache.output_ordering().is_some()) + .with_preserve_order_within_groups( + self.properties().output_ordering().is_some(), + ) .with_repartition_file_min_size(repartition_file_min_size) .repartition_file_groups(&self.base_config.file_groups); diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index c471035bd286..6f9af2e6abcf 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -40,9 +40,9 @@ use arrow::json::ReaderBuilder; use arrow::{datatypes::SchemaRef, json}; use datafusion_execution::TaskContext; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering}; +use datafusion_physical_plan::ExecutionPlanProperties; use bytes::{Buf, Bytes}; -use datafusion_physical_plan::ExecutionPlanProperties; use futures::{ready, StreamExt, TryStreamExt}; use object_store::{self, GetOptions, GetResultPayload, ObjectStore}; use tokio::io::AsyncWriteExt; @@ -151,7 +151,7 @@ impl ExecutionPlan for NdJsonExec { config: &datafusion_common::config::ConfigOptions, ) -> Result<Option<Arc<dyn ExecutionPlan>>> { let repartition_file_min_size = config.optimizer.repartition_file_min_size; - let preserve_order_within_groups = self.cache.output_ordering().is_some(); + let preserve_order_within_groups = self.properties().output_ordering().is_some(); let file_groups = &self.base_config.file_groups; let repartitioned_file_groups_option = FileGroupPartitioner::new() diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs index e6962040e8ac..5ccffde26359 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs @@ -46,9 +46,9 @@ use crate::{ use arrow::datatypes::{DataType, SchemaRef}; use arrow::error::ArrowError; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering, PhysicalExpr}; +use datafusion_physical_plan::ExecutionPlanProperties; use bytes::Bytes; -use datafusion_physical_plan::ExecutionPlanProperties; use futures::future::BoxFuture; use futures::{StreamExt, TryStreamExt}; use itertools::Itertools; @@ -363,7 +363,9 @@ impl ExecutionPlan for ParquetExec { let repartitioned_file_groups_option = FileGroupPartitioner::new() .with_target_partitions(target_partitions) .with_repartition_file_min_size(repartition_file_min_size) - .with_preserve_order_within_groups(self.cache.output_ordering().is_some()) + .with_preserve_order_within_groups( + self.properties().output_ordering().is_some(), + ) .repartition_file_groups(&self.base_config.file_groups); let mut new_plan = self.clone(); @@ -1558,7 +1560,7 @@ mod tests { expected_row_num: Option<usize>, file_schema: SchemaRef, ) -> Result<()> { - let parquet_exec = Arc::new(ParquetExec::new( + let parquet_exec = ParquetExec::new( FileScanConfig { object_store_url: ObjectStoreUrl::local_filesystem(), file_groups, @@ -1571,8 +1573,14 @@ }, None, None, - )) as Arc<dyn ExecutionPlan>; + ); assert_eq!( + parquet_exec + .properties() + .output_partitioning() + .partition_count(), + 1 + ); let results = parquet_exec.execute(0, state.task_ctx())?.next().await; if let Some(expected_row_num) = expected_row_num { diff --git a/datafusion/core/src/physical_optimizer/enforce_distribution.rs b/datafusion/core/src/physical_optimizer/enforce_distribution.rs index dad81b4028bd..eb221a28e2cf 100644 --- a/datafusion/core/src/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/src/physical_optimizer/enforce_distribution.rs @@ -55,8 +55,8 @@ use datafusion_physical_expr::{ }; use datafusion_physical_plan::sorts::sort::SortExec; use datafusion_physical_plan::windows::{get_best_fitting_window, BoundedWindowAggExec}; - use datafusion_physical_plan::ExecutionPlanProperties; + use itertools::izip; /// The `EnforceDistribution` rule ensures that distribution requirements are @@ -1404,7 +1404,7 @@ pub(crate) mod tests { // model that it requires the output ordering of its input fn required_input_ordering(&self) -> Vec<Option<Vec<PhysicalSortRequirement>>> { vec![self - .cache + .properties() .output_ordering() .map(PhysicalSortRequirement::from_sort_exprs)] } diff --git a/datafusion/core/src/physical_optimizer/enforce_sorting.rs b/datafusion/core/src/physical_optimizer/enforce_sorting.rs index 25280261c0a0..ee5ff7ec59fd 100644 --- a/datafusion/core/src/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/src/physical_optimizer/enforce_sorting.rs @@ -64,8 +64,8 @@ use datafusion_common::{plan_err, DataFusionError}; use datafusion_physical_expr::{PhysicalSortExpr, PhysicalSortRequirement}; use datafusion_physical_plan::repartition::RepartitionExec; use datafusion_physical_plan::sorts::partial_sort::PartialSortExec; - use datafusion_physical_plan::ExecutionPlanProperties; + use itertools::izip; /// This rule inspects [`SortExec`]'s in the given physical plan and removes the @@ -391,7 +391,7 @@ fn analyze_immediate_sort_removal( // If this sort is unnecessary, we should remove it: if sort_input .equivalence_properties() - .ordering_satisfy(node.plan.output_ordering().unwrap_or(&[])) + .ordering_satisfy(sort_exec.properties().output_ordering().unwrap_or(&[])) { node.plan = if !sort_exec.preserve_partitioning() && sort_input.output_partitioning().partition_count() > 1 diff --git a/datafusion/core/src/physical_optimizer/output_requirements.rs b/datafusion/core/src/physical_optimizer/output_requirements.rs index da0697eb9aba..7fea375725a5 100644 --- a/datafusion/core/src/physical_optimizer/output_requirements.rs +++ b/datafusion/core/src/physical_optimizer/output_requirements.rs @@ -242,7 +242,7 @@ fn require_top_ordering_helper( if children.len() != 1 { Ok((plan, false)) } else if let Some(sort_exec) = plan.as_any().downcast_ref::<SortExec>() { - let req_ordering = plan.output_ordering().unwrap_or(&[]); + let req_ordering = sort_exec.properties().output_ordering().unwrap_or(&[]); let req_dist = sort_exec.required_input_distribution()[0].clone(); let reqs = PhysicalSortRequirement::from_sort_exprs(req_ordering); Ok(( diff --git a/datafusion/core/src/physical_optimizer/projection_pushdown.rs b/datafusion/core/src/physical_optimizer/projection_pushdown.rs index 1b9bf17bffa4..9cb2d6ecbc71 100644 --- a/datafusion/core/src/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/projection_pushdown.rs @@ -52,8 +52,8 @@ use datafusion_physical_expr::{ }; use datafusion_physical_plan::streaming::StreamingTableExec; use datafusion_physical_plan::union::UnionExec; - use datafusion_physical_plan::ExecutionPlanProperties; + use itertools::Itertools; /// This rule inspects [`ProjectionExec`]'s in the given physical plan and tries to diff --git a/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs b/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs index be4b7d13bc7e..c0abde26c300 100644 --- a/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs +++ 
b/datafusion/core/src/physical_optimizer/replace_with_order_preserving_variants.rs @@ -31,8 +31,8 @@ use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::Transformed; use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec; use datafusion_physical_plan::tree_node::PlanContext; - use datafusion_physical_plan::ExecutionPlanProperties; + use itertools::izip; /// For a given `plan`, this object carries the information one needs from its diff --git a/datafusion/core/src/physical_optimizer/topk_aggregation.rs b/datafusion/core/src/physical_optimizer/topk_aggregation.rs index 7459deb1f72a..2006402ac59e 100644 --- a/datafusion/core/src/physical_optimizer/topk_aggregation.rs +++ b/datafusion/core/src/physical_optimizer/topk_aggregation.rs @@ -30,7 +30,6 @@ use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::Result; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::PhysicalSortExpr; -use datafusion_physical_plan::ExecutionPlanProperties; use itertools::Itertools; use std::sync::Arc; @@ -87,7 +86,7 @@ impl TopKAggregation { let children = sort.children(); let child = children.iter().exactly_one().ok()?; - let order = plan.output_ordering()?; + let order = sort.properties().output_ordering()?; let order = order.iter().exactly_one().ok()?; let limit = sort.fetch()?; diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index b16cc77a43f2..41a6e4d75be7 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -93,10 +93,10 @@ use datafusion_expr::{ }; use datafusion_physical_expr::expressions::Literal; use datafusion_physical_plan::placeholder_row::PlaceholderRowExec; +use datafusion_physical_plan::ExecutionPlanProperties; use datafusion_sql::utils::window_expr_common_partition_keys; use async_trait::async_trait; -use datafusion_physical_plan::ExecutionPlanProperties; use futures::future::BoxFuture; use futures::{FutureExt, StreamExt, TryStreamExt}; use itertools::{multiunzip, Itertools}; diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index f96eacf45896..46351a0d430f 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -496,7 +496,7 @@ impl AggregateExec { return false; } // ensure there is no output ordering; can this rule be relaxed? 
- if self.cache.output_ordering().is_some() { + if self.properties().output_ordering().is_some() { return false; } // ensure no ordering is required on the input diff --git a/datafusion/physical-plan/src/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs index 8036012b7262..45d408bb4de7 100644 --- a/datafusion/physical-plan/src/aggregates/row_hash.rs +++ b/datafusion/physical-plan/src/aggregates/row_hash.rs @@ -32,7 +32,7 @@ use crate::metrics::{BaselineMetrics, RecordOutput}; use crate::sorts::sort::{read_spill_as_stream, sort_batch}; use crate::sorts::streaming_merge; use crate::stream::RecordBatchStreamAdapter; -use crate::{aggregates, PhysicalExpr}; +use crate::{aggregates, ExecutionPlan, PhysicalExpr}; use crate::{RecordBatchStream, SendableRecordBatchStream}; use arrow::array::*; @@ -341,7 +341,7 @@ impl GroupedHashAggregateStream { .with_can_spill(true) .register(context.memory_pool()); let (ordering, _) = agg - .cache + .properties() .equivalence_properties() .find_longest_permutation(&agg_group_by.output_exprs()); let group_ordering = GroupOrdering::try_new( diff --git a/datafusion/physical-plan/src/coalesce_partitions.rs b/datafusion/physical-plan/src/coalesce_partitions.rs index 3c5b7e9c13fb..5e7c459a7a98 100644 --- a/datafusion/physical-plan/src/coalesce_partitions.rs +++ b/datafusion/physical-plan/src/coalesce_partitions.rs @@ -192,7 +192,10 @@ mod tests { let merge = CoalescePartitionsExec::new(csv); // output of CoalescePartitionsExec should have a single partition - assert_eq!(merge.output_partitioning().partition_count(), 1); + assert_eq!( + merge.properties().output_partitioning().partition_count(), + 1 + ); // the result should contain 4 batches (one per input partition) let iter = merge.execute(0, task_ctx)?; diff --git a/datafusion/physical-plan/src/display.rs b/datafusion/physical-plan/src/display.rs index 38c23331983e..4b7b35e53e1b 100644 --- a/datafusion/physical-plan/src/display.rs +++ b/datafusion/physical-plan/src/display.rs @@ -497,10 +497,6 @@ mod tests { unimplemented!() } - fn output_partitioning(&self) -> &datafusion_physical_expr::Partitioning { - &datafusion_physical_expr::Partitioning::UnknownPartitioning(1) - } - fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> { vec![] } diff --git a/datafusion/physical-plan/src/joins/test_utils.rs b/datafusion/physical-plan/src/joins/test_utils.rs index 37faae873745..920e7b9d7a7c 100644 --- a/datafusion/physical-plan/src/joins/test_utils.rs +++ b/datafusion/physical-plan/src/joins/test_utils.rs @@ -26,7 +26,7 @@ use crate::joins::{ }; use crate::memory::MemoryExec; use crate::repartition::RepartitionExec; -use crate::{common, ExecutionPlan, Partitioning}; +use crate::{common, ExecutionPlan, ExecutionPlanProperties, Partitioning}; use arrow::util::pretty::pretty_format_batches; use arrow_array::{ diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index e37f84fb0150..c41d94bf969e 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -101,7 +101,7 @@ pub use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream}; /// `ExecutionPlan`'s output from its input. See [`Partitioning`] for more /// details on partitioning. /// -/// Methods such as [`schema`] and [`output_partitioning`] communicate +/// Methods such as [`Self::schema`] and [`ExecutionPlanProperties::output_partitioning`] communicate /// properties of this output to the DataFusion optimizer, and methods such as /// [`required_input_distribution`] and [`required_input_ordering`] express /// requirements of the `ExecutionPlan` from its input. @@ -111,8 +111,6 @@ pub use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream}; /// quite verbose) `Debug` output. /// /// [`execute`]: ExecutionPlan::execute -/// [`schema`]: ExecutionPlan::schema -/// [`output_partitioning`]: ExecutionPlan::output_partitioning /// [`required_input_distribution`]: ExecutionPlan::required_input_distribution /// [`required_input_ordering`]: ExecutionPlan::required_input_ordering pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync { @@ -455,7 +453,7 @@ impl ExecutionPlanProperties for Arc<dyn ExecutionPlan> { /// However, since `ProjectionExec` may calculate derived expressions, it /// needs special handling. /// - /// See also [`Self::maintains_input_order`] and [`Self::output_ordering`] + /// See also [`ExecutionPlan::maintains_input_order`] and [`Self::output_ordering`] /// for related concepts. fn equivalence_properties(&self) -> &EquivalenceProperties { self.properties().equivalence_properties() @@ -605,8 +603,11 @@ impl PlanProperties { /// 2. CoalescePartitionsExec for collapsing all of the partitions into one without ordering guarantee /// 3. SortPreservingMergeExec for collapsing all of the sorted partitions into one with ordering guarantee pub fn need_data_exchange(plan: Arc<dyn ExecutionPlan>) -> bool { - if let Some(_) = plan.as_any().downcast_ref::<RepartitionExec>() { - !matches!(plan.output_partitioning(), Partitioning::RoundRobinBatch(_)) + if let Some(repartition) = plan.as_any().downcast_ref::<RepartitionExec>() { + !matches!( + repartition.properties().output_partitioning(), + Partitioning::RoundRobinBatch(_) + ) } else if let Some(coalesce) = plan.as_any().downcast_ref::<CoalescePartitionsExec>() { coalesce.input().output_partitioning().partition_count() > 1 @@ -677,10 +678,9 @@ pub fn execute_stream( 1 => plan.execute(0, context), _ => { // merge into a single partition - let plan = Arc::new(CoalescePartitionsExec::new(plan.clone())) - as Arc<dyn ExecutionPlan>; + let plan = CoalescePartitionsExec::new(plan.clone()); // CoalescePartitionsExec must produce a single partition - assert_eq!(1, plan.output_partitioning().partition_count()); + assert_eq!(1, plan.properties().output_partitioning().partition_count()); plan.execute(0, context) } } diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index 3520d45f9e6c..2d54db3c3cbd 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -410,7 +410,8 @@ impl ExecutionPlan for LocalLimitExec { _ => Statistics { // the result output row number will always be no greater than the limit number num_rows: Precision::Inexact( - self.fetch * self.cache.output_partitioning().partition_count(), + self.fetch + * self.properties().output_partitioning().partition_count(), ), column_statistics: col_stats,
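The recurring change in this patch is visible in the hunks above and below: concrete plan types now reach partitioning and ordering through `properties()`, while `Arc<dyn ExecutionPlan>` keeps the shorthand accessors via the `ExecutionPlanProperties` trait. A hedged sketch of the two call styles, with names taken from the diffs and the input plan left abstract:

use std::sync::Arc;
use datafusion_physical_plan::coalesce_partitions::CoalescePartitionsExec;
use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties};

fn partition_counts(input: Arc<dyn ExecutionPlan>) -> (usize, usize) {
    let merge = CoalescePartitionsExec::new(input);
    // Concrete type: reach PlanProperties explicitly via properties().
    let on_concrete = merge.properties().output_partitioning().partition_count();
    // Trait object: ExecutionPlanProperties restores the shorthand accessor.
    let as_dyn: Arc<dyn ExecutionPlan> = Arc::new(merge);
    let on_dyn = as_dyn.output_partitioning().partition_count();
    (on_concrete, on_dyn)
}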
diff --git a/datafusion/physical-plan/src/memory.rs b/datafusion/physical-plan/src/memory.rs index 23699295e121..04b7c78b77f4 100644 --- a/datafusion/physical-plan/src/memory.rs +++ b/datafusion/physical-plan/src/memory.rs @@ -287,8 +287,8 @@ mod tests { use std::sync::Arc; use crate::memory::MemoryExec; - use crate::ExecutionPlan; + use crate::ExecutionPlan; use arrow_schema::{DataType, Field, Schema, SortOptions}; use datafusion_physical_expr::expressions::col; use datafusion_physical_expr::PhysicalSortExpr; @@ -322,8 +322,11 @@ mod tests { let mem_exec = MemoryExec::try_new(&[vec![]], schema, None)? .with_sort_information(sort_information); - assert_eq!(mem_exec.output_ordering().unwrap(), expected_output_order); - let eq_properties = mem_exec.equivalence_properties(); + assert_eq!( + mem_exec.properties().output_ordering().unwrap(), + expected_output_order + ); + let eq_properties = mem_exec.properties().equivalence_properties(); assert!(eq_properties.oeq_class().contains(&sort1)); assert!(eq_properties.oeq_class().contains(&sort2)); Ok(()) diff --git a/datafusion/physical-plan/src/stream.rs b/datafusion/physical-plan/src/stream.rs index b780a50cdc90..597b60537f6e 100644 --- a/datafusion/physical-plan/src/stream.rs +++ b/datafusion/physical-plan/src/stream.rs @@ -555,7 +555,7 @@ mod test { let task_ctx = Arc::new(TaskContext::default()); let input = Arc::new(input); - let num_partitions = input.output_partitioning().partition_count(); + let num_partitions = input.properties().output_partitioning().partition_count(); // Configure a RecordBatchReceiverStream to consume all the input partitions let mut builder = diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index 9f637361ff8f..2889a506f3ff 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -205,7 +205,7 @@ impl ExecutionPlan for UnionExec { // which is the "meet" of all input orderings. In this example, this // function will return vec![false, true, true], indicating that we // preserve the orderings for the 2nd and the 3rd children. - if let Some(output_ordering) = self.cache.output_ordering() { + if let Some(output_ordering) = self.properties().output_ordering() { self.inputs() .iter() .map(|child| { @@ -635,7 +635,13 @@ mod tests { let union_exec = Arc::new(UnionExec::new(vec![csv, csv2])); // Should have 9 partitions and 9 output batches - assert_eq!(union_exec.output_partitioning().partition_count(), 9); + assert_eq!( + union_exec + .properties() + .output_partitioning() + .partition_count(), + 9 + ); let result: Vec<RecordBatch> = collect(union_exec, task_ctx).await?; assert_eq!(result.len(), 9); @@ -806,7 +812,7 @@ mod tests { ); let union = UnionExec::new(vec![child1, child2]); - let union_eq_properties = union.equivalence_properties(); + let union_eq_properties = union.properties().equivalence_properties(); let union_actual_orderings = union_eq_properties.oeq_class(); let err_msg = format!( "Error in test id: {:?}, test case: {:?}", From a8fac85429f519ea5ae258a2a6425eaa9ab333c8 Mon Sep 17 00:00:00 2001 From: Mustafa Akur Date: Wed, 28 Feb 2024 09:35:09 +0300 Subject: [PATCH 40/45] Bring docs yaml --- .github/workflows/docs.yaml | 64 +++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 .github/workflows/docs.yaml diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml new file mode 100644 index 000000000000..ab6a615ab60b --- /dev/null +++ b/.github/workflows/docs.yaml @@ -0,0 +1,64 @@ +on: + push: + branches: + - main + paths: + - .asf.yaml + - .github/workflows/docs.yaml + - docs/** + +name: Deploy DataFusion site + +jobs: + build-docs: + name: Build docs + runs-on: ubuntu-latest + steps: + - name: Checkout docs sources + uses: actions/checkout@v4 + + - name: Checkout asf-site branch + uses: actions/checkout@v4 + with: + ref: asf-site + path: asf-site + + - name: Setup Python + uses: actions/setup-python@v5 + with: +
python-version: "3.10" + + - name: Install dependencies + run: | + set -x + python3 -m venv venv + source venv/bin/activate + pip install -r docs/requirements.txt + + - name: Build docs + run: | + set -x + source venv/bin/activate + cd docs + ./build.sh + + - name: Copy & push the generated HTML + run: | + set -x + cd asf-site/ + rsync \ + -a \ + --delete \ + --exclude '/.git/' \ + ../docs/build/html/ \ + ./ + cp ../.asf.yaml . + touch .nojekyll + git status --porcelain + if [ "$(git status --porcelain)" != "" ]; then + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add --all + git commit -m 'Publish built docs triggered by ${{ github.sha }}' + git push || git push --force + fi From e62240969135e2236d100c8c0c01546a87950a80 Mon Sep 17 00:00:00 2001 From: Lordworms <48054792+Lordworms@users.noreply.github.com> Date: Wed, 28 Feb 2024 04:55:03 -0600 Subject: [PATCH 41/45] feat: issue_9285: port builtin reg function into datafusion-function-* (1/3 regexpmatch) (#9329) * feat: issue_9285: port builtin reg function into datafusion-function-* crate (1/3: RegexpMatch part) * fix fmt * refact * modify test * fix msrv verify problem * port test and delete useless lines --- .../tests/dataframe/dataframe_functions.rs | 2 +- datafusion/expr/src/built_in_function.rs | 22 --- datafusion/expr/src/expr_fn.rs | 7 - datafusion/functions/Cargo.toml | 3 +- datafusion/functions/src/lib.rs | 7 +- datafusion/functions/src/regex/mod.rs | 29 ++++ datafusion/functions/src/regex/regexpmatch.rs | 145 ++++++++++++++++++ datafusion/physical-expr/src/functions.rs | 108 ------------- datafusion/proto/proto/datafusion.proto | 2 +- datafusion/proto/src/generated/pbjson.rs | 3 - datafusion/proto/src/generated/prost.rs | 4 +- .../proto/src/logical_plan/from_proto.rs | 17 +- datafusion/proto/src/logical_plan/to_proto.rs | 1 - datafusion/sqllogictest/test_files/regexp.slt | 8 +- 14 files changed, 196 insertions(+), 162 deletions(-) create mode 100644 datafusion/functions/src/regex/mod.rs create mode 100644 datafusion/functions/src/regex/regexpmatch.rs diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs index 95c13fc17c90..ff553a48888b 100644 --- a/datafusion/core/tests/dataframe/dataframe_functions.rs +++ b/datafusion/core/tests/dataframe/dataframe_functions.rs @@ -467,7 +467,7 @@ async fn test_fn_regexp_like() -> Result<()> { #[tokio::test] #[cfg(feature = "unicode_expressions")] async fn test_fn_regexp_match() -> Result<()> { - let expr = regexp_match(vec![col("a"), lit("[a-z]")]); + let expr = regexp_match(col("a"), lit("[a-z]")); let expected = [ "+------------------------------------+", diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index e04106595876..8df2f4e88d41 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -233,7 +233,6 @@ pub enum BuiltinScalarFunction { /// regexp_like RegexpLike, /// regexp_match - RegexpMatch, /// regexp_replace RegexpReplace, /// repeat @@ -449,7 +448,6 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::OctetLength => Volatility::Immutable, BuiltinScalarFunction::Radians => Volatility::Immutable, BuiltinScalarFunction::RegexpLike => Volatility::Immutable, - BuiltinScalarFunction::RegexpMatch => Volatility::Immutable, BuiltinScalarFunction::RegexpReplace => Volatility::Immutable, BuiltinScalarFunction::Repeat => Volatility::Immutable, 
BuiltinScalarFunction::Replace => Volatility::Immutable, @@ -814,16 +812,6 @@ impl BuiltinScalarFunction { ); } }), - BuiltinScalarFunction::RegexpMatch => Ok(match &input_expr_types[0] { - LargeUtf8 => List(Arc::new(Field::new("item", LargeUtf8, true))), - Utf8 => List(Arc::new(Field::new("item", Utf8, true))), - Null => Null, - other => { - return plan_err!( - "The regexp_match function can only accept strings. Got {other}" - ); - } - }), BuiltinScalarFunction::Factorial | BuiltinScalarFunction::Gcd @@ -1263,15 +1251,6 @@ impl BuiltinScalarFunction { ], self.volatility(), ), - BuiltinScalarFunction::RegexpMatch => Signature::one_of( - vec![ - Exact(vec![Utf8, Utf8]), - Exact(vec![LargeUtf8, Utf8]), - Exact(vec![Utf8, Utf8, Utf8]), - Exact(vec![LargeUtf8, Utf8, Utf8]), - ], - self.volatility(), - ), BuiltinScalarFunction::RegexpReplace => Signature::one_of( vec![ Exact(vec![Utf8, Utf8, Utf8]), @@ -1514,7 +1493,6 @@ impl BuiltinScalarFunction { // regex functions BuiltinScalarFunction::RegexpLike => &["regexp_like"], - BuiltinScalarFunction::RegexpMatch => &["regexp_match"], BuiltinScalarFunction::RegexpReplace => &["regexp_replace"], // time/date functions diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 67bf39050d58..7ffd2f76e783 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -854,11 +854,6 @@ nary_scalar_expr!( regexp_like, "matches a regular expression against a string and returns true or false if there was at least one match or not" ); -nary_scalar_expr!( - RegexpMatch, - regexp_match, - "matches a regular expression against a string and returns matched substrings." -); nary_scalar_expr!( RegexpReplace, regexp_replace, @@ -1380,8 +1375,6 @@ mod test { test_scalar_expr!(OctetLength, octet_length, string); test_nary_scalar_expr!(RegexpLike, regexp_like, string, pattern); test_nary_scalar_expr!(RegexpLike, regexp_like, string, pattern, flags); - test_nary_scalar_expr!(RegexpMatch, regexp_match, string, pattern); - test_nary_scalar_expr!(RegexpMatch, regexp_match, string, pattern, flags); test_nary_scalar_expr!( RegexpReplace, regexp_replace, diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index f63f18f955de..89b7de9ee11a 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -32,11 +32,12 @@ rust-version = { workspace = true } # enable core functions core_expressions = [] # Enable encoding by default so the doctests work. In general don't automatically enable all packages. 
-default = ["core_expressions", "encoding_expressions", "math_expressions"] +default = ["core_expressions", "encoding_expressions", "math_expressions", "regex_expressions"] # enable encode/decode functions encoding_expressions = ["base64", "hex"] # enable math functions math_expressions = [] +regex_expressions = [] [lib] name = "datafusion_functions" diff --git a/datafusion/functions/src/lib.rs b/datafusion/functions/src/lib.rs index 981174c141d6..d2f0270959ee 100644 --- a/datafusion/functions/src/lib.rs +++ b/datafusion/functions/src/lib.rs @@ -93,7 +93,7 @@ make_package!( ); make_package!(math, "math_expressions", "Mathematical functions."); - +make_package!(regex, "regex_expressions", "Regex functions"); /// Fluent-style API for creating `Expr`s pub mod expr_fn { #[cfg(feature = "core_expressions")] @@ -102,6 +102,8 @@ pub mod expr_fn { pub use super::encoding::expr_fn::*; #[cfg(feature = "math_expressions")] pub use super::math::expr_fn::*; + #[cfg(feature = "regex_expressions")] + pub use super::regex::expr_fn::*; } /// Registers all enabled packages with a [`FunctionRegistry`] @@ -109,7 +111,8 @@ pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> { let mut all_functions = core::functions() .into_iter() .chain(encoding::functions()) - .chain(math::functions()); + .chain(math::functions()) + .chain(regex::functions()); all_functions.try_for_each(|udf| { let existing_udf = registry.register_udf(udf)?; diff --git a/datafusion/functions/src/regex/mod.rs b/datafusion/functions/src/regex/mod.rs new file mode 100644 index 000000000000..862e8b77a2d6 --- /dev/null +++ b/datafusion/functions/src/regex/mod.rs @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! "regx" DataFusion functions + +mod regexpmatch; +// create UDFs +make_udf_function!(regexpmatch::RegexpMatchFunc, REGEXP_MATCH, regexp_match); + +export_functions!(( + regexp_match, + input_arg1 + input_arg2, + "returns a list of regular expression matches in a string. " +)); diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs new file mode 100644 index 000000000000..7ab99f96b142 --- /dev/null +++ b/datafusion/functions/src/regex/regexpmatch.rs @@ -0,0 +1,145 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Encoding expressions +use arrow::array::{Array, ArrayRef, OffsetSizeTrait}; +use arrow::compute::kernels::regexp; +use arrow::datatypes::DataType; +use arrow::datatypes::Field; +use datafusion_common::ScalarValue; +use datafusion_expr::TypeSignature::*; +use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use std::any::Any; +use datafusion_common::exec_err; +use datafusion_common::{arrow_datafusion_err, plan_err}; +use datafusion_common::{ + cast::as_generic_string_array, internal_err, DataFusionError, Result, +}; +use datafusion_expr::ColumnarValue; +use std::sync::Arc; + +#[derive(Debug)] +pub(super) struct RegexpMatchFunc { + signature: Signature, +} +impl RegexpMatchFunc { + pub fn new() -> Self { + use DataType::*; + Self { + signature: Signature::one_of( + vec![ + Exact(vec![Utf8, Utf8]), + Exact(vec![LargeUtf8, Utf8]), + Exact(vec![Utf8, Utf8, Utf8]), + Exact(vec![LargeUtf8, Utf8, Utf8]), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for RegexpMatchFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "regexp_match" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + use DataType::*; + + Ok(match &arg_types[0] { + LargeUtf8 => List(Arc::new(Field::new("item", LargeUtf8, true))), + Utf8 => List(Arc::new(Field::new("item", Utf8, true))), + Null => Null, + other => { + return plan_err!( + "The regexp_match function can only accept strings. 
Got {other}" + ); + } + }) + } + fn invoke(&self, args: &[ColumnarValue]) -> Result { + let len = args + .iter() + .fold(Option::::None, |acc, arg| match arg { + ColumnarValue::Scalar(_) => acc, + ColumnarValue::Array(a) => Some(a.len()), + }); + + let is_scalar = len.is_none(); + let inferred_length = len.unwrap_or(1); + let args = args + .iter() + .map(|arg| arg.clone().into_array(inferred_length)) + .collect::>>()?; + + let result = regexp_match_func(&args); + if is_scalar { + // If all inputs are scalar, keeps output as scalar + let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0)); + result.map(ColumnarValue::Scalar) + } else { + result.map(ColumnarValue::Array) + } + } +} +fn regexp_match_func(args: &[ArrayRef]) -> Result { + match args[0].data_type() { + DataType::Utf8 => { + regexp_match::(args) + } + DataType::LargeUtf8 => { + regexp_match::(args) + } + other => { + internal_err!("Unsupported data type {other:?} for function regexp_match") + } + } +} +pub fn regexp_match(args: &[ArrayRef]) -> Result { + match args.len() { + 2 => { + let values = as_generic_string_array::(&args[0])?; + let regex = as_generic_string_array::(&args[1])?; + regexp::regexp_match(values, regex, None) + .map_err(|e| arrow_datafusion_err!(e)) + } + 3 => { + let values = as_generic_string_array::(&args[0])?; + let regex = as_generic_string_array::(&args[1])?; + let flags = as_generic_string_array::(&args[2])?; + + if flags.iter().any(|s| s == Some("g")) { + return plan_err!("regexp_match() does not support the \"global\" option") + } + + regexp::regexp_match(values, regex, Some(flags)) + .map_err(|e| arrow_datafusion_err!(e)) + } + other => exec_err!( + "regexp_match was called with {other} arguments. It requires at least 2 and at most 3." + ), + } +} diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index af079dbd2d12..186de0609b9a 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -243,7 +243,6 @@ where .collect::>>()?; let result = (inner)(&args); - if is_scalar { // If all inputs are scalar, keeps output as scalar let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0)); @@ -619,29 +618,6 @@ pub fn create_physical_fun( exec_err!("Unsupported data type {other:?} for function regexp_like") } }), - BuiltinScalarFunction::RegexpMatch => { - Arc::new(|args| match args[0].data_type() { - DataType::Utf8 => { - let func = invoke_on_array_if_regex_expressions_feature_flag!( - regexp_match, - i32, - "regexp_match" - ); - make_scalar_function_inner(func)(args) - } - DataType::LargeUtf8 => { - let func = invoke_on_array_if_regex_expressions_feature_flag!( - regexp_match, - i64, - "regexp_match" - ); - make_scalar_function_inner(func)(args) - } - other => { - exec_err!("Unsupported data type {other:?} for function regexp_match") - } - }) - } BuiltinScalarFunction::RegexpReplace => { Arc::new(|args| match args[0].data_type() { DataType::Utf8 => { @@ -3185,90 +3161,6 @@ mod tests { Ok(()) } - #[test] - #[cfg(feature = "regex_expressions")] - fn test_regexp_match() -> Result<()> { - use datafusion_common::cast::{as_list_array, as_string_array}; - let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]); - let execution_props = ExecutionProps::new(); - - let col_value: ArrayRef = Arc::new(StringArray::from(vec!["aaa-555"])); - let pattern = lit(r".*-(\d*)"); - let columns: Vec = vec![col_value]; - let expr = create_physical_expr_with_type_coercion( - 
-            &BuiltinScalarFunction::RegexpMatch,
-            &[col("a", &schema)?, pattern],
-            &schema,
-            &execution_props,
-        )?;
-
-        // type is correct
-        assert_eq!(
-            expr.data_type(&schema)?,
-            DataType::List(Arc::new(Field::new("item", DataType::Utf8, true)))
-        );
-
-        // evaluate works
-        let batch = RecordBatch::try_new(Arc::new(schema.clone()), columns)?;
-        let result = expr
-            .evaluate(&batch)?
-            .into_array(batch.num_rows())
-            .expect("Failed to convert to array");
-
-        // downcast works
-        let result = as_list_array(&result)?;
-        let first_row = result.value(0);
-        let first_row = as_string_array(&first_row)?;
-
-        // value is correct
-        let expected = "555".to_string();
-        assert_eq!(first_row.value(0), expected);
-
-        Ok(())
-    }
-
-    #[test]
-    #[cfg(feature = "regex_expressions")]
-    fn test_regexp_match_all_literals() -> Result<()> {
-        use datafusion_common::cast::{as_list_array, as_string_array};
-        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
-        let execution_props = ExecutionProps::new();
-
-        let col_value = lit("aaa-555");
-        let pattern = lit(r".*-(\d*)");
-        let columns: Vec<ArrayRef> = vec![Arc::new(Int32Array::from(vec![1]))];
-        let expr = create_physical_expr_with_type_coercion(
-            &BuiltinScalarFunction::RegexpMatch,
-            &[col_value, pattern],
-            &schema,
-            &execution_props,
-        )?;
-
-        // type is correct
-        assert_eq!(
-            expr.data_type(&schema)?,
-            DataType::List(Arc::new(Field::new("item", DataType::Utf8, true)))
-        );
-
-        // evaluate works
-        let batch = RecordBatch::try_new(Arc::new(schema.clone()), columns)?;
-        let result = expr
-            .evaluate(&batch)?
-            .into_array(batch.num_rows())
-            .expect("Failed to convert to array");
-
-        // downcast works
-        let result = as_list_array(&result)?;
-        let first_row = result.value(0);
-        let first_row = as_string_array(&first_row)?;
-
-        // value is correct
-        let expected = "555".to_string();
-        assert_eq!(first_row.value(0), expected);
-
-        Ok(())
-    }
-
     // Helper function just for testing.
     // Returns `expressions` coerced to types compatible with
     // `signature`, if possible.
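The two unit tests deleted above are superseded by the sqllogictest case added at the end of this patch. For context, a minimal sketch of an equivalent end-to-end check through the public SQL API; the tokio runtime and the crate's default features are assumptions here, not part of this patch:

```rust
// A minimal sketch, assuming the `datafusion` crate with default features
// and a tokio runtime. It mirrors the ported sqllogictest below.
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    let ctx = SessionContext::new();
    // `regexp_match` now dispatches to the RegexpMatchFunc UDF defined above
    let df = ctx.sql(r"SELECT regexp_match('aaa-555', '.*-(\d*)')").await?;
    // Expected output: a single row holding the list value [555]
    df.show().await?;
    Ok(())
}
```

Exercising the function through `SessionContext` covers the same scalar and array code paths that `invoke` normalizes above.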
diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 2d729ffc5b3e..1f659469aa3a 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -568,7 +568,7 @@ enum ScalarFunction { Tan = 18; Trunc = 19; Array = 20; - RegexpMatch = 21; + // RegexpMatch = 21; BitLength = 22; Btrim = 23; CharacterLength = 24; diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 5f05b8546f68..8959dd37cf13 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -22341,7 +22341,6 @@ impl serde::Serialize for ScalarFunction { Self::Tan => "Tan", Self::Trunc => "Trunc", Self::Array => "Array", - Self::RegexpMatch => "RegexpMatch", Self::BitLength => "BitLength", Self::Btrim => "Btrim", Self::CharacterLength => "CharacterLength", @@ -22483,7 +22482,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Tan", "Trunc", "Array", - "RegexpMatch", "BitLength", "Btrim", "CharacterLength", @@ -22654,7 +22652,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Tan" => Ok(ScalarFunction::Tan), "Trunc" => Ok(ScalarFunction::Trunc), "Array" => Ok(ScalarFunction::Array), - "RegexpMatch" => Ok(ScalarFunction::RegexpMatch), "BitLength" => Ok(ScalarFunction::BitLength), "Btrim" => Ok(ScalarFunction::Btrim), "CharacterLength" => Ok(ScalarFunction::CharacterLength), diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 252089d5c14d..09152d99c12f 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2656,7 +2656,7 @@ pub enum ScalarFunction { Tan = 18, Trunc = 19, Array = 20, - RegexpMatch = 21, + /// RegexpMatch = 21; BitLength = 22, Btrim = 23, CharacterLength = 24, @@ -2798,7 +2798,6 @@ impl ScalarFunction { ScalarFunction::Tan => "Tan", ScalarFunction::Trunc => "Trunc", ScalarFunction::Array => "Array", - ScalarFunction::RegexpMatch => "RegexpMatch", ScalarFunction::BitLength => "BitLength", ScalarFunction::Btrim => "Btrim", ScalarFunction::CharacterLength => "CharacterLength", @@ -2934,7 +2933,6 @@ impl ScalarFunction { "Tan" => Some(Self::Tan), "Trunc" => Some(Self::Trunc), "Array" => Some(Self::Array), - "RegexpMatch" => Some(Self::RegexpMatch), "BitLength" => Some(Self::BitLength), "Btrim" => Some(Self::Btrim), "CharacterLength" => Some(Self::CharacterLength), diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index acfa043b88af..e8059482b1b9 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -61,11 +61,11 @@ use datafusion_expr::{ instr, iszero, lcm, left, levenshtein, ln, log, log10, log2, logical_plan::{PlanType, StringifiedPlan}, lower, lpad, ltrim, md5, nanvl, now, octet_length, overlay, pi, power, radians, - random, regexp_like, regexp_match, regexp_replace, repeat, replace, reverse, right, - round, rpad, rtrim, sha224, sha256, sha384, sha512, signum, sin, sinh, split_part, - sqrt, starts_with, string_to_array, strpos, struct_fun, substr, substr_index, - substring, tan, tanh, to_hex, translate, trim, trunc, upper, uuid, AggregateFunction, - Between, BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr, + random, regexp_like, regexp_replace, repeat, replace, reverse, right, round, rpad, + rtrim, sha224, sha256, sha384, sha512, signum, sin, sinh, split_part, sqrt, + starts_with, 
string_to_array, strpos, struct_fun, substr, substr_index, substring,
+    tan, tanh, to_hex, translate, trim, trunc, upper, uuid, AggregateFunction, Between,
+    BinaryExpr, BuiltInWindowFunction, BuiltinScalarFunction, Case, Cast, Expr,
     GetFieldAccess, GetIndexedField, GroupingSet,
     GroupingSet::GroupingSets,
     JoinConstraint, JoinType, Like, Operator, TryCast, WindowFrame, WindowFrameBound,
@@ -535,7 +535,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction {
             ScalarFunction::Lpad => Self::Lpad,
             ScalarFunction::Random => Self::Random,
             ScalarFunction::RegexpLike => Self::RegexpLike,
-            ScalarFunction::RegexpMatch => Self::RegexpMatch,
             ScalarFunction::RegexpReplace => Self::RegexpReplace,
             ScalarFunction::Repeat => Self::Repeat,
             ScalarFunction::Replace => Self::Replace,
@@ -1638,12 +1637,6 @@ pub fn parse_expr(
                 .map(|expr| parse_expr(expr, registry))
                 .collect::<Result<Vec<_>, _>>()?,
         )),
-        ScalarFunction::RegexpMatch => Ok(regexp_match(
-            args.to_owned()
-                .iter()
-                .map(|expr| parse_expr(expr, registry))
-                .collect::<Result<Vec<_>, _>>()?,
-        )),
         ScalarFunction::RegexpReplace => Ok(regexp_replace(
             args.to_owned()
                 .iter()
diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs
index d19830db98ce..6f126729cb29 100644
--- a/datafusion/proto/src/logical_plan/to_proto.rs
+++ b/datafusion/proto/src/logical_plan/to_proto.rs
@@ -1518,7 +1518,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction {
             BuiltinScalarFunction::Random => Self::Random,
             BuiltinScalarFunction::Uuid => Self::Uuid,
             BuiltinScalarFunction::RegexpLike => Self::RegexpLike,
-            BuiltinScalarFunction::RegexpMatch => Self::RegexpMatch,
             BuiltinScalarFunction::RegexpReplace => Self::RegexpReplace,
             BuiltinScalarFunction::Repeat => Self::Repeat,
             BuiltinScalarFunction::Replace => Self::Replace,
diff --git a/datafusion/sqllogictest/test_files/regexp.slt b/datafusion/sqllogictest/test_files/regexp.slt
index 1e951e2962ff..a80b08c41ee3 100644
--- a/datafusion/sqllogictest/test_files/regexp.slt
+++ b/datafusion/sqllogictest/test_files/regexp.slt
@@ -220,6 +220,12 @@ SELECT regexp_match('(?<=[A-Z]\w )Smith', 'John Smith', 'i');
 ----
 NULL
 
+# ported test
+query ?
+SELECT regexp_match('aaa-555', '.*-(\d*)'); +---- +[555] + # # regexp_replace tests # @@ -300,4 +306,4 @@ SELECT regexp_replace(arrow_cast('foobar', 'Dictionary(Int32, Utf8)'), 'bar', 'x fooxx statement ok -drop table t; \ No newline at end of file +drop table t; From d896ebe4d7466e52a1e8fad4252067d69e62298a Mon Sep 17 00:00:00 2001 From: Jonah Gao Date: Wed, 28 Feb 2024 20:16:44 +0800 Subject: [PATCH 42/45] Add test to verify issue #9161 (#9265) * Add test to verify issue #9161 * fmt --- datafusion/sqllogictest/test_files/aggregate.slt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 109c64f06055..b78c6287746c 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -3313,5 +3313,13 @@ query I SELECT 0 AS "t.a" FROM t HAVING MAX(t.a) = 0; ---- +# Test issue: https://github.com/apache/arrow-datafusion/issues/9161 +query I rowsort +SELECT CAST(a AS INT) FROM t GROUP BY t.a; +---- +1 +2 +3 + statement ok DROP TABLE t; From a1ae15826245097e7c12d4f0ed3425b25af6c431 Mon Sep 17 00:00:00 2001 From: Marco Neumann Date: Wed, 28 Feb 2024 13:25:36 +0100 Subject: [PATCH 43/45] refactor: fix error macros hygiene (#9366) --- benchmarks/src/tpch/convert.rs | 2 +- benchmarks/src/tpch/mod.rs | 2 +- datafusion-cli/src/catalog.rs | 2 +- datafusion-cli/src/functions.rs | 2 +- datafusion-examples/examples/advanced_udf.rs | 1 - datafusion-examples/examples/rewrite_expr.rs | 2 +- datafusion-examples/examples/simple_udtf.rs | 2 +- datafusion/common/src/error.rs | 16 +++++++- datafusion/common/src/hash_utils.rs | 2 +- .../common/src/scalar/struct_builder.rs | 2 +- datafusion/core/src/catalog/mod.rs | 2 +- datafusion/core/src/catalog/schema.rs | 4 +- .../src/datasource/default_table_source.rs | 2 +- .../core/src/datasource/file_format/json.rs | 2 +- .../core/src/datasource/file_format/mod.rs | 2 +- datafusion/core/src/datasource/memory.rs | 5 +-- datafusion/core/src/datasource/mod.rs | 2 +- .../physical_plan/file_scan_config.rs | 5 +-- .../datasource/physical_plan/file_stream.rs | 2 +- .../core/src/datasource/physical_plan/mod.rs | 2 +- datafusion/core/src/datasource/provider.rs | 2 +- datafusion/core/src/datasource/streaming.rs | 2 +- .../src/physical_optimizer/enforce_sorting.rs | 2 +- .../src/physical_optimizer/join_selection.rs | 2 +- .../physical_optimizer/pipeline_checker.rs | 2 +- .../src/physical_optimizer/sort_pushdown.rs | 2 +- datafusion/core/tests/core_integration.rs | 3 ++ .../provider_filter_pushdown.rs | 2 +- datafusion/core/tests/macro_hygiene/mod.rs | 39 +++++++++++++++++++ .../user_defined/user_defined_aggregates.rs | 4 +- .../user_defined_scalar_functions.rs | 2 +- datafusion/execution/src/registry.rs | 2 +- datafusion/expr/src/accumulator.rs | 2 +- datafusion/expr/src/columnar_value.rs | 2 +- .../expr/src/conditional_expressions.rs | 2 +- datafusion/expr/src/expr.rs | 2 +- datafusion/expr/src/expr_schema.rs | 3 +- datafusion/expr/src/field_util.rs | 4 +- datafusion/expr/src/interval_arithmetic.rs | 2 +- datafusion/expr/src/partition_evaluator.rs | 2 +- datafusion/expr/src/tree_node/expr.rs | 2 +- .../expr/src/type_coercion/aggregates.rs | 2 +- datafusion/expr/src/type_coercion/binary.rs | 4 +- .../expr/src/type_coercion/functions.rs | 4 +- datafusion/expr/src/udaf.rs | 2 +- datafusion/expr/src/utils.rs | 28 ++++++------- datafusion/functions-array/src/udf.rs | 2 +- datafusion/functions/src/core/nullif.rs | 2 +- 
datafusion/functions/src/core/nvl.rs | 2 +- datafusion/optimizer/src/analyzer/subquery.rs | 2 +- datafusion/optimizer/src/decorrelate.rs | 2 +- .../src/decorrelate_predicate_subquery.rs | 2 +- .../optimizer/src/eliminate_cross_join.rs | 2 +- datafusion/optimizer/src/optimizer.rs | 4 +- .../optimizer/src/propagate_empty_relation.rs | 2 +- datafusion/optimizer/src/push_down_filter.rs | 4 +- .../optimizer/src/scalar_subquery_to_join.rs | 2 +- .../src/simplify_expressions/utils.rs | 2 +- .../src/unwrap_cast_in_comparison.rs | 4 +- .../optimizer/tests/optimizer_integration.rs | 2 +- .../src/aggregate/array_agg_distinct.rs | 2 +- .../src/aggregate/array_agg_ordered.rs | 2 +- .../physical-expr/src/aggregate/average.rs | 2 +- .../physical-expr/src/aggregate/build_in.rs | 4 +- .../physical-expr/src/aggregate/grouping.rs | 2 +- datafusion/physical-expr/src/aggregate/mod.rs | 2 +- .../physical-expr/src/aggregate/nth_value.rs | 2 +- .../physical-expr/src/aggregate/stddev.rs | 2 +- .../physical-expr/src/aggregate/string_agg.rs | 2 +- datafusion/physical-expr/src/aggregate/sum.rs | 2 +- .../src/aggregate/sum_distinct.rs | 2 +- datafusion/physical-expr/src/analysis.rs | 4 +- .../src/conditional_expressions.rs | 2 +- .../physical-expr/src/equivalence/mod.rs | 2 +- .../physical-expr/src/expressions/binary.rs | 2 +- .../src/expressions/binary/kernels.rs | 2 +- .../physical-expr/src/expressions/cast.rs | 2 +- .../physical-expr/src/expressions/column.rs | 2 +- .../src/expressions/get_indexed_field.rs | 2 +- .../physical-expr/src/expressions/in_list.rs | 4 +- .../physical-expr/src/expressions/like.rs | 2 +- .../physical-expr/src/expressions/mod.rs | 4 +- .../physical-expr/src/expressions/negative.rs | 3 +- .../physical-expr/src/expressions/no_op.rs | 2 +- .../physical-expr/src/expressions/try_cast.rs | 2 +- datafusion/physical-expr/src/functions.rs | 4 +- .../physical-expr/src/intervals/cp_solver.rs | 2 +- .../physical-expr/src/intervals/utils.rs | 4 +- datafusion/physical-expr/src/physical_expr.rs | 2 +- datafusion/physical-expr/src/planner.rs | 3 +- .../physical-expr/src/string_expressions.rs | 2 +- .../physical-expr/src/struct_expressions.rs | 2 +- .../physical-expr/src/unicode_expressions.rs | 2 +- .../physical-expr/src/window/nth_value.rs | 2 +- datafusion/physical-expr/src/window/rank.rs | 2 +- .../physical-plan/src/aggregates/mod.rs | 2 +- .../physical-plan/src/coalesce_partitions.rs | 2 +- datafusion/physical-plan/src/empty.rs | 2 +- datafusion/physical-plan/src/explain.rs | 2 +- datafusion/physical-plan/src/insert.rs | 2 +- .../physical-plan/src/joins/cross_join.rs | 2 +- .../src/joins/nested_loop_join.rs | 2 +- .../src/joins/symmetric_hash_join.rs | 4 +- datafusion/physical-plan/src/lib.rs | 2 +- datafusion/physical-plan/src/limit.rs | 2 +- datafusion/physical-plan/src/memory.rs | 2 +- .../physical-plan/src/placeholder_row.rs | 2 +- .../src/sorts/sort_preserving_merge.rs | 2 +- .../src/sorts/streaming_merge.rs | 2 +- datafusion/physical-plan/src/stream.rs | 2 +- datafusion/physical-plan/src/streaming.rs | 2 +- datafusion/physical-plan/src/udaf.rs | 2 +- datafusion/physical-plan/src/union.rs | 2 +- datafusion/physical-plan/src/unnest.rs | 2 +- datafusion/physical-plan/src/values.rs | 2 +- .../src/windows/window_agg_exec.rs | 2 +- datafusion/physical-plan/src/work_table.rs | 2 +- datafusion/proto/src/bytes/mod.rs | 2 +- datafusion/proto/src/bytes/registry.rs | 2 +- datafusion/sql/examples/sql.rs | 2 +- datafusion/sql/src/expr/binary_op.rs | 2 +- datafusion/sql/src/expr/function.rs | 3 +- 
datafusion/sql/src/expr/grouping_set.rs | 2 +- datafusion/sql/src/expr/json_access.rs | 2 +- datafusion/sql/src/expr/mod.rs | 3 +- datafusion/sql/src/expr/order_by.rs | 4 +- datafusion/sql/src/expr/substring.rs | 2 +- datafusion/sql/src/expr/unary_op.rs | 2 +- datafusion/sql/src/relation/join.rs | 2 +- datafusion/sql/src/relation/mod.rs | 4 +- datafusion/sql/src/set_expr.rs | 2 +- .../tests/cases/roundtrip_logical_plan.rs | 2 +- 132 files changed, 205 insertions(+), 181 deletions(-) create mode 100644 datafusion/core/tests/macro_hygiene/mod.rs diff --git a/benchmarks/src/tpch/convert.rs b/benchmarks/src/tpch/convert.rs index 2fc74ce38888..12b562421e53 100644 --- a/benchmarks/src/tpch/convert.rs +++ b/benchmarks/src/tpch/convert.rs @@ -20,7 +20,7 @@ use std::path::{Path, PathBuf}; use std::time::Instant; use datafusion::common::not_impl_err; -use datafusion::error::DataFusionError; + use datafusion::error::Result; use datafusion::prelude::*; use parquet::basic::Compression; diff --git a/benchmarks/src/tpch/mod.rs b/benchmarks/src/tpch/mod.rs index 8965ebea7ff6..23d0681f560c 100644 --- a/benchmarks/src/tpch/mod.rs +++ b/benchmarks/src/tpch/mod.rs @@ -21,7 +21,7 @@ use arrow::datatypes::SchemaBuilder; use datafusion::{ arrow::datatypes::{DataType, Field, Schema}, common::plan_err, - error::{DataFusionError, Result}, + error::Result, }; use std::fs; mod run; diff --git a/datafusion-cli/src/catalog.rs b/datafusion-cli/src/catalog.rs index 67184b8257b8..29211edbb0a4 100644 --- a/datafusion-cli/src/catalog.rs +++ b/datafusion-cli/src/catalog.rs @@ -19,7 +19,7 @@ use crate::object_storage::get_object_store; use async_trait::async_trait; use datafusion::catalog::schema::SchemaProvider; use datafusion::catalog::{CatalogProvider, CatalogProviderList}; -use datafusion::common::{plan_datafusion_err, DataFusionError}; +use datafusion::common::plan_datafusion_err; use datafusion::datasource::listing::{ ListingTable, ListingTableConfig, ListingTableUrl, }; diff --git a/datafusion-cli/src/functions.rs b/datafusion-cli/src/functions.rs index 5390fa9f2271..806e2bb39cd4 100644 --- a/datafusion-cli/src/functions.rs +++ b/datafusion-cli/src/functions.rs @@ -21,7 +21,7 @@ use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use arrow::util::pretty::pretty_format_batches; use async_trait::async_trait; -use datafusion::common::DataFusionError; + use datafusion::common::{plan_err, Column}; use datafusion::datasource::function::TableFunctionImpl; use datafusion::datasource::TableProvider; diff --git a/datafusion-examples/examples/advanced_udf.rs b/datafusion-examples/examples/advanced_udf.rs index 3e7dd2e2af08..c8063c0eb1e3 100644 --- a/datafusion-examples/examples/advanced_udf.rs +++ b/datafusion-examples/examples/advanced_udf.rs @@ -176,7 +176,6 @@ impl ScalarUDFImpl for PowUdf { } // if the types were not float, it is a bug in DataFusion _ => { - use datafusion_common::DataFusionError; internal_err!("Invalid argument types to pow function") } } diff --git a/datafusion-examples/examples/rewrite_expr.rs b/datafusion-examples/examples/rewrite_expr.rs index 5e95562033e6..8d13d1201881 100644 --- a/datafusion-examples/examples/rewrite_expr.rs +++ b/datafusion-examples/examples/rewrite_expr.rs @@ -18,7 +18,7 @@ use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, TreeNode}; -use datafusion_common::{plan_err, DataFusionError, Result, ScalarValue}; +use 
datafusion_common::{plan_err, Result, ScalarValue}; use datafusion_expr::{ AggregateUDF, Between, Expr, Filter, LogicalPlan, ScalarUDF, TableSource, WindowUDF, }; diff --git a/datafusion-examples/examples/simple_udtf.rs b/datafusion-examples/examples/simple_udtf.rs index f1d763ba6e41..09341fbf47fa 100644 --- a/datafusion-examples/examples/simple_udtf.rs +++ b/datafusion-examples/examples/simple_udtf.rs @@ -27,7 +27,7 @@ use datafusion::execution::context::{ExecutionProps, SessionState}; use datafusion::physical_plan::memory::MemoryExec; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::SessionContext; -use datafusion_common::{plan_err, DataFusionError, ScalarValue}; +use datafusion_common::{plan_err, ScalarValue}; use datafusion_expr::{Expr, TableType}; use datafusion_optimizer::simplify_expressions::{ExprSimplifier, SimplifyContext}; use std::fs::File; diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 331f5910d7e5..0f4e97905938 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -495,7 +495,13 @@ macro_rules! make_error { #[macro_export] macro_rules! $NAME_DF_ERR { ($d($d args:expr),*) => { - DataFusionError::$ERR(format!("{}{}", format!($d($d args),*), DataFusionError::get_back_trace()).into()) + $crate::DataFusionError::$ERR( + format!( + "{}{}", + format!($d($d args),*), + $crate::DataFusionError::get_back_trace(), + ).into() + ) } } @@ -503,7 +509,13 @@ macro_rules! make_error { #[macro_export] macro_rules! $NAME_ERR { ($d($d args:expr),*) => { - Err(DataFusionError::$ERR(format!("{}{}", format!($d($d args),*), DataFusionError::get_back_trace()).into())) + Err($crate::DataFusionError::$ERR( + format!( + "{}{}", + format!($d($d args),*), + $crate::DataFusionError::get_back_trace(), + ).into() + )) } } } diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs index d5a1b3ee363b..d1a7a675cb45 100644 --- a/datafusion/common/src/hash_utils.rs +++ b/datafusion/common/src/hash_utils.rs @@ -31,7 +31,7 @@ use crate::cast::{ as_large_list_array, as_list_array, as_primitive_array, as_string_array, as_struct_array, }; -use crate::error::{DataFusionError, Result, _internal_err}; +use crate::error::{Result, _internal_err}; // Combines two hashes into one hash #[inline] diff --git a/datafusion/common/src/scalar/struct_builder.rs b/datafusion/common/src/scalar/struct_builder.rs index 1192757e890b..b1a34e4a61d0 100644 --- a/datafusion/common/src/scalar/struct_builder.rs +++ b/datafusion/common/src/scalar/struct_builder.rs @@ -18,7 +18,7 @@ //! 
[`ScalarStructBuilder`] for building [`ScalarValue::Struct`] use crate::error::_internal_err; -use crate::{DataFusionError, Result, ScalarValue}; +use crate::{Result, ScalarValue}; use arrow::array::{ArrayRef, StructArray}; use arrow::datatypes::{DataType, FieldRef, Fields}; use arrow_schema::Field; diff --git a/datafusion/core/src/catalog/mod.rs b/datafusion/core/src/catalog/mod.rs index a05a480bef44..8aeeaf9f72d8 100644 --- a/datafusion/core/src/catalog/mod.rs +++ b/datafusion/core/src/catalog/mod.rs @@ -25,7 +25,7 @@ pub use datafusion_sql::{ResolvedTableReference, TableReference}; use crate::catalog::schema::SchemaProvider; use dashmap::DashMap; -use datafusion_common::{exec_err, not_impl_err, DataFusionError, Result}; +use datafusion_common::{exec_err, not_impl_err, Result}; use std::any::Any; use std::sync::Arc; diff --git a/datafusion/core/src/catalog/schema.rs b/datafusion/core/src/catalog/schema.rs index 49f8350ecc5b..8249c3a5330f 100644 --- a/datafusion/core/src/catalog/schema.rs +++ b/datafusion/core/src/catalog/schema.rs @@ -20,12 +20,12 @@ use async_trait::async_trait; use dashmap::DashMap; -use datafusion_common::exec_err; +use datafusion_common::{exec_err, DataFusionError}; use std::any::Any; use std::sync::Arc; use crate::datasource::TableProvider; -use crate::error::{DataFusionError, Result}; +use crate::error::Result; /// Represents a schema, comprising a number of named tables. /// diff --git a/datafusion/core/src/datasource/default_table_source.rs b/datafusion/core/src/datasource/default_table_source.rs index fadf01c74c5d..977e681d6641 100644 --- a/datafusion/core/src/datasource/default_table_source.rs +++ b/datafusion/core/src/datasource/default_table_source.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use crate::datasource::TableProvider; use arrow::datatypes::SchemaRef; -use datafusion_common::{internal_err, Constraints, DataFusionError}; +use datafusion_common::{internal_err, Constraints}; use datafusion_expr::{Expr, TableProviderFilterPushDown, TableSource}; /// DataFusion default table source, wrapping TableProvider. 
diff --git a/datafusion/core/src/datasource/file_format/json.rs b/datafusion/core/src/datasource/file_format/json.rs index 0f6d3648d120..121fe5e8dcb1 100644 --- a/datafusion/core/src/datasource/file_format/json.rs +++ b/datafusion/core/src/datasource/file_format/json.rs @@ -42,7 +42,7 @@ use arrow::datatypes::SchemaRef; use arrow::json; use arrow::json::reader::{infer_json_schema_from_iterator, ValueIter}; use arrow_array::RecordBatch; -use datafusion_common::{not_impl_err, DataFusionError, FileType}; +use datafusion_common::{not_impl_err, FileType}; use datafusion_execution::TaskContext; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement}; use datafusion_physical_plan::metrics::MetricsSet; diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index 12c9fb91adb1..72dc289d4b64 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -41,7 +41,7 @@ use crate::error::Result; use crate::execution::context::SessionState; use crate::physical_plan::{ExecutionPlan, Statistics}; -use datafusion_common::{not_impl_err, DataFusionError, FileType}; +use datafusion_common::{not_impl_err, FileType}; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement}; use async_trait::async_trait; diff --git a/datafusion/core/src/datasource/memory.rs b/datafusion/core/src/datasource/memory.rs index 901e74dfc218..e087b4bcba51 100644 --- a/datafusion/core/src/datasource/memory.rs +++ b/datafusion/core/src/datasource/memory.rs @@ -28,9 +28,7 @@ use std::sync::Arc; use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use async_trait::async_trait; -use datafusion_common::{ - not_impl_err, plan_err, Constraints, DFSchema, DataFusionError, SchemaExt, -}; +use datafusion_common::{not_impl_err, plan_err, Constraints, DFSchema, SchemaExt}; use datafusion_execution::TaskContext; use parking_lot::Mutex; use tokio::sync::RwLock; @@ -370,6 +368,7 @@ mod tests { use arrow::array::{AsArray, Int32Array}; use arrow::datatypes::{DataType, Field, Schema, UInt64Type}; use arrow::error::ArrowError; + use datafusion_common::DataFusionError; use datafusion_expr::LogicalPlanBuilder; use futures::StreamExt; use std::collections::HashMap; diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 8f20da183a93..351967d35324 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -48,7 +48,7 @@ pub use crate::logical_expr::TableType; pub use statistics::get_statistics_with_limit; use arrow_schema::{Schema, SortOptions}; -use datafusion_common::{plan_err, DataFusionError, Result}; +use datafusion_common::{plan_err, Result}; use datafusion_expr::Expr; use datafusion_physical_expr::{expressions, LexOrdering, PhysicalSortExpr}; diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index 516755e4d293..4a814c5b9b2c 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -24,10 +24,7 @@ use std::{ use super::{get_projected_output_ordering, FileGroupPartitioner}; use crate::datasource::{listing::PartitionedFile, object_store::ObjectStoreUrl}; -use crate::{ - error::{DataFusionError, Result}, - scalar::ScalarValue, -}; +use crate::{error::Result, scalar::ScalarValue}; use arrow::array::{ArrayData, BufferBuilder}; use 
arrow::buffer::Buffer; diff --git a/datafusion/core/src/datasource/physical_plan/file_stream.rs b/datafusion/core/src/datasource/physical_plan/file_stream.rs index 9cb58e7032db..0d25189a6124 100644 --- a/datafusion/core/src/datasource/physical_plan/file_stream.rs +++ b/datafusion/core/src/datasource/physical_plan/file_stream.rs @@ -531,7 +531,7 @@ mod tests { }; use arrow_schema::Schema; - use datafusion_common::{internal_err, DataFusionError, Statistics}; + use datafusion_common::{internal_err, Statistics}; use bytes::Bytes; use futures::StreamExt; diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index d6546539993b..2a8bb3b4fbaa 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -52,7 +52,7 @@ use std::{ }; use super::listing::ListingTableUrl; -use crate::error::{DataFusionError, Result}; +use crate::error::Result; use crate::physical_plan::{DisplayAs, DisplayFormatType}; use crate::{ datasource::{ diff --git a/datafusion/core/src/datasource/provider.rs b/datafusion/core/src/datasource/provider.rs index 8de2c6b3ea86..e769084df636 100644 --- a/datafusion/core/src/datasource/provider.rs +++ b/datafusion/core/src/datasource/provider.rs @@ -21,7 +21,7 @@ use std::any::Any; use std::sync::Arc; use async_trait::async_trait; -use datafusion_common::{not_impl_err, Constraints, DataFusionError, Statistics}; +use datafusion_common::{not_impl_err, Constraints, Statistics}; use datafusion_expr::{CreateExternalTable, LogicalPlan}; pub use datafusion_expr::{TableProviderFilterPushDown, TableType}; diff --git a/datafusion/core/src/datasource/streaming.rs b/datafusion/core/src/datasource/streaming.rs index 3eb120653ce3..f85db2280d8e 100644 --- a/datafusion/core/src/datasource/streaming.rs +++ b/datafusion/core/src/datasource/streaming.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use arrow::datatypes::SchemaRef; use async_trait::async_trait; -use datafusion_common::{plan_err, DataFusionError, Result}; +use datafusion_common::{plan_err, Result}; use datafusion_expr::{Expr, TableType}; use log::debug; diff --git a/datafusion/core/src/physical_optimizer/enforce_sorting.rs b/datafusion/core/src/physical_optimizer/enforce_sorting.rs index 9b76af2dbb1f..e6f3e12aceaf 100644 --- a/datafusion/core/src/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/src/physical_optimizer/enforce_sorting.rs @@ -59,8 +59,8 @@ use crate::physical_plan::windows::{ }; use crate::physical_plan::{Distribution, ExecutionPlan, InputOrderMode}; +use datafusion_common::plan_err; use datafusion_common::tree_node::{Transformed, TreeNode}; -use datafusion_common::{plan_err, DataFusionError}; use datafusion_physical_expr::{PhysicalSortExpr, PhysicalSortRequirement}; use datafusion_physical_plan::repartition::RepartitionExec; use datafusion_physical_plan::sorts::partial_sort::PartialSortExec; diff --git a/datafusion/core/src/physical_optimizer/join_selection.rs b/datafusion/core/src/physical_optimizer/join_selection.rs index 02626056f6cc..f2e81fb053c0 100644 --- a/datafusion/core/src/physical_optimizer/join_selection.rs +++ b/datafusion/core/src/physical_optimizer/join_selection.rs @@ -41,8 +41,8 @@ use crate::physical_plan::ExecutionPlan; use arrow_schema::Schema; use datafusion_common::tree_node::{Transformed, TreeNode}; +use datafusion_common::JoinType; use datafusion_common::{internal_err, JoinSide}; -use datafusion_common::{DataFusionError, JoinType}; use 
datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::sort_properties::SortProperties; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; diff --git a/datafusion/core/src/physical_optimizer/pipeline_checker.rs b/datafusion/core/src/physical_optimizer/pipeline_checker.rs index bb0665c10bcc..c0f071cd3f64 100644 --- a/datafusion/core/src/physical_optimizer/pipeline_checker.rs +++ b/datafusion/core/src/physical_optimizer/pipeline_checker.rs @@ -27,8 +27,8 @@ use crate::physical_optimizer::PhysicalOptimizerRule; use crate::physical_plan::ExecutionPlan; use datafusion_common::config::OptimizerOptions; +use datafusion_common::plan_err; use datafusion_common::tree_node::{Transformed, TreeNode}; -use datafusion_common::{plan_err, DataFusionError}; use datafusion_physical_expr::intervals::utils::{check_support, is_datatype_supported}; use datafusion_physical_plan::joins::SymmetricHashJoinExec; use datafusion_physical_plan::tree_node::PlanContext; diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs index 3413486c6b46..22e0d804acb1 100644 --- a/datafusion/core/src/physical_optimizer/sort_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/sort_pushdown.rs @@ -31,7 +31,7 @@ use crate::physical_plan::tree_node::PlanContext; use crate::physical_plan::ExecutionPlan; use datafusion_common::tree_node::Transformed; -use datafusion_common::{plan_err, DataFusionError, JoinSide, Result}; +use datafusion_common::{plan_err, JoinSide, Result}; use datafusion_expr::JoinType; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::{ diff --git a/datafusion/core/tests/core_integration.rs b/datafusion/core/tests/core_integration.rs index af39e1e18abc..befefb1d7ec5 100644 --- a/datafusion/core/tests/core_integration.rs +++ b/datafusion/core/tests/core_integration.rs @@ -21,6 +21,9 @@ mod sql; /// Run all tests that are found in the `dataframe` directory mod dataframe; +/// Run all tests that are found in the `macro_hygiene` directory +mod macro_hygiene; + #[cfg(test)] #[ctor::ctor] fn init() { diff --git a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs index e374abd6e891..bc7f88b39672 100644 --- a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs +++ b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs @@ -34,7 +34,7 @@ use datafusion::physical_plan::{ use datafusion::prelude::*; use datafusion::scalar::ScalarValue; use datafusion_common::cast::as_primitive_array; -use datafusion_common::{internal_err, not_impl_err, DataFusionError}; +use datafusion_common::{internal_err, not_impl_err}; use datafusion_expr::expr::{BinaryExpr, Cast}; use async_trait::async_trait; diff --git a/datafusion/core/tests/macro_hygiene/mod.rs b/datafusion/core/tests/macro_hygiene/mod.rs new file mode 100644 index 000000000000..72ac6e64fb0c --- /dev/null +++ b/datafusion/core/tests/macro_hygiene/mod.rs @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Verifies [Macro Hygiene]
+//!
+//! [Macro Hygiene]: https://en.wikipedia.org/wiki/Hygienic_macro
+
+mod plan_err {
+    // NO other imports!
+    use datafusion_common::plan_err;
+
+    #[test]
+    fn test_macro() {
+        // need type annotation for Ok variant
+        let _res: Result<(), _> = plan_err!("foo");
+    }
+}
+
+mod plan_datafusion_err {
+    // NO other imports!
+    use datafusion_common::plan_datafusion_err;
+
+    #[test]
+    fn test_macro() {
+        plan_datafusion_err!("foo");
+    }
+}
diff --git a/datafusion/core/tests/user_defined/user_defined_aggregates.rs b/datafusion/core/tests/user_defined/user_defined_aggregates.rs
index 8daeefd236f7..9e231d25f298 100644
--- a/datafusion/core/tests/user_defined/user_defined_aggregates.rs
+++ b/datafusion/core/tests/user_defined/user_defined_aggregates.rs
@@ -42,9 +42,7 @@ use datafusion::{
     prelude::SessionContext,
     scalar::ScalarValue,
 };
-use datafusion_common::{
-    assert_contains, cast::as_primitive_array, exec_err, DataFusionError,
-};
+use datafusion_common::{assert_contains, cast::as_primitive_array, exec_err};
 use datafusion_expr::{
     create_udaf, AggregateUDFImpl, GroupsAccumulator, SimpleAggregateUDF,
 };
diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs
index a255498eb5f7..0546ef59b1d8 100644
--- a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs
+++ b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs
@@ -24,7 +24,7 @@ use datafusion::{execution::registry::FunctionRegistry, test_util};
 use datafusion_common::cast::as_float64_array;
 use datafusion_common::{
     assert_batches_eq, assert_batches_sorted_eq, cast::as_int32_array, not_impl_err,
-    plan_err, DataFusionError, ExprSchema, Result, ScalarValue,
+    plan_err, ExprSchema, Result, ScalarValue,
 };
 use datafusion_expr::{
     create_udaf, create_udf, Accumulator, ColumnarValue, ExprSchemable,
diff --git a/datafusion/execution/src/registry.rs b/datafusion/execution/src/registry.rs
index 6e0a932f0bc5..5bc9a7a07b6f 100644
--- a/datafusion/execution/src/registry.rs
+++ b/datafusion/execution/src/registry.rs
@@ -17,7 +17,7 @@
 
 //! FunctionRegistry trait
 
-use datafusion_common::{not_impl_err, plan_datafusion_err, DataFusionError, Result};
+use datafusion_common::{not_impl_err, plan_datafusion_err, Result};
 use datafusion_expr::{AggregateUDF, ScalarUDF, UserDefinedLogicalNode, WindowUDF};
 use std::collections::HashMap;
 use std::{collections::HashSet, sync::Arc};
diff --git a/datafusion/expr/src/accumulator.rs b/datafusion/expr/src/accumulator.rs
index fa2017586d21..031348269a38 100644
--- a/datafusion/expr/src/accumulator.rs
+++ b/datafusion/expr/src/accumulator.rs
@@ -18,7 +18,7 @@
 //! Accumulator module contains the trait definition for aggregation function's accumulators.
 
 use arrow::array::ArrayRef;
-use datafusion_common::{internal_err, DataFusionError, Result, ScalarValue};
+use datafusion_common::{internal_err, Result, ScalarValue};
 use std::fmt::Debug;
 
 /// Tracks an aggregate function's state.
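The hygiene tests above compile `plan_err!` and `plan_datafusion_err!` with no other imports in scope, which only works because the macro expansions in `error.rs` now name the error type as `$crate::DataFusionError` instead of relying on the caller to have imported it. A stand-alone sketch of the same `$crate` pattern follows; the crate and error type here are hypothetical, not part of this patch:

```rust
// lib.rs of a hypothetical crate `my_crate` (illustration only)
#[derive(Debug)]
pub enum MyError {
    Message(String),
}

// The expansion names `$crate::MyError`, so a downstream caller can write
// `my_crate::my_err!("boom")` after importing only the macro itself;
// `MyError` never has to be in scope at the call site. That is the same
// property the hygiene tests above verify for the DataFusion error macros.
#[macro_export]
macro_rules! my_err {
    ($($args:expr),*) => {
        Err::<(), $crate::MyError>($crate::MyError::Message(format!($($args),*)))
    };
}
```

A caller would then write `use my_crate::my_err;` and `let _res: Result<(), _> = my_err!("foo");`, mirroring the test modules above.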
diff --git a/datafusion/expr/src/columnar_value.rs b/datafusion/expr/src/columnar_value.rs index 585bee3b9bfa..c845c81cb708 100644 --- a/datafusion/expr/src/columnar_value.rs +++ b/datafusion/expr/src/columnar_value.rs @@ -20,7 +20,7 @@ use arrow::array::ArrayRef; use arrow::array::NullArray; use arrow::datatypes::DataType; -use datafusion_common::{internal_err, DataFusionError, Result, ScalarValue}; +use datafusion_common::{internal_err, Result, ScalarValue}; use std::sync::Arc; /// Represents the result of evaluating an expression: either a single diff --git a/datafusion/expr/src/conditional_expressions.rs b/datafusion/expr/src/conditional_expressions.rs index c31bd04eafa0..1346825f054d 100644 --- a/datafusion/expr/src/conditional_expressions.rs +++ b/datafusion/expr/src/conditional_expressions.rs @@ -19,7 +19,7 @@ use crate::expr::Case; use crate::{expr_schema::ExprSchemable, Expr}; use arrow::datatypes::DataType; -use datafusion_common::{plan_err, DFSchema, DataFusionError, Result}; +use datafusion_common::{plan_err, DFSchema, Result}; use std::collections::HashSet; /// Currently supported types by the coalesce function. diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index c3d9269d1559..43f6ba8f6309 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -29,7 +29,7 @@ use crate::{built_in_window_function, udaf}; use arrow::datatypes::DataType; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{internal_err, DFSchema, OwnedTableReference}; -use datafusion_common::{plan_err, Column, DataFusionError, Result, ScalarValue}; +use datafusion_common::{plan_err, Column, Result, ScalarValue}; use sqlparser::ast::NullTreatment; use std::collections::HashSet; use std::fmt; diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 491b4a852261..a453730a0e71 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -28,8 +28,7 @@ use crate::{utils, LogicalPlan, Projection, Subquery}; use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field}; use datafusion_common::{ - internal_err, plan_datafusion_err, plan_err, Column, DFField, DataFusionError, - ExprSchema, Result, + internal_err, plan_datafusion_err, plan_err, Column, DFField, ExprSchema, Result, }; use std::collections::HashMap; use std::sync::Arc; diff --git a/datafusion/expr/src/field_util.rs b/datafusion/expr/src/field_util.rs index c46ec50234dd..3195ce6f2dfc 100644 --- a/datafusion/expr/src/field_util.rs +++ b/datafusion/expr/src/field_util.rs @@ -18,9 +18,7 @@ //! 
Utility functions for complex field access
 
 use arrow::datatypes::{DataType, Field};
-use datafusion_common::{
-    plan_datafusion_err, plan_err, DataFusionError, Result, ScalarValue,
-};
+use datafusion_common::{plan_datafusion_err, plan_err, Result, ScalarValue};
 
 /// Types of the field access expression of a nested type, such as `Field` or `List`
 pub enum GetFieldAccessSchema {
diff --git a/datafusion/expr/src/interval_arithmetic.rs b/datafusion/expr/src/interval_arithmetic.rs
index 5d34fe91c3ac..ca91a8c9da00 100644
--- a/datafusion/expr/src/interval_arithmetic.rs
+++ b/datafusion/expr/src/interval_arithmetic.rs
@@ -28,7 +28,7 @@ use arrow::compute::{cast_with_options, CastOptions};
 use arrow::datatypes::DataType;
 use arrow::datatypes::{IntervalUnit, TimeUnit};
 use datafusion_common::rounding::{alter_fp_rounding_mode, next_down, next_up};
-use datafusion_common::{internal_err, DataFusionError, Result, ScalarValue};
+use datafusion_common::{internal_err, Result, ScalarValue};
 
 macro_rules! get_extreme_value {
     ($extreme:ident, $value:expr) => {
diff --git a/datafusion/expr/src/partition_evaluator.rs b/datafusion/expr/src/partition_evaluator.rs
index 4b5357ddf8ba..04b6faf55ae1 100644
--- a/datafusion/expr/src/partition_evaluator.rs
+++ b/datafusion/expr/src/partition_evaluator.rs
@@ -18,7 +18,7 @@
 //! Partition evaluation module
 
 use arrow::array::ArrayRef;
-use datafusion_common::{exec_err, not_impl_err, DataFusionError, Result, ScalarValue};
+use datafusion_common::{exec_err, not_impl_err, Result, ScalarValue};
 use std::fmt::Debug;
 use std::ops::Range;
diff --git a/datafusion/expr/src/tree_node/expr.rs b/datafusion/expr/src/tree_node/expr.rs
index def25ed9242f..81949f2178f6 100644
--- a/datafusion/expr/src/tree_node/expr.rs
+++ b/datafusion/expr/src/tree_node/expr.rs
@@ -25,7 +25,7 @@ use crate::expr::{
 use crate::{Expr, GetFieldAccess};
 
 use datafusion_common::tree_node::{TreeNode, VisitRecursion};
-use datafusion_common::{internal_err, DataFusionError, Result};
+use datafusion_common::{internal_err, Result};
 
 impl TreeNode for Expr {
     fn apply_children<F: FnMut(&Self) -> Result<VisitRecursion>>(
diff --git a/datafusion/expr/src/type_coercion/aggregates.rs b/datafusion/expr/src/type_coercion/aggregates.rs
index ab994c143ac2..866aea06b4d4 100644
--- a/datafusion/expr/src/type_coercion/aggregates.rs
+++ b/datafusion/expr/src/type_coercion/aggregates.rs
@@ -24,7 +24,7 @@ use arrow::datatypes::{
     DataType, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE,
     DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE,
 };
-use datafusion_common::{internal_err, plan_err, DataFusionError, Result};
+use datafusion_common::{internal_err, plan_err, Result};
 
 pub static STRINGS: &[DataType] = &[DataType::Utf8, DataType::LargeUtf8];
diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs
index 70015c699296..118844e4b266 100644
--- a/datafusion/expr/src/type_coercion/binary.rs
+++ b/datafusion/expr/src/type_coercion/binary.rs
@@ -28,9 +28,7 @@ use arrow::datatypes::{
     DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE,
 };
 
-use datafusion_common::{
-    exec_datafusion_err, plan_datafusion_err, plan_err, DataFusionError, Result,
-};
+use datafusion_common::{exec_datafusion_err, plan_datafusion_err, plan_err, Result};
 
 /// The type signature of an instantiation of binary operator expression such as
 /// `lhs + rhs`
diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs
index 2022d67879f8..e9878fd17e8d 100644
--- a/datafusion/expr/src/type_coercion/functions.rs
+++ 
b/datafusion/expr/src/type_coercion/functions.rs
@@ -22,9 +22,7 @@ use arrow::{
     datatypes::{DataType, TimeUnit},
 };
 use datafusion_common::utils::{coerced_fixed_size_list_to_list, list_ndims};
-use datafusion_common::{
-    internal_datafusion_err, internal_err, plan_err, DataFusionError, Result,
-};
+use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result};
 
 use super::binary::comparison_coercion;
diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs
index fb062e5830eb..6ff7730bd606 100644
--- a/datafusion/expr/src/udaf.rs
+++ b/datafusion/expr/src/udaf.rs
@@ -23,7 +23,7 @@ use crate::{
     AccumulatorFactoryFunction, ReturnTypeFunction, Signature, StateTypeFunction,
 };
 use arrow::datatypes::DataType;
-use datafusion_common::{not_impl_err, DataFusionError, Result};
+use datafusion_common::{not_impl_err, Result};
 use std::any::Any;
 use std::fmt::{self, Debug, Formatter};
 use std::sync::Arc;
diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs
index 2fda81d8896f..fe9297b32a8e 100644
--- a/datafusion/expr/src/utils.rs
+++ b/datafusion/expr/src/utils.rs
@@ -35,7 +35,7 @@ use datafusion_common::tree_node::{TreeNode, VisitRecursion};
 use datafusion_common::utils::get_at_indices;
 use datafusion_common::{
     internal_err, plan_datafusion_err, plan_err, Column, DFField, DFSchema, DFSchemaRef,
-    DataFusionError, Result, ScalarValue, TableReference,
+    Result, ScalarValue, TableReference,
 };
 use sqlparser::ast::{ExceptSelectItem, ExcludeSelectItem, WildcardAdditionalOptions};
 
@@ -129,14 +129,15 @@ fn check_grouping_sets_size_limit(size: usize) -> Result<()> {
 
 /// Merge two grouping_set
 ///
-///
-/// Example:
-///
+/// # Example
+/// ```text
 /// (A, B), (C, D) -> (A, B, C, D)
+/// ```
 ///
-/// Error:
+/// # Error
+/// - [`DataFusionError`]: The number of group_expression in grouping_set exceeds the maximum limit
 ///
-/// [`DataFusionError`] The number of group_expression in grouping_set exceeds the maximum limit
+/// [`DataFusionError`]: datafusion_common::DataFusionError
 fn merge_grouping_set<T: Clone>(left: &[T], right: &[T]) -> Result<Vec<T>> {
     check_grouping_set_size_limit(left.len() + right.len())?;
     Ok(left.iter().chain(right.iter()).cloned().collect())
@@ -144,15 +145,16 @@ fn merge_grouping_set<T: Clone>(left: &[T], right: &[T]) -> Result<Vec<T>> {
 
 /// Compute the cross product of two grouping_sets
 ///
+/// # Example
+/// ```text
+/// [(A, B), (C, D)], [(E), (F)] -> [(A, B, E), (A, B, F), (C, D, E), (C, D, F)]
+/// ```
 ///
-/// Example:
-///
-/// \[(A, B), (C, D)], [(E), (F)\] -> \[(A, B, E), (A, B, F), (C, D, E), (C, D, F)\]
-///
-/// Error:
+/// # Error
+/// - [`DataFusionError`]: The number of group_expression in grouping_set exceeds the maximum limit
+/// - [`DataFusionError`]: The number of grouping_set in grouping_sets exceeds the maximum limit
 ///
-/// [`DataFusionError`] The number of group_expression in grouping_set exceeds the maximum limit \
-/// [`DataFusionError`] The number of grouping_set in grouping_sets exceeds the maximum limit
+/// [`DataFusionError`]: datafusion_common::DataFusionError
 fn cross_join_grouping_sets<T: Clone>(
     left: &[Vec<T>],
     right: &[Vec<T>],
diff --git a/datafusion/functions-array/src/udf.rs b/datafusion/functions-array/src/udf.rs
index b7f9d2497fb7..79fb83c059a4 100644
--- a/datafusion/functions-array/src/udf.rs
+++ b/datafusion/functions-array/src/udf.rs
@@ -18,7 +18,7 @@
 //! [`ScalarUDFImpl`] definitions for array functions.
use arrow::datatypes::DataType; -use datafusion_common::{plan_err, DataFusionError}; +use datafusion_common::plan_err; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::Expr; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; diff --git a/datafusion/functions/src/core/nullif.rs b/datafusion/functions/src/core/nullif.rs index 73bfba9b38b1..afb308e441f5 100644 --- a/datafusion/functions/src/core/nullif.rs +++ b/datafusion/functions/src/core/nullif.rs @@ -18,7 +18,7 @@ //! Encoding expressions use arrow::datatypes::DataType; -use datafusion_common::{exec_err, DataFusionError, Result}; +use datafusion_common::{exec_err, Result}; use datafusion_expr::ColumnarValue; use arrow::array::Array; diff --git a/datafusion/functions/src/core/nvl.rs b/datafusion/functions/src/core/nvl.rs index caf095ecbbbd..578aaeda2e89 100644 --- a/datafusion/functions/src/core/nvl.rs +++ b/datafusion/functions/src/core/nvl.rs @@ -16,7 +16,7 @@ // under the License. use arrow::datatypes::DataType; -use datafusion_common::{internal_err, Result, DataFusionError}; +use datafusion_common::{internal_err, Result}; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; use arrow::compute::kernels::zip::zip; use arrow::compute::is_not_null; diff --git a/datafusion/optimizer/src/analyzer/subquery.rs b/datafusion/optimizer/src/analyzer/subquery.rs index 7c5b70b19af0..a0e972fc703c 100644 --- a/datafusion/optimizer/src/analyzer/subquery.rs +++ b/datafusion/optimizer/src/analyzer/subquery.rs @@ -18,7 +18,7 @@ use crate::analyzer::check_plan; use crate::utils::collect_subquery_cols; use datafusion_common::tree_node::{TreeNode, VisitRecursion}; -use datafusion_common::{plan_err, DataFusionError, Result}; +use datafusion_common::{plan_err, Result}; use datafusion_expr::expr_rewriter::strip_outer_reference; use datafusion_expr::utils::split_conjunction; use datafusion_expr::{ diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs index b1000f042c98..0f4b39d9eee3 100644 --- a/datafusion/optimizer/src/decorrelate.rs +++ b/datafusion/optimizer/src/decorrelate.rs @@ -21,7 +21,7 @@ use datafusion_common::tree_node::{ RewriteRecursion, Transformed, TreeNode, TreeNodeRewriter, }; use datafusion_common::{plan_err, Result}; -use datafusion_common::{Column, DFSchemaRef, DataFusionError, ScalarValue}; +use datafusion_common::{Column, DFSchemaRef, ScalarValue}; use datafusion_expr::expr::{AggregateFunctionDefinition, Alias}; use datafusion_expr::utils::{conjunction, find_join_exprs, split_conjunction}; use datafusion_expr::{expr, EmptyRelation, Expr, LogicalPlan, LogicalPlanBuilder}; diff --git a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs index 450336376a23..a9e1f1228e5e 100644 --- a/datafusion/optimizer/src/decorrelate_predicate_subquery.rs +++ b/datafusion/optimizer/src/decorrelate_predicate_subquery.rs @@ -21,7 +21,7 @@ use crate::utils::replace_qualified_name; use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::alias::AliasGenerator; use datafusion_common::tree_node::TreeNode; -use datafusion_common::{plan_err, DataFusionError, Result}; +use datafusion_common::{plan_err, Result}; use datafusion_expr::expr::{Exists, InSubquery}; use datafusion_expr::expr_rewriter::create_col_from_scalar_expr; use datafusion_expr::logical_plan::{JoinType, Subquery}; diff --git a/datafusion/optimizer/src/eliminate_cross_join.rs 
b/datafusion/optimizer/src/eliminate_cross_join.rs index d9e96a9f2543..7f65690a4a7c 100644 --- a/datafusion/optimizer/src/eliminate_cross_join.rs +++ b/datafusion/optimizer/src/eliminate_cross_join.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use crate::{utils, OptimizerConfig, OptimizerRule}; -use datafusion_common::{plan_err, DataFusionError, Result}; +use datafusion_common::{plan_err, Result}; use datafusion_expr::expr::{BinaryExpr, Expr}; use datafusion_expr::logical_plan::{ CrossJoin, Filter, Join, JoinConstraint, JoinType, LogicalPlan, Projection, diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index 633a32996d1c..93b3d6b8b9f2 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -458,9 +458,7 @@ mod tests { use crate::test::test_table_scan; use crate::{OptimizerConfig, OptimizerContext, OptimizerRule}; - use datafusion_common::{ - plan_err, DFField, DFSchema, DFSchemaRef, DataFusionError, Result, - }; + use datafusion_common::{plan_err, DFField, DFSchema, DFSchemaRef, Result}; use datafusion_expr::logical_plan::EmptyRelation; use datafusion_expr::{col, lit, LogicalPlan, LogicalPlanBuilder, Projection}; diff --git a/datafusion/optimizer/src/propagate_empty_relation.rs b/datafusion/optimizer/src/propagate_empty_relation.rs index 040b69fc8bf3..d1f9f87a32a3 100644 --- a/datafusion/optimizer/src/propagate_empty_relation.rs +++ b/datafusion/optimizer/src/propagate_empty_relation.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -use datafusion_common::{plan_err, DataFusionError, Result}; +use datafusion_common::{plan_err, Result}; use datafusion_expr::logical_plan::LogicalPlan; use datafusion_expr::{EmptyRelation, JoinType, Projection, Union}; use std::sync::Arc; diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index acdda6833285..40156d43c572 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -24,8 +24,8 @@ use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::tree_node::{Transformed, TreeNode, VisitRecursion}; use datafusion_common::{ - internal_err, plan_datafusion_err, Column, DFSchema, DFSchemaRef, DataFusionError, - JoinConstraint, Result, + internal_err, plan_datafusion_err, Column, DFSchema, DFSchemaRef, JoinConstraint, + Result, }; use datafusion_expr::expr::Alias; use datafusion_expr::expr_rewriter::replace_col; diff --git a/datafusion/optimizer/src/scalar_subquery_to_join.rs b/datafusion/optimizer/src/scalar_subquery_to_join.rs index 34ed4a9475cb..9aa08c37fa35 100644 --- a/datafusion/optimizer/src/scalar_subquery_to_join.rs +++ b/datafusion/optimizer/src/scalar_subquery_to_join.rs @@ -23,7 +23,7 @@ use datafusion_common::alias::AliasGenerator; use datafusion_common::tree_node::{ RewriteRecursion, Transformed, TreeNode, TreeNodeRewriter, }; -use datafusion_common::{plan_err, Column, DataFusionError, Result, ScalarValue}; +use datafusion_common::{plan_err, Column, Result, ScalarValue}; use datafusion_expr::expr_rewriter::create_col_from_scalar_expr; use datafusion_expr::logical_plan::{JoinType, Subquery}; use datafusion_expr::utils::conjunction; diff --git a/datafusion/optimizer/src/simplify_expressions/utils.rs b/datafusion/optimizer/src/simplify_expressions/utils.rs index fa91a3ace2a2..c9736061df90 100644 --- a/datafusion/optimizer/src/simplify_expressions/utils.rs +++ 
b/datafusion/optimizer/src/simplify_expressions/utils.rs @@ -18,7 +18,7 @@ //! Utility functions for expression simplification use crate::simplify_expressions::SimplifyInfo; -use datafusion_common::{internal_err, DataFusionError, Result, ScalarValue}; +use datafusion_common::{internal_err, Result, ScalarValue}; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::{ expr::{Between, BinaryExpr, InList}, diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index 9d3561d12671..4c22742c8635 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -25,9 +25,7 @@ use arrow::datatypes::{ }; use arrow::temporal_conversions::{MICROSECONDS, MILLISECONDS, NANOSECONDS}; use datafusion_common::tree_node::{RewriteRecursion, TreeNodeRewriter}; -use datafusion_common::{ - internal_err, DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, -}; +use datafusion_common::{internal_err, DFSchema, DFSchemaRef, Result, ScalarValue}; use datafusion_expr::expr::{BinaryExpr, Cast, InList, TryCast}; use datafusion_expr::expr_rewriter::rewrite_preserving_name; use datafusion_expr::utils::merge_schema; diff --git a/datafusion/optimizer/tests/optimizer_integration.rs b/datafusion/optimizer/tests/optimizer_integration.rs index d857c6154ea9..fe1234de5ab8 100644 --- a/datafusion/optimizer/tests/optimizer_integration.rs +++ b/datafusion/optimizer/tests/optimizer_integration.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; use datafusion_common::config::ConfigOptions; -use datafusion_common::{plan_err, DataFusionError, Result}; +use datafusion_common::{plan_err, Result}; use datafusion_expr::{AggregateUDF, LogicalPlan, ScalarUDF, TableSource, WindowUDF}; use datafusion_optimizer::analyzer::Analyzer; use datafusion_optimizer::optimizer::Optimizer; diff --git a/datafusion/physical-expr/src/aggregate/array_agg_distinct.rs b/datafusion/physical-expr/src/aggregate/array_agg_distinct.rs index b073b00578a5..8e7b9d91ee49 100644 --- a/datafusion/physical-expr/src/aggregate/array_agg_distinct.rs +++ b/datafusion/physical-expr/src/aggregate/array_agg_distinct.rs @@ -188,7 +188,7 @@ mod tests { use arrow_array::Array; use arrow_array::ListArray; use arrow_buffer::OffsetBuffer; - use datafusion_common::{internal_err, DataFusionError}; + use datafusion_common::internal_err; // arrow::compute::sort can't sort nested ListArray directly, so we compare the scalar values pair-wise. fn compare_list_contents( diff --git a/datafusion/physical-expr/src/aggregate/array_agg_ordered.rs b/datafusion/physical-expr/src/aggregate/array_agg_ordered.rs index 587f40081c90..7e2c7bb27144 100644 --- a/datafusion/physical-expr/src/aggregate/array_agg_ordered.rs +++ b/datafusion/physical-expr/src/aggregate/array_agg_ordered.rs @@ -38,7 +38,7 @@ use arrow_schema::{Fields, SortOptions}; use datafusion_common::utils::array_into_list_array; use datafusion_common::utils::{compare_rows, get_row_at_idx}; -use datafusion_common::{exec_err, DataFusionError, Result, ScalarValue}; +use datafusion_common::{exec_err, Result, ScalarValue}; use datafusion_expr::Accumulator; /// Expression for a `ARRAY_AGG(... ORDER BY ..., ...)` aggregation. 
In a multi diff --git a/datafusion/physical-expr/src/aggregate/average.rs b/datafusion/physical-expr/src/aggregate/average.rs index 57f8fa211e58..f06355293d7c 100644 --- a/datafusion/physical-expr/src/aggregate/average.rs +++ b/datafusion/physical-expr/src/aggregate/average.rs @@ -39,7 +39,7 @@ use arrow_array::{ Array, ArrowNativeTypeOp, ArrowNumericType, ArrowPrimitiveType, PrimitiveArray, }; use arrow_buffer::{i256, ArrowNativeType}; -use datafusion_common::{not_impl_err, DataFusionError, Result, ScalarValue}; +use datafusion_common::{not_impl_err, Result, ScalarValue}; use datafusion_expr::type_coercion::aggregates::avg_return_type; use datafusion_expr::{Accumulator, EmitTo, GroupsAccumulator}; diff --git a/datafusion/physical-expr/src/aggregate/build_in.rs b/datafusion/physical-expr/src/aggregate/build_in.rs index 2918856aa623..0aaf0dc0c8c5 100644 --- a/datafusion/physical-expr/src/aggregate/build_in.rs +++ b/datafusion/physical-expr/src/aggregate/build_in.rs @@ -30,7 +30,7 @@ use std::sync::Arc; use arrow::datatypes::Schema; -use datafusion_common::{exec_err, not_impl_err, DataFusionError, Result}; +use datafusion_common::{exec_err, not_impl_err, Result}; use datafusion_expr::AggregateFunction; use crate::aggregate::regr::RegrType; @@ -416,7 +416,7 @@ pub fn create_aggregate_expr( mod tests { use arrow::datatypes::{DataType, Field}; - use datafusion_common::{plan_err, ScalarValue}; + use datafusion_common::{plan_err, DataFusionError, ScalarValue}; use datafusion_expr::type_coercion::aggregates::NUMERICS; use datafusion_expr::{type_coercion, Signature}; diff --git a/datafusion/physical-expr/src/aggregate/grouping.rs b/datafusion/physical-expr/src/aggregate/grouping.rs index 70afda265aea..d43bcd5c7091 100644 --- a/datafusion/physical-expr/src/aggregate/grouping.rs +++ b/datafusion/physical-expr/src/aggregate/grouping.rs @@ -24,7 +24,7 @@ use crate::aggregate::utils::down_cast_any_ref; use crate::{AggregateExpr, PhysicalExpr}; use arrow::datatypes::DataType; use arrow::datatypes::Field; -use datafusion_common::{not_impl_err, DataFusionError, Result}; +use datafusion_common::{not_impl_err, Result}; use datafusion_expr::Accumulator; use crate::expressions::format_state_name; diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs index 2bb205ce90dc..893178f29d08 100644 --- a/datafusion/physical-expr/src/aggregate/mod.rs +++ b/datafusion/physical-expr/src/aggregate/mod.rs @@ -23,7 +23,7 @@ use crate::expressions::{NthValueAgg, OrderSensitiveArrayAgg}; use crate::{PhysicalExpr, PhysicalSortExpr}; use arrow::datatypes::Field; -use datafusion_common::{not_impl_err, DataFusionError, Result}; +use datafusion_common::{not_impl_err, Result}; use datafusion_expr::{Accumulator, GroupsAccumulator}; mod hyperloglog; diff --git a/datafusion/physical-expr/src/aggregate/nth_value.rs b/datafusion/physical-expr/src/aggregate/nth_value.rs index 5d721e3a5e87..dba259a507fd 100644 --- a/datafusion/physical-expr/src/aggregate/nth_value.rs +++ b/datafusion/physical-expr/src/aggregate/nth_value.rs @@ -33,7 +33,7 @@ use arrow_array::cast::AsArray; use arrow_array::{new_empty_array, ArrayRef, StructArray}; use arrow_schema::{DataType, Field, Fields}; use datafusion_common::utils::{array_into_list_array, get_row_at_idx}; -use datafusion_common::{exec_err, internal_err, DataFusionError, Result, ScalarValue}; +use datafusion_common::{exec_err, internal_err, Result, ScalarValue}; use datafusion_expr::Accumulator; /// Expression for a `NTH_VALUE(... 
ORDER BY ..., ...)` aggregation. In a multi diff --git a/datafusion/physical-expr/src/aggregate/stddev.rs b/datafusion/physical-expr/src/aggregate/stddev.rs index dcc2b0e69c02..6033d63cbe21 100644 --- a/datafusion/physical-expr/src/aggregate/stddev.rs +++ b/datafusion/physical-expr/src/aggregate/stddev.rs @@ -27,7 +27,7 @@ use crate::expressions::format_state_name; use crate::{AggregateExpr, PhysicalExpr}; use arrow::{array::ArrayRef, datatypes::DataType, datatypes::Field}; use datafusion_common::ScalarValue; -use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_common::{internal_err, Result}; use datafusion_expr::Accumulator; /// STDDEV and STDDEV_SAMP (standard deviation) aggregate expression diff --git a/datafusion/physical-expr/src/aggregate/string_agg.rs b/datafusion/physical-expr/src/aggregate/string_agg.rs index 7a1da6d62246..8993c630aa49 100644 --- a/datafusion/physical-expr/src/aggregate/string_agg.rs +++ b/datafusion/physical-expr/src/aggregate/string_agg.rs @@ -23,7 +23,7 @@ use crate::{AggregateExpr, PhysicalExpr}; use arrow::array::ArrayRef; use arrow::datatypes::{DataType, Field}; use datafusion_common::cast::as_generic_string_array; -use datafusion_common::{not_impl_err, DataFusionError, Result, ScalarValue}; +use datafusion_common::{not_impl_err, Result, ScalarValue}; use datafusion_expr::Accumulator; use std::any::Any; use std::sync::Arc; diff --git a/datafusion/physical-expr/src/aggregate/sum.rs b/datafusion/physical-expr/src/aggregate/sum.rs index 6cf2810ce588..f19be62bbc95 100644 --- a/datafusion/physical-expr/src/aggregate/sum.rs +++ b/datafusion/physical-expr/src/aggregate/sum.rs @@ -33,7 +33,7 @@ use arrow_array::types::{ }; use arrow_array::{Array, ArrowNativeTypeOp, ArrowNumericType}; use arrow_buffer::ArrowNativeType; -use datafusion_common::{not_impl_err, DataFusionError, Result, ScalarValue}; +use datafusion_common::{not_impl_err, Result, ScalarValue}; use datafusion_expr::type_coercion::aggregates::sum_return_type; use datafusion_expr::{Accumulator, GroupsAccumulator}; diff --git a/datafusion/physical-expr/src/aggregate/sum_distinct.rs b/datafusion/physical-expr/src/aggregate/sum_distinct.rs index 4c0f94b3a2bb..a62a7b08da35 100644 --- a/datafusion/physical-expr/src/aggregate/sum_distinct.rs +++ b/datafusion/physical-expr/src/aggregate/sum_distinct.rs @@ -31,7 +31,7 @@ use std::collections::HashSet; use crate::aggregate::sum::downcast_sum; use crate::aggregate::utils::{down_cast_any_ref, Hashable}; use crate::{AggregateExpr, PhysicalExpr}; -use datafusion_common::{not_impl_err, DataFusionError, Result, ScalarValue}; +use datafusion_common::{not_impl_err, Result, ScalarValue}; use datafusion_expr::type_coercion::aggregates::sum_return_type; use datafusion_expr::Accumulator; diff --git a/datafusion/physical-expr/src/analysis.rs b/datafusion/physical-expr/src/analysis.rs index 6d36e2233cdd..ca25bfd647b6 100644 --- a/datafusion/physical-expr/src/analysis.rs +++ b/datafusion/physical-expr/src/analysis.rs @@ -27,9 +27,7 @@ use crate::PhysicalExpr; use arrow::datatypes::Schema; use datafusion_common::stats::Precision; -use datafusion_common::{ - internal_err, ColumnStatistics, DataFusionError, Result, ScalarValue, -}; +use datafusion_common::{internal_err, ColumnStatistics, Result, ScalarValue}; use datafusion_expr::interval_arithmetic::{cardinality_ratio, Interval}; /// The shared context used during the analysis of an expression. 
Includes diff --git a/datafusion/physical-expr/src/conditional_expressions.rs b/datafusion/physical-expr/src/conditional_expressions.rs index cc8f3c8dfaf0..87d63bfd32e2 100644 --- a/datafusion/physical-expr/src/conditional_expressions.rs +++ b/datafusion/physical-expr/src/conditional_expressions.rs @@ -19,7 +19,7 @@ use arrow::array::{new_null_array, Array, BooleanArray}; use arrow::compute::kernels::zip::zip; use arrow::compute::{and, is_not_null, is_null}; -use datafusion_common::{exec_err, DataFusionError, Result}; +use datafusion_common::{exec_err, Result}; use datafusion_expr::ColumnarValue; /// coalesce evaluates to the first value which is not NULL diff --git a/datafusion/physical-expr/src/equivalence/mod.rs b/datafusion/physical-expr/src/equivalence/mod.rs index 387dce2cdc8b..a31be06ecf0b 100644 --- a/datafusion/physical-expr/src/equivalence/mod.rs +++ b/datafusion/physical-expr/src/equivalence/mod.rs @@ -68,7 +68,7 @@ mod tests { use arrow::datatypes::{DataType, Field, Schema}; use arrow_array::{ArrayRef, Float64Array, RecordBatch, UInt32Array}; use arrow_schema::{SchemaRef, SortOptions}; - use datafusion_common::{plan_datafusion_err, DataFusionError, Result}; + use datafusion_common::{plan_datafusion_err, Result}; use itertools::izip; use rand::rngs::StdRng; use rand::seq::SliceRandom; diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 3f13030092c1..f1842458d5c4 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -38,7 +38,7 @@ use arrow::datatypes::*; use arrow::record_batch::RecordBatch; use datafusion_common::cast::as_boolean_array; -use datafusion_common::{internal_err, DataFusionError, Result, ScalarValue}; +use datafusion_common::{internal_err, Result, ScalarValue}; use datafusion_expr::interval_arithmetic::{apply_operator, Interval}; use datafusion_expr::type_coercion::binary::get_result_type; use datafusion_expr::{ColumnarValue, Operator}; diff --git a/datafusion/physical-expr/src/expressions/binary/kernels.rs b/datafusion/physical-expr/src/expressions/binary/kernels.rs index 22cadec40940..b0736e140fec 100644 --- a/datafusion/physical-expr/src/expressions/binary/kernels.rs +++ b/datafusion/physical-expr/src/expressions/binary/kernels.rs @@ -25,7 +25,7 @@ use arrow::compute::kernels::bitwise::{ }; use arrow::datatypes::DataType; use datafusion_common::internal_err; -use datafusion_common::{DataFusionError, Result, ScalarValue}; +use datafusion_common::{Result, ScalarValue}; use std::sync::Arc; diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index b0e175e711fe..9125f73048cb 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -28,7 +28,7 @@ use arrow::compute::{can_cast_types, kernels, CastOptions}; use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; use datafusion_common::format::DEFAULT_FORMAT_OPTIONS; -use datafusion_common::{not_impl_err, DataFusionError, Result, ScalarValue}; +use datafusion_common::{not_impl_err, Result, ScalarValue}; use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::ColumnarValue; diff --git a/datafusion/physical-expr/src/expressions/column.rs b/datafusion/physical-expr/src/expressions/column.rs index 62da8ff9ed44..a07f36e785e3 100644 --- a/datafusion/physical-expr/src/expressions/column.rs +++ 
b/datafusion/physical-expr/src/expressions/column.rs @@ -28,7 +28,7 @@ use arrow::{ datatypes::{DataType, Schema}, record_batch::RecordBatch, }; -use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_common::{internal_err, Result}; use datafusion_expr::ColumnarValue; /// Represents the column at a given index in a RecordBatch diff --git a/datafusion/physical-expr/src/expressions/get_indexed_field.rs b/datafusion/physical-expr/src/expressions/get_indexed_field.rs index 58fe4728543d..39eef61f963a 100644 --- a/datafusion/physical-expr/src/expressions/get_indexed_field.rs +++ b/datafusion/physical-expr/src/expressions/get_indexed_field.rs @@ -30,7 +30,7 @@ use arrow::{ }; use datafusion_common::{ cast::{as_map_array, as_struct_array}, - DataFusionError, Result, ScalarValue, + Result, ScalarValue, }; use datafusion_expr::{field_util::GetFieldAccessSchema, ColumnarValue}; use std::fmt::Debug; diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index 1a1634081c38..ecdb03e97ee3 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -38,9 +38,7 @@ use datafusion_common::cast::{ as_boolean_array, as_generic_binary_array, as_string_array, }; use datafusion_common::hash_utils::HashValue; -use datafusion_common::{ - exec_err, internal_err, not_impl_err, DataFusionError, Result, ScalarValue, -}; +use datafusion_common::{exec_err, internal_err, not_impl_err, Result, ScalarValue}; use datafusion_expr::ColumnarValue; use ahash::RandomState; diff --git a/datafusion/physical-expr/src/expressions/like.rs b/datafusion/physical-expr/src/expressions/like.rs index 37452e278484..6e0beeb0beea 100644 --- a/datafusion/physical-expr/src/expressions/like.rs +++ b/datafusion/physical-expr/src/expressions/like.rs @@ -23,7 +23,7 @@ use crate::{physical_expr::down_cast_any_ref, PhysicalExpr}; use crate::expressions::datum::apply_cmp; use arrow::record_batch::RecordBatch; use arrow_schema::{DataType, Schema}; -use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_common::{internal_err, Result}; use datafusion_expr::ColumnarValue; // Like expression diff --git a/datafusion/physical-expr/src/expressions/mod.rs b/datafusion/physical-expr/src/expressions/mod.rs index 09e908586c5b..ec20345569c2 100644 --- a/datafusion/physical-expr/src/expressions/mod.rs +++ b/datafusion/physical-expr/src/expressions/mod.rs @@ -133,7 +133,7 @@ pub(crate) mod tests { assert_eq!(expected, actual); - Ok(()) as Result<(), DataFusionError> + Ok(()) as Result<(), ::datafusion_common::DataFusionError> }}; } @@ -166,7 +166,7 @@ pub(crate) mod tests { let actual = aggregate_new(&batch, agg)?; assert_eq!($EXPECTED, &actual); - Ok(()) as Result<(), DataFusionError> + Ok(()) as Result<(), ::datafusion_common::DataFusionError> }}; } diff --git a/datafusion/physical-expr/src/expressions/negative.rs b/datafusion/physical-expr/src/expressions/negative.rs index 6b5c208bae81..d6dd3ddbea5e 100644 --- a/datafusion/physical-expr/src/expressions/negative.rs +++ b/datafusion/physical-expr/src/expressions/negative.rs @@ -30,7 +30,7 @@ use arrow::{ datatypes::{DataType, Schema}, record_batch::RecordBatch, }; -use datafusion_common::{plan_err, DataFusionError, Result}; +use datafusion_common::{plan_err, Result}; use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::{ type_coercion::{is_interval, is_null, is_signed_numeric, is_timestamp}, @@ -179,6 +179,7 @@ mod 
tests { use arrow::datatypes::*; use arrow_schema::DataType::{Float32, Float64, Int16, Int32, Int64, Int8}; use datafusion_common::cast::as_primitive_array; + use datafusion_common::DataFusionError; use datafusion_common::Result; use paste::paste; diff --git a/datafusion/physical-expr/src/expressions/no_op.rs b/datafusion/physical-expr/src/expressions/no_op.rs index 95e6879a6c2d..b558ccab154d 100644 --- a/datafusion/physical-expr/src/expressions/no_op.rs +++ b/datafusion/physical-expr/src/expressions/no_op.rs @@ -28,7 +28,7 @@ use arrow::{ use crate::physical_expr::down_cast_any_ref; use crate::PhysicalExpr; -use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_common::{internal_err, Result}; use datafusion_expr::ColumnarValue; /// A place holder expression, can not be evaluated. diff --git a/datafusion/physical-expr/src/expressions/try_cast.rs b/datafusion/physical-expr/src/expressions/try_cast.rs index 0f7909097a10..ddfe49dda7a3 100644 --- a/datafusion/physical-expr/src/expressions/try_cast.rs +++ b/datafusion/physical-expr/src/expressions/try_cast.rs @@ -28,7 +28,7 @@ use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; use compute::can_cast_types; use datafusion_common::format::DEFAULT_FORMAT_OPTIONS; -use datafusion_common::{not_impl_err, DataFusionError, Result, ScalarValue}; +use datafusion_common::{not_impl_err, Result, ScalarValue}; use datafusion_expr::ColumnarValue; /// TRY_CAST expression casts an expression to a specific data type and retuns NULL on invalid cast diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs index 186de0609b9a..d2b9a68ef8b9 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -42,7 +42,7 @@ use arrow::{ datatypes::{DataType, Int32Type, Int64Type, Schema}, }; use arrow_array::Array; -use datafusion_common::{exec_err, DataFusionError, Result, ScalarValue}; +use datafusion_common::{exec_err, Result, ScalarValue}; pub use datafusion_expr::FuncMonotonicity; use datafusion_expr::{ type_coercion::functions::data_types, BuiltinScalarFunction, ColumnarValue, @@ -998,7 +998,7 @@ mod tests { }; use datafusion_common::cast::{as_boolean_array, as_uint64_array}; use datafusion_common::{exec_err, internal_err, plan_err}; - use datafusion_common::{Result, ScalarValue}; + use datafusion_common::{DataFusionError, Result, ScalarValue}; use datafusion_expr::type_coercion::functions::data_types; use datafusion_expr::Signature; diff --git a/datafusion/physical-expr/src/intervals/cp_solver.rs b/datafusion/physical-expr/src/intervals/cp_solver.rs index b2403dadf05a..3bd059afa6be 100644 --- a/datafusion/physical-expr/src/intervals/cp_solver.rs +++ b/datafusion/physical-expr/src/intervals/cp_solver.rs @@ -29,7 +29,7 @@ use crate::utils::{build_dag, ExprTreeNode}; use crate::PhysicalExpr; use arrow_schema::{DataType, Schema}; -use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_common::{internal_err, Result}; use datafusion_expr::interval_arithmetic::{apply_operator, satisfy_greater, Interval}; use datafusion_expr::Operator; diff --git a/datafusion/physical-expr/src/intervals/utils.rs b/datafusion/physical-expr/src/intervals/utils.rs index 03d13632104d..e188b2d56bae 100644 --- a/datafusion/physical-expr/src/intervals/utils.rs +++ b/datafusion/physical-expr/src/intervals/utils.rs @@ -25,9 +25,7 @@ use crate::{ }; use arrow_schema::{DataType, SchemaRef}; -use datafusion_common::{ - 
internal_datafusion_err, internal_err, DataFusionError, Result, ScalarValue, -}; +use datafusion_common::{internal_datafusion_err, internal_err, Result, ScalarValue}; use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::Operator; diff --git a/datafusion/physical-expr/src/physical_expr.rs b/datafusion/physical-expr/src/physical_expr.rs index e596cb2e6ceb..567054e2b59e 100644 --- a/datafusion/physical-expr/src/physical_expr.rs +++ b/datafusion/physical-expr/src/physical_expr.rs @@ -28,7 +28,7 @@ use arrow::compute::filter_record_batch; use arrow::datatypes::{DataType, Schema}; use arrow::record_batch::RecordBatch; use datafusion_common::utils::DataPtr; -use datafusion_common::{internal_err, not_impl_err, DataFusionError, Result}; +use datafusion_common::{internal_err, not_impl_err, Result}; use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::ColumnarValue; diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index b8491aea2d6f..bf279518d31d 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -26,8 +26,7 @@ use crate::{ }; use arrow::datatypes::Schema; use datafusion_common::{ - exec_err, internal_err, not_impl_err, plan_err, DFSchema, DataFusionError, Result, - ScalarValue, + exec_err, internal_err, not_impl_err, plan_err, DFSchema, Result, ScalarValue, }; use datafusion_expr::expr::{Alias, Cast, InList, ScalarFunction}; use datafusion_expr::{ diff --git a/datafusion/physical-expr/src/string_expressions.rs b/datafusion/physical-expr/src/string_expressions.rs index 6a4a29763e4b..ace7ef2888a3 100644 --- a/datafusion/physical-expr/src/string_expressions.rs +++ b/datafusion/physical-expr/src/string_expressions.rs @@ -37,13 +37,13 @@ use arrow::{ use uuid::Uuid; use datafusion_common::utils::datafusion_strsim; +use datafusion_common::Result; use datafusion_common::{ cast::{ as_generic_string_array, as_int64_array, as_primitive_array, as_string_array, }, exec_err, ScalarValue, }; -use datafusion_common::{DataFusionError, Result}; use datafusion_expr::ColumnarValue; /// applies a unary expression to `args[0]` that is expected to be downcastable to diff --git a/datafusion/physical-expr/src/struct_expressions.rs b/datafusion/physical-expr/src/struct_expressions.rs index b0ccb2a3ccb6..f420e062ef91 100644 --- a/datafusion/physical-expr/src/struct_expressions.rs +++ b/datafusion/physical-expr/src/struct_expressions.rs @@ -19,7 +19,7 @@ use arrow::array::*; use arrow::datatypes::Field; -use datafusion_common::{exec_err, DataFusionError, Result}; +use datafusion_common::{exec_err, Result}; use datafusion_expr::ColumnarValue; use std::sync::Arc; diff --git a/datafusion/physical-expr/src/unicode_expressions.rs b/datafusion/physical-expr/src/unicode_expressions.rs index 3209a6176fad..aa6a84119c34 100644 --- a/datafusion/physical-expr/src/unicode_expressions.rs +++ b/datafusion/physical-expr/src/unicode_expressions.rs @@ -33,7 +33,7 @@ use unicode_segmentation::UnicodeSegmentation; use datafusion_common::{ cast::{as_generic_string_array, as_int64_array}, - exec_err, DataFusionError, Result, + exec_err, Result, }; /// Returns number of characters in the string. 
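Every hunk in this cleanup follows the same pattern: the `DataFusionError` import is dropped while macro imports such as `plan_err`, `exec_err`, `internal_err`, and `not_impl_err` remain. That works because these macros name `DataFusionError` through their own crate path when they expand, so a call site only needs `Result` in scope, and the old import becomes dead. A minimal sketch of the resulting call-site shape (the helper function is hypothetical, not code from this patch):

```rust
use datafusion_common::{plan_err, Result};

// Hypothetical helper: `plan_err!` expands to an `Err(DataFusionError::Plan(..))`
// value via the macro's own path to the error type, so no local
// `DataFusionError` import is required here.
fn require_join_keys(keys: &[(String, String)]) -> Result<()> {
    if keys.is_empty() {
        return plan_err!("join requires at least one equality predicate");
    }
    Ok(())
}
```

Only the macro and `Result` survive in the `use` list, which is why the now-unused `DataFusionError` imports can be removed wholesale across these files.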
diff --git a/datafusion/physical-expr/src/window/nth_value.rs b/datafusion/physical-expr/src/window/nth_value.rs index 05909ab25a07..a7bb31b6e109 100644 --- a/datafusion/physical-expr/src/window/nth_value.rs +++ b/datafusion/physical-expr/src/window/nth_value.rs @@ -29,8 +29,8 @@ use crate::PhysicalExpr; use arrow::array::{Array, ArrayRef}; use arrow::datatypes::{DataType, Field}; +use datafusion_common::Result; use datafusion_common::{exec_err, ScalarValue}; -use datafusion_common::{DataFusionError, Result}; use datafusion_expr::window_state::WindowAggState; use datafusion_expr::PartitionEvaluator; diff --git a/datafusion/physical-expr/src/window/rank.rs b/datafusion/physical-expr/src/window/rank.rs index 437fdbe0b982..fa3d4e487f14 100644 --- a/datafusion/physical-expr/src/window/rank.rs +++ b/datafusion/physical-expr/src/window/rank.rs @@ -28,7 +28,7 @@ use arrow::array::{Float64Array, UInt64Array}; use arrow::datatypes::{DataType, Field}; use arrow_schema::{SchemaRef, SortOptions}; use datafusion_common::utils::get_row_at_idx; -use datafusion_common::{exec_err, DataFusionError, Result, ScalarValue}; +use datafusion_common::{exec_err, Result, ScalarValue}; use datafusion_expr::PartitionEvaluator; use std::any::Any; diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 156362430558..855408c4baa8 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -36,7 +36,7 @@ use arrow::array::ArrayRef; use arrow::datatypes::{Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion_common::stats::Precision; -use datafusion_common::{internal_err, not_impl_err, plan_err, DataFusionError, Result}; +use datafusion_common::{internal_err, not_impl_err, plan_err, Result}; use datafusion_execution::TaskContext; use datafusion_expr::Accumulator; use datafusion_physical_expr::{ diff --git a/datafusion/physical-plan/src/coalesce_partitions.rs b/datafusion/physical-plan/src/coalesce_partitions.rs index bfcff2853538..1521daee1334 100644 --- a/datafusion/physical-plan/src/coalesce_partitions.rs +++ b/datafusion/physical-plan/src/coalesce_partitions.rs @@ -29,7 +29,7 @@ use super::{DisplayAs, SendableRecordBatchStream, Statistics}; use crate::{DisplayFormatType, ExecutionPlan, Partitioning}; use arrow::datatypes::SchemaRef; -use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_common::{internal_err, Result}; use datafusion_execution::TaskContext; use datafusion_physical_expr::EquivalenceProperties; diff --git a/datafusion/physical-plan/src/empty.rs b/datafusion/physical-plan/src/empty.rs index 41c8dbed1453..398edc211de3 100644 --- a/datafusion/physical-plan/src/empty.rs +++ b/datafusion/physical-plan/src/empty.rs @@ -26,7 +26,7 @@ use crate::{memory::MemoryStream, DisplayFormatType, ExecutionPlan, Partitioning use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; -use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_common::{internal_err, Result}; use datafusion_execution::TaskContext; use log::trace; diff --git a/datafusion/physical-plan/src/explain.rs b/datafusion/physical-plan/src/explain.rs index e4904ddd3410..babcaf078bdc 100644 --- a/datafusion/physical-plan/src/explain.rs +++ b/datafusion/physical-plan/src/explain.rs @@ -27,7 +27,7 @@ use crate::{DisplayFormatType, ExecutionPlan, Partitioning}; use arrow::{array::StringBuilder, datatypes::SchemaRef, record_batch::RecordBatch}; use 
datafusion_common::display::StringifiedPlan; -use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_common::{internal_err, Result}; use datafusion_execution::TaskContext; use log::trace; diff --git a/datafusion/physical-plan/src/insert.rs b/datafusion/physical-plan/src/insert.rs index 81cdfd753fe6..e678425d3753 100644 --- a/datafusion/physical-plan/src/insert.rs +++ b/datafusion/physical-plan/src/insert.rs @@ -33,7 +33,7 @@ use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use arrow_array::{ArrayRef, UInt64Array}; use arrow_schema::{DataType, Field, Schema}; -use datafusion_common::{exec_err, internal_err, DataFusionError, Result}; +use datafusion_common::{exec_err, internal_err, Result}; use datafusion_execution::TaskContext; use datafusion_physical_expr::{Distribution, PhysicalSortRequirement}; diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs index 938c9e4d343d..99bd051da0f7 100644 --- a/datafusion/physical-plan/src/joins/cross_join.rs +++ b/datafusion/physical-plan/src/joins/cross_join.rs @@ -35,7 +35,7 @@ use arrow::datatypes::{Fields, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use arrow_array::RecordBatchOptions; use datafusion_common::stats::Precision; -use datafusion_common::{plan_err, DataFusionError, JoinType, Result, ScalarValue}; +use datafusion_common::{plan_err, JoinType, Result, ScalarValue}; use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation}; use datafusion_execution::TaskContext; use datafusion_physical_expr::equivalence::join_equivalence_properties; diff --git a/datafusion/physical-plan/src/joins/nested_loop_join.rs b/datafusion/physical-plan/src/joins/nested_loop_join.rs index f89a2445fd07..1618efd4d0f9 100644 --- a/datafusion/physical-plan/src/joins/nested_loop_join.rs +++ b/datafusion/physical-plan/src/joins/nested_loop_join.rs @@ -44,7 +44,7 @@ use arrow::array::{ use arrow::datatypes::{Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use arrow::util::bit_util; -use datafusion_common::{exec_err, DataFusionError, JoinSide, Result, Statistics}; +use datafusion_common::{exec_err, JoinSide, Result, Statistics}; use datafusion_execution::memory_pool::{MemoryConsumer, MemoryReservation}; use datafusion_execution::TaskContext; use datafusion_expr::JoinType; diff --git a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs index 42c7029edcc1..506324852b21 100644 --- a/datafusion/physical-plan/src/joins/symmetric_hash_join.rs +++ b/datafusion/physical-plan/src/joins/symmetric_hash_join.rs @@ -62,9 +62,7 @@ use arrow::datatypes::{Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion_common::hash_utils::create_hashes; use datafusion_common::utils::bisect; -use datafusion_common::{ - internal_err, plan_err, DataFusionError, JoinSide, JoinType, Result, -}; +use datafusion_common::{internal_err, plan_err, JoinSide, JoinType, Result}; use datafusion_execution::memory_pool::MemoryConsumer; use datafusion_execution::TaskContext; use datafusion_expr::interval_arithmetic::Interval; diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 562e42a7da3b..a15fd470a98b 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -31,7 +31,7 @@ use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use datafusion_common::tree_node::Transformed; use 
datafusion_common::utils::DataPtr; -use datafusion_common::{plan_err, DataFusionError, Result}; +use datafusion_common::{plan_err, Result}; use datafusion_execution::TaskContext; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::{ diff --git a/datafusion/physical-plan/src/limit.rs b/datafusion/physical-plan/src/limit.rs index 417bc4cf977b..680aa23214f9 100644 --- a/datafusion/physical-plan/src/limit.rs +++ b/datafusion/physical-plan/src/limit.rs @@ -32,7 +32,7 @@ use crate::{ use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; use datafusion_common::stats::Precision; -use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_common::{internal_err, Result}; use datafusion_execution::TaskContext; use futures::stream::{Stream, StreamExt}; diff --git a/datafusion/physical-plan/src/memory.rs b/datafusion/physical-plan/src/memory.rs index 7de474fda11c..86bd89e7ebac 100644 --- a/datafusion/physical-plan/src/memory.rs +++ b/datafusion/physical-plan/src/memory.rs @@ -30,7 +30,7 @@ use super::{ use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; -use datafusion_common::{internal_err, project_schema, DataFusionError, Result}; +use datafusion_common::{internal_err, project_schema, Result}; use datafusion_execution::TaskContext; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering}; diff --git a/datafusion/physical-plan/src/placeholder_row.rs b/datafusion/physical-plan/src/placeholder_row.rs index 3ab3de62f37a..3371148587d1 100644 --- a/datafusion/physical-plan/src/placeholder_row.rs +++ b/datafusion/physical-plan/src/placeholder_row.rs @@ -28,7 +28,7 @@ use arrow::array::{ArrayRef, NullArray}; use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use arrow_array::RecordBatchOptions; -use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_common::{internal_err, Result}; use datafusion_execution::TaskContext; use log::trace; diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index f4b57e8bfb45..81a26cd2188d 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -30,7 +30,7 @@ use crate::{ }; use arrow::datatypes::SchemaRef; -use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_common::{internal_err, Result}; use datafusion_execution::memory_pool::MemoryConsumer; use datafusion_execution::TaskContext; use datafusion_physical_expr::{EquivalenceProperties, PhysicalSortRequirement}; diff --git a/datafusion/physical-plan/src/sorts/streaming_merge.rs b/datafusion/physical-plan/src/sorts/streaming_merge.rs index 4f8d8063853b..9e6618dd1af5 100644 --- a/datafusion/physical-plan/src/sorts/streaming_merge.rs +++ b/datafusion/physical-plan/src/sorts/streaming_merge.rs @@ -26,7 +26,7 @@ use crate::sorts::{ use crate::{PhysicalSortExpr, SendableRecordBatchStream}; use arrow::datatypes::{DataType, SchemaRef}; use arrow_array::*; -use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_common::{internal_err, Result}; use datafusion_execution::memory_pool::MemoryReservation; macro_rules! 
primitive_merge_helper { diff --git a/datafusion/physical-plan/src/stream.rs b/datafusion/physical-plan/src/stream.rs index b780a50cdc90..970194550045 100644 --- a/datafusion/physical-plan/src/stream.rs +++ b/datafusion/physical-plan/src/stream.rs @@ -25,7 +25,7 @@ use std::task::Poll; use crate::displayable; use arrow::{datatypes::SchemaRef, record_batch::RecordBatch}; use datafusion_common::internal_err; -use datafusion_common::DataFusionError; + use datafusion_common::Result; use datafusion_execution::TaskContext; use futures::stream::BoxStream; diff --git a/datafusion/physical-plan/src/streaming.rs b/datafusion/physical-plan/src/streaming.rs index 897682092831..bca37ed942d0 100644 --- a/datafusion/physical-plan/src/streaming.rs +++ b/datafusion/physical-plan/src/streaming.rs @@ -27,7 +27,7 @@ use crate::{ExecutionPlan, Partitioning, SendableRecordBatchStream}; use arrow::datatypes::SchemaRef; use arrow_schema::Schema; -use datafusion_common::{internal_err, plan_err, DataFusionError, Result}; +use datafusion_common::{internal_err, plan_err, Result}; use datafusion_execution::TaskContext; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering, PhysicalSortExpr}; diff --git a/datafusion/physical-plan/src/udaf.rs b/datafusion/physical-plan/src/udaf.rs index a82bbe1d0705..fd9279dfd552 100644 --- a/datafusion/physical-plan/src/udaf.rs +++ b/datafusion/physical-plan/src/udaf.rs @@ -28,7 +28,7 @@ use arrow::{ }; use super::{expressions::format_state_name, Accumulator, AggregateExpr}; -use datafusion_common::{not_impl_err, DataFusionError, Result}; +use datafusion_common::{not_impl_err, Result}; pub use datafusion_expr::AggregateUDF; use datafusion_physical_expr::PhysicalExpr; diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index d01ea5507449..62a6d5c0f877 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -39,7 +39,7 @@ use crate::stream::ObservedStream; use arrow::datatypes::{Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion_common::stats::Precision; -use datafusion_common::{exec_err, internal_err, DataFusionError, Result}; +use datafusion_common::{exec_err, internal_err, Result}; use datafusion_execution::TaskContext; use datafusion_physical_expr::EquivalenceProperties; diff --git a/datafusion/physical-plan/src/unnest.rs b/datafusion/physical-plan/src/unnest.rs index b9e732c317af..d5453f0924a3 100644 --- a/datafusion/physical-plan/src/unnest.rs +++ b/datafusion/physical-plan/src/unnest.rs @@ -35,7 +35,7 @@ use arrow::datatypes::{ }; use arrow::record_batch::RecordBatch; use arrow_array::{GenericListArray, OffsetSizeTrait}; -use datafusion_common::{exec_err, DataFusionError, Result, UnnestOptions}; +use datafusion_common::{exec_err, Result, UnnestOptions}; use datafusion_execution::TaskContext; use async_trait::async_trait; diff --git a/datafusion/physical-plan/src/values.rs b/datafusion/physical-plan/src/values.rs index f82f7ea2f869..9c1ce93b2a08 100644 --- a/datafusion/physical-plan/src/values.rs +++ b/datafusion/physical-plan/src/values.rs @@ -29,7 +29,7 @@ use crate::{ use arrow::datatypes::{Schema, SchemaRef}; use arrow::record_batch::{RecordBatch, RecordBatchOptions}; -use datafusion_common::{internal_err, plan_err, DataFusionError, Result, ScalarValue}; +use datafusion_common::{internal_err, plan_err, Result, ScalarValue}; use datafusion_execution::TaskContext; /// Execution plan for values list based relation (produces constant rows) diff --git 
a/datafusion/physical-plan/src/windows/window_agg_exec.rs b/datafusion/physical-plan/src/windows/window_agg_exec.rs index 6c245f65ba4f..e80102812ebd 100644 --- a/datafusion/physical-plan/src/windows/window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/window_agg_exec.rs @@ -45,7 +45,7 @@ use arrow::{ }; use datafusion_common::stats::Precision; use datafusion_common::utils::evaluate_partition_ranges; -use datafusion_common::{internal_err, plan_err, DataFusionError, Result}; +use datafusion_common::{internal_err, plan_err, Result}; use datafusion_execution::TaskContext; use datafusion_physical_expr::{EquivalenceProperties, PhysicalSortRequirement}; diff --git a/datafusion/physical-plan/src/work_table.rs b/datafusion/physical-plan/src/work_table.rs index c74a596f3dae..0f934a76a60f 100644 --- a/datafusion/physical-plan/src/work_table.rs +++ b/datafusion/physical-plan/src/work_table.rs @@ -34,7 +34,7 @@ use super::{ metrics::{ExecutionPlanMetricsSet, MetricsSet}, SendableRecordBatchStream, Statistics, }; -use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_common::{internal_err, Result}; /// The name is from PostgreSQL's terminology. /// See diff --git a/datafusion/proto/src/bytes/mod.rs b/datafusion/proto/src/bytes/mod.rs index 6d5a7c7f3063..d4abb9ed9c6f 100644 --- a/datafusion/proto/src/bytes/mod.rs +++ b/datafusion/proto/src/bytes/mod.rs @@ -23,7 +23,7 @@ use crate::physical_plan::{ AsExecutionPlan, DefaultPhysicalExtensionCodec, PhysicalExtensionCodec, }; use crate::protobuf; -use datafusion_common::{plan_datafusion_err, DataFusionError, Result}; +use datafusion_common::{plan_datafusion_err, Result}; use datafusion_expr::{ create_udaf, create_udf, create_udwf, AggregateUDF, Expr, LogicalPlan, Volatility, WindowUDF, diff --git a/datafusion/proto/src/bytes/registry.rs b/datafusion/proto/src/bytes/registry.rs index 7c993c639991..4bf2bb3d7b79 100644 --- a/datafusion/proto/src/bytes/registry.rs +++ b/datafusion/proto/src/bytes/registry.rs @@ -19,7 +19,7 @@ use std::{collections::HashSet, sync::Arc}; use datafusion::execution::registry::FunctionRegistry; use datafusion_common::plan_err; -use datafusion_common::{DataFusionError, Result}; +use datafusion_common::Result; use datafusion_expr::{AggregateUDF, ScalarUDF, WindowUDF}; /// A default [`FunctionRegistry`] registry that does not resolve any diff --git a/datafusion/sql/examples/sql.rs b/datafusion/sql/examples/sql.rs index 9df65b99a748..8744a905481f 100644 --- a/datafusion/sql/examples/sql.rs +++ b/datafusion/sql/examples/sql.rs @@ -17,7 +17,7 @@ use arrow_schema::{DataType, Field, Schema}; use datafusion_common::config::ConfigOptions; -use datafusion_common::{plan_err, DataFusionError, Result}; +use datafusion_common::{plan_err, Result}; use datafusion_expr::WindowUDF; use datafusion_expr::{ logical_plan::builder::LogicalTableSource, AggregateUDF, ScalarUDF, TableSource, diff --git a/datafusion/sql/src/expr/binary_op.rs b/datafusion/sql/src/expr/binary_op.rs index 78efaca09938..0d37742e5b07 100644 --- a/datafusion/sql/src/expr/binary_op.rs +++ b/datafusion/sql/src/expr/binary_op.rs @@ -16,7 +16,7 @@ // under the License. 
use crate::planner::{ContextProvider, SqlToRel}; -use datafusion_common::{not_impl_err, DataFusionError, Result}; +use datafusion_common::{not_impl_err, Result}; use datafusion_expr::Operator; use sqlparser::ast::BinaryOperator; diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index db572a23cf99..bc1d672522dd 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -18,8 +18,7 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use arrow_schema::DataType; use datafusion_common::{ - not_impl_err, plan_datafusion_err, plan_err, DFSchema, DataFusionError, Dependency, - Result, + not_impl_err, plan_datafusion_err, plan_err, DFSchema, Dependency, Result, }; use datafusion_expr::expr::{ScalarFunction, Unnest}; use datafusion_expr::function::suggest_valid_function; diff --git a/datafusion/sql/src/expr/grouping_set.rs b/datafusion/sql/src/expr/grouping_set.rs index 254f5079b7b1..a8b3ef7e20ec 100644 --- a/datafusion/sql/src/expr/grouping_set.rs +++ b/datafusion/sql/src/expr/grouping_set.rs @@ -17,7 +17,7 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_common::plan_err; -use datafusion_common::{DFSchema, DataFusionError, Result}; +use datafusion_common::{DFSchema, Result}; use datafusion_expr::{Expr, GroupingSet}; use sqlparser::ast::Expr as SQLExpr; diff --git a/datafusion/sql/src/expr/json_access.rs b/datafusion/sql/src/expr/json_access.rs index 681b72b4e71a..b24482f88297 100644 --- a/datafusion/sql/src/expr/json_access.rs +++ b/datafusion/sql/src/expr/json_access.rs @@ -16,7 +16,7 @@ // under the License. use crate::planner::{ContextProvider, SqlToRel}; -use datafusion_common::{not_impl_err, DataFusionError, Result}; +use datafusion_common::{not_impl_err, Result}; use datafusion_expr::Operator; use sqlparser::ast::JsonOperator; diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index da6c3a6074d4..b058fb79b4a1 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -31,8 +31,7 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use arrow_schema::DataType; use arrow_schema::TimeUnit; use datafusion_common::{ - internal_err, not_impl_err, plan_err, Column, DFSchema, DataFusionError, Result, - ScalarValue, + internal_err, not_impl_err, plan_err, Column, DFSchema, Result, ScalarValue, }; use datafusion_expr::expr::AggregateFunctionDefinition; use datafusion_expr::expr::InList; diff --git a/datafusion/sql/src/expr/order_by.rs b/datafusion/sql/src/expr/order_by.rs index 772255bd9773..46f19f436ccc 100644 --- a/datafusion/sql/src/expr/order_by.rs +++ b/datafusion/sql/src/expr/order_by.rs @@ -16,9 +16,7 @@ // under the License. 
use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use datafusion_common::{ - plan_datafusion_err, plan_err, DFSchema, DataFusionError, Result, -}; +use datafusion_common::{plan_datafusion_err, plan_err, DFSchema, Result}; use datafusion_expr::expr::Sort; use datafusion_expr::Expr; use sqlparser::ast::{Expr as SQLExpr, OrderByExpr, Value}; diff --git a/datafusion/sql/src/expr/substring.rs b/datafusion/sql/src/expr/substring.rs index 71b2a11cd414..a5d1abf0f265 100644 --- a/datafusion/sql/src/expr/substring.rs +++ b/datafusion/sql/src/expr/substring.rs @@ -17,7 +17,7 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_common::plan_err; -use datafusion_common::{DFSchema, DataFusionError, Result, ScalarValue}; +use datafusion_common::{DFSchema, Result, ScalarValue}; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::{BuiltinScalarFunction, Expr}; use sqlparser::ast::Expr as SQLExpr; diff --git a/datafusion/sql/src/expr/unary_op.rs b/datafusion/sql/src/expr/unary_op.rs index 08ff6f2c3622..9fcee7a06124 100644 --- a/datafusion/sql/src/expr/unary_op.rs +++ b/datafusion/sql/src/expr/unary_op.rs @@ -16,7 +16,7 @@ // under the License. use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use datafusion_common::{not_impl_err, DFSchema, DataFusionError, Result}; +use datafusion_common::{not_impl_err, DFSchema, Result}; use datafusion_expr::Expr; use sqlparser::ast::{Expr as SQLExpr, UnaryOperator, Value}; diff --git a/datafusion/sql/src/relation/join.rs b/datafusion/sql/src/relation/join.rs index b119672eae5f..4ba089f48630 100644 --- a/datafusion/sql/src/relation/join.rs +++ b/datafusion/sql/src/relation/join.rs @@ -16,7 +16,7 @@ // under the License. use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use datafusion_common::{not_impl_err, Column, DataFusionError, Result}; +use datafusion_common::{not_impl_err, Column, Result}; use datafusion_expr::{JoinType, LogicalPlan, LogicalPlanBuilder}; use sqlparser::ast::{Join, JoinConstraint, JoinOperator, TableWithJoins}; use std::collections::HashSet; diff --git a/datafusion/sql/src/relation/mod.rs b/datafusion/sql/src/relation/mod.rs index b233f47a058f..1d52899160a9 100644 --- a/datafusion/sql/src/relation/mod.rs +++ b/datafusion/sql/src/relation/mod.rs @@ -16,9 +16,7 @@ // under the License. use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use datafusion_common::{ - not_impl_err, plan_err, DFSchema, DataFusionError, Result, TableReference, -}; +use datafusion_common::{not_impl_err, plan_err, DFSchema, Result, TableReference}; use datafusion_expr::{LogicalPlan, LogicalPlanBuilder}; use sqlparser::ast::{FunctionArg, FunctionArgExpr, TableFactor}; diff --git a/datafusion/sql/src/set_expr.rs b/datafusion/sql/src/set_expr.rs index 7300d49be0f5..2cbb68368f72 100644 --- a/datafusion/sql/src/set_expr.rs +++ b/datafusion/sql/src/set_expr.rs @@ -16,7 +16,7 @@ // under the License. 
use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use datafusion_common::{not_impl_err, DataFusionError, Result}; +use datafusion_common::{not_impl_err, Result}; use datafusion_expr::{LogicalPlan, LogicalPlanBuilder}; use sqlparser::ast::{SetExpr, SetOperator, SetQuantifier}; diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index 331d63cc22b2..bc9cc66b7626 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -27,7 +27,7 @@ use std::sync::Arc; use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use datafusion::common::{not_impl_err, plan_err, DFSchema, DFSchemaRef}; -use datafusion::error::{DataFusionError, Result}; +use datafusion::error::Result; use datafusion::execution::context::SessionState; use datafusion::execution::registry::SerializerRegistry; use datafusion::execution::runtime_env::RuntimeEnv; From b220f03fffda22c70f03fa84e244cf04f0e6644c Mon Sep 17 00:00:00 2001 From: Jonah Gao Date: Wed, 28 Feb 2024 20:48:20 +0800 Subject: [PATCH 44/45] feat: support for defining ARRAY columns in `CREATE TABLE` (#9381) --- datafusion/sql/src/planner.rs | 5 +-- datafusion/sqllogictest/test_files/array.slt | 36 ++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 012b1c51a5c1..1f21299d8559 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -238,7 +238,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let mut fields = Vec::with_capacity(columns.len()); for column in columns { - let data_type = self.convert_simple_data_type(&column.data_type)?; + let data_type = self.convert_data_type(&column.data_type)?; let not_nullable = column .options .iter() @@ -358,7 +358,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { match sql_type { SQLDataType::Array(ArrayElemTypeDef::AngleBracket(inner_sql_type)) | SQLDataType::Array(ArrayElemTypeDef::SquareBracket(inner_sql_type)) => { - let data_type = self.convert_simple_data_type(inner_sql_type)?; + // Arrays may be multi-dimensional. 
+ let data_type = self.convert_data_type(inner_sql_type)?; Ok(DataType::List(Arc::new(Field::new( "field", data_type, true, diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index da02a80a104f..e64346537150 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -6165,6 +6165,39 @@ NULL NULL [60, 59, 58, 57, 56, 55, 54, , 52, 51] [51, 52, , 54, 55, 56, 57, 58, 59, 60] [70, 69, 68, 67, 66, 65, 64, 63, 62, 61] [61, 62, 63, 64, 65, 66, 67, 68, 69, 70] + +# Test defining a table with array columns +statement ok +create table test_create_array_table( + a int[], + b text[], + -- two-dimensional array + c int[][], + d int +); + +query ???I +insert into test_create_array_table values + ([1, 2, 3], ['a', 'b', 'c'], [[4,6], [6,7,8]], 1); +---- +1 + +query ???I +select * from test_create_array_table; +---- +[1, 2, 3] [a, b, c] [[4, 6], [6, 7, 8]] 1 + +query T +select arrow_typeof(a) from test_create_array_table; +---- +List(Field { name: "field", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) + +query T +select arrow_typeof(c) from test_create_array_table; +---- +List(Field { name: "field", data_type: List(Field { name: "field", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) + + ### Delete tables statement ok @@ -6334,3 +6367,6 @@ drop table large_arrays_values_without_nulls; statement ok drop table fixed_size_arrays_values_without_nulls; + +statement ok +drop table test_create_array_table; From fea2174574c1a2a24b24479e966fb232bd971435 Mon Sep 17 00:00:00 2001 From: Mehmet Ozan Kabak Date: Wed, 28 Feb 2024 08:08:18 -0800 Subject: [PATCH 45/45] Final reviews and cleanups --- datafusion/core/benches/sort.rs | 19 ++++----- datafusion/core/src/dataframe/mod.rs | 3 +- .../core/src/datasource/listing/table.rs | 7 ++-- datafusion/core/src/datasource/memory.rs | 39 +++++++++++-------- .../core/src/datasource/physical_plan/avro.rs | 6 +-- .../core/src/datasource/physical_plan/csv.rs | 12 +++--- .../core/src/datasource/physical_plan/json.rs | 5 +-- .../datasource/physical_plan/parquet/mod.rs | 5 +-- .../src/physical_optimizer/join_selection.rs | 11 +++--- .../limited_distinct_aggregation.rs | 18 +++++---- .../physical_optimizer/pipeline_checker.rs | 3 +- .../physical_optimizer/projection_pushdown.rs | 3 +- .../src/physical_optimizer/sort_pushdown.rs | 3 +- .../physical_optimizer/topk_aggregation.rs | 5 ++- .../core/src/physical_optimizer/utils.rs | 3 +- datafusion/core/src/physical_planner.rs | 5 +-- datafusion/physical-plan/src/analyze.rs | 8 ++-- .../physical-plan/src/coalesce_partitions.rs | 8 ++-- datafusion/physical-plan/src/common.rs | 3 +- datafusion/physical-plan/src/lib.rs | 18 ++++----- datafusion/physical-plan/src/memory.rs | 2 +- .../physical-plan/src/repartition/mod.rs | 22 +++++------ datafusion/physical-plan/src/stream.rs | 17 ++++---- 23 files changed, 110 insertions(+), 115 deletions(-) diff --git a/datafusion/core/benches/sort.rs b/datafusion/core/benches/sort.rs index 34b4a5ebf0dc..94a39bbb2af3 100644 --- a/datafusion/core/benches/sort.rs +++ b/datafusion/core/benches/sort.rs @@ -68,35 +68,32 @@ use std::sync::Arc; -use arrow::array::DictionaryArray; -use arrow::datatypes::Int32Type; use arrow::{ - array::{Float64Array, Int64Array, StringArray}, + array::{DictionaryArray, Float64Array, Int64Array, StringArray}, compute::SortOptions, - 
datatypes::Schema, + datatypes::{Int32Type, Schema}, record_batch::RecordBatch, }; -/// Benchmarks for SortPreservingMerge stream -use criterion::{criterion_group, criterion_main, Criterion}; use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::{ execution::context::TaskContext, physical_plan::{ - memory::MemoryExec, sorts::sort_preserving_merge::SortPreservingMergeExec, - ExecutionPlan, + coalesce_partitions::CoalescePartitionsExec, memory::MemoryExec, + sorts::sort_preserving_merge::SortPreservingMergeExec, ExecutionPlan, + ExecutionPlanProperties, }, prelude::SessionContext, }; use datafusion_physical_expr::{expressions::col, PhysicalSortExpr}; + +/// Benchmarks for SortPreservingMerge stream +use criterion::{criterion_group, criterion_main, Criterion}; use futures::StreamExt; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use tokio::runtime::Runtime; -use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; -use datafusion_physical_plan::ExecutionPlanProperties; - /// Total number of streams to divide each input into /// models 8 partition plan (should it be 16??) const NUM_STREAMS: usize = 8; diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 1372570179fe..d7c31b9bd6b3 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -23,7 +23,6 @@ mod parquet; use std::any::Any; use std::sync::Arc; -use crate::arrow::datatypes::{Schema, SchemaRef}; use crate::arrow::record_batch::RecordBatch; use crate::arrow::util::pretty; use crate::datasource::{provider_as_source, MemTable, TableProvider}; @@ -43,7 +42,7 @@ use crate::prelude::SessionContext; use arrow::array::{Array, ArrayRef, Int64Array, StringArray}; use arrow::compute::{cast, concat}; use arrow::csv::WriterBuilder; -use arrow::datatypes::{DataType, Field}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion_common::file_options::csv_writer::CsvWriterOptions; use datafusion_common::file_options::json_writer::JsonWriterOptions; use datafusion_common::parsers::CompressionTypeVariant; diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index a1f3d14aacca..00821a1cdd1a 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -27,7 +27,9 @@ use super::PartitionedFile; #[cfg(feature = "parquet")] use crate::datasource::file_format::parquet::ParquetFormat; use crate::datasource::{ - create_ordering, + create_ordering, get_statistics_with_limit, TableProvider, TableType, +}; +use crate::datasource::{ file_format::{ arrow::ArrowFormat, avro::AvroFormat, csv::CsvFormat, json::JsonFormat, FileFormat, }, - get_statistics_with_limit, listing::ListingTableUrl, physical_plan::{FileScanConfig, FileSinkConfig}, - TableProvider, TableType, }; use crate::{ error::{DataFusionError, Result}, @@ -921,6 +921,7 @@ mod tests { use datafusion_expr::{BinaryExpr, LogicalPlanBuilder, Operator}; use datafusion_physical_expr::PhysicalSortExpr; use datafusion_physical_plan::ExecutionPlanProperties; + use tempfile::TempDir; #[tokio::test] diff --git a/datafusion/core/src/datasource/memory.rs b/datafusion/core/src/datasource/memory.rs index cbe20f6a63a1..e47122ccdfda 100644 --- a/datafusion/core/src/datasource/memory.rs +++ b/datafusion/core/src/datasource/memory.rs @@ -17,35 +17,37 @@ //! [`MemTable`] for querying `Vec<RecordBatch>` by DataFusion.
-use datafusion_physical_plan::metrics::MetricsSet; -use futures::StreamExt; -use log::debug; use std::any::Any; use std::collections::HashMap; use std::fmt::{self, Debug}; use std::sync::Arc; -use arrow::datatypes::SchemaRef; -use arrow::record_batch::RecordBatch; -use async_trait::async_trait; -use datafusion_common::{not_impl_err, plan_err, Constraints, DFSchema, SchemaExt}; -use datafusion_execution::TaskContext; -use datafusion_physical_plan::ExecutionPlanProperties; -use parking_lot::Mutex; -use tokio::sync::RwLock; -use tokio::task::JoinSet; - use crate::datasource::{TableProvider, TableType}; use crate::error::Result; use crate::execution::context::SessionState; use crate::logical_expr::Expr; use crate::physical_plan::insert::{DataSink, FileSinkExec}; use crate::physical_plan::memory::MemoryExec; -use crate::physical_plan::{common, SendableRecordBatchStream}; -use crate::physical_plan::{repartition::RepartitionExec, Partitioning}; -use crate::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; +use crate::physical_plan::repartition::RepartitionExec; +use crate::physical_plan::{ + common, DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, + Partitioning, SendableRecordBatchStream, +}; use crate::physical_planner::create_physical_sort_expr; +use arrow::datatypes::SchemaRef; +use arrow::record_batch::RecordBatch; +use datafusion_common::{not_impl_err, plan_err, Constraints, DFSchema, SchemaExt}; +use datafusion_execution::TaskContext; +use datafusion_physical_plan::metrics::MetricsSet; + +use async_trait::async_trait; +use futures::StreamExt; +use log::debug; +use parking_lot::Mutex; +use tokio::sync::RwLock; +use tokio::task::JoinSet; + /// Type alias for partition data pub type PartitionData = Arc<RwLock<Vec<RecordBatch>>>; @@ -362,17 +364,20 @@ impl DataSink for MemSink { #[cfg(test)] mod tests { + use std::collections::HashMap; + use super::*; use crate::datasource::provider_as_source; use crate::physical_plan::collect; use crate::prelude::SessionContext; + use arrow::array::{AsArray, Int32Array}; use arrow::datatypes::{DataType, Field, Schema, UInt64Type}; use arrow::error::ArrowError; use datafusion_common::DataFusionError; use datafusion_expr::LogicalPlanBuilder; + use futures::StreamExt; - use std::collections::HashMap; #[tokio::test] async fn test_with_projection() -> Result<()> { diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index 6e7dcf39069c..2ccd83de80cb 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -219,13 +219,15 @@ mod private { #[cfg(test)] #[cfg(feature = "avro")] mod tests { + use super::*; + use crate::arrow::datatypes::{DataType, Field, SchemaBuilder}; use crate::datasource::file_format::{avro::AvroFormat, FileFormat}; use crate::datasource::listing::PartitionedFile; use crate::datasource::object_store::ObjectStoreUrl; use crate::prelude::SessionContext; use crate::scalar::ScalarValue; use crate::test::object_store::local_unpartitioned_file; - use arrow::datatypes::{DataType, Field, SchemaBuilder}; + use futures::StreamExt; use object_store::chunked::ChunkedStore; use object_store::local::LocalFileSystem; use rstest::*; use url::Url; - use super::*; #[tokio::test] async fn avro_exec_without_partition() -> Result<()> { test_with_stores(Arc::new(LocalFileSystem::new())).await } diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs
b/datafusion/core/src/datasource/physical_plan/csv.rs index 05a83e8ac0b7..5fcb9f483952 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -32,8 +32,8 @@ use crate::datasource::physical_plan::FileMeta; use crate::error::{DataFusionError, Result}; use crate::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; use crate::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, Partitioning, - PlanProperties, SendableRecordBatchStream, Statistics, + DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, ExecutionPlanProperties, + Partitioning, PlanProperties, SendableRecordBatchStream, Statistics, }; use arrow::csv; @@ -41,7 +41,6 @@ use arrow::datatypes::SchemaRef; use datafusion_common::config::ConfigOptions; use datafusion_execution::TaskContext; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering}; -use datafusion_physical_plan::ExecutionPlanProperties; use bytes::{Buf, Bytes}; use futures::{ready, StreamExt, TryStreamExt}; @@ -512,20 +511,23 @@ pub async fn plan_to_csv( #[cfg(test)] mod tests { + use std::fs::{self, File}; + use std::io::Write; + use super::*; use crate::dataframe::DataFrameWriteOptions; use crate::prelude::*; use crate::test::{partitioned_csv_config, partitioned_file_groups}; use crate::{scalar::ScalarValue, test_util::aggr_test_schema}; + use arrow::datatypes::*; use datafusion_common::test_util::arrow_test_data; use datafusion_common::FileType; + use futures::StreamExt; use object_store::chunked::ChunkedStore; use object_store::local::LocalFileSystem; use rstest::*; - use std::fs::{self, File}; - use std::io::Write; use tempfile::TempDir; use url::Url; diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index 6f9af2e6abcf..62b96ea3aefb 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -32,15 +32,14 @@ use crate::datasource::physical_plan::FileMeta; use crate::error::{DataFusionError, Result}; use crate::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; use crate::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, Partitioning, - PlanProperties, SendableRecordBatchStream, Statistics, + DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, ExecutionPlanProperties, + Partitioning, PlanProperties, SendableRecordBatchStream, Statistics, }; use arrow::json::ReaderBuilder; use arrow::{datatypes::SchemaRef, json}; use datafusion_execution::TaskContext; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering}; -use datafusion_physical_plan::ExecutionPlanProperties; use bytes::{Buf, Bytes}; use futures::{ready, StreamExt, TryStreamExt}; diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs index 5ccffde26359..12b62fd68068 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs @@ -38,15 +38,14 @@ use crate::{ physical_optimizer::pruning::PruningPredicate, physical_plan::{ metrics::{ExecutionPlanMetricsSet, MetricBuilder, MetricsSet}, - DisplayFormatType, ExecutionMode, ExecutionPlan, Partitioning, PlanProperties, - SendableRecordBatchStream, Statistics, + DisplayFormatType, ExecutionMode, ExecutionPlan, ExecutionPlanProperties, + Partitioning, PlanProperties, SendableRecordBatchStream, 
Statistics, }, }; use arrow::datatypes::{DataType, SchemaRef}; use arrow::error::ArrowError; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering, PhysicalExpr}; -use datafusion_physical_plan::ExecutionPlanProperties; use bytes::Bytes; use futures::future::BoxFuture; diff --git a/datafusion/core/src/physical_optimizer/join_selection.rs b/datafusion/core/src/physical_optimizer/join_selection.rs index 338bf619f9a0..ee60c65ead0b 100644 --- a/datafusion/core/src/physical_optimizer/join_selection.rs +++ b/datafusion/core/src/physical_optimizer/join_selection.rs @@ -34,16 +34,14 @@ use crate::physical_plan::joins::{ SymmetricHashJoinExec, }; use crate::physical_plan::projection::ProjectionExec; -use crate::physical_plan::ExecutionPlan; +use crate::physical_plan::{ExecutionPlan, ExecutionPlanProperties}; use arrow_schema::Schema; use datafusion_common::tree_node::{Transformed, TreeNode}; -use datafusion_common::JoinType; -use datafusion_common::{internal_err, JoinSide}; +use datafusion_common::{internal_err, JoinSide, JoinType}; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::sort_properties::SortProperties; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr}; -use datafusion_physical_plan::ExecutionPlanProperties; /// The [`JoinSelection`] rule tries to modify a given plan so that it can /// accommodate infinite sources and optimize joins in the plan according to @@ -1366,8 +1364,9 @@ mod util_tests { #[cfg(test)] mod hash_join_tests { - use self::tests_statistical::crosscheck_plans; + use std::sync::Arc; + use self::tests_statistical::crosscheck_plans; use super::*; use crate::physical_optimizer::join_selection::swap_join_type; use crate::physical_optimizer::test_utils::SourceType; @@ -1375,12 +1374,12 @@ mod hash_join_tests { use crate::physical_plan::joins::PartitionMode; use crate::physical_plan::projection::ProjectionExec; use crate::test_util::UnboundedExec; + use arrow::datatypes::{DataType, Field, Schema}; use arrow::record_batch::RecordBatch; use datafusion_common::utils::DataPtr; use datafusion_common::JoinType; use datafusion_physical_plan::ExecutionPlanProperties; - use std::sync::Arc; struct TestCase { case: String, diff --git a/datafusion/core/src/physical_optimizer/limited_distinct_aggregation.rs b/datafusion/core/src/physical_optimizer/limited_distinct_aggregation.rs index 036c938c1ca6..7be9acec5092 100644 --- a/datafusion/core/src/physical_optimizer/limited_distinct_aggregation.rs +++ b/datafusion/core/src/physical_optimizer/limited_distinct_aggregation.rs @@ -18,16 +18,18 @@ //! A special-case optimizer rule that pushes limit into a grouped aggregation //! which has no aggregate expressions or sorting requirements +use std::sync::Arc; + use crate::physical_optimizer::PhysicalOptimizerRule; use crate::physical_plan::aggregates::AggregateExec; use crate::physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; -use crate::physical_plan::ExecutionPlan; +use crate::physical_plan::{ExecutionPlan, ExecutionPlanProperties}; + use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::Result; -use datafusion_physical_plan::ExecutionPlanProperties; + use itertools::Itertools; -use std::sync::Arc; /// An optimizer rule that passes a `limit` hint into grouped aggregations which don't require all /// rows in the group to be processed for correctness. 
Example queries fitting this description are:
@@ -189,6 +191,8 @@ impl PhysicalOptimizerRule for LimitedDistinctAggregation {
 
 #[cfg(test)]
 mod tests {
+    use std::sync::Arc;
+
     use super::*;
     use crate::error::Result;
     use crate::physical_optimizer::aggregate_statistics::tests::TestAggregate;
@@ -199,6 +203,7 @@ mod tests {
     use crate::physical_plan::collect;
     use crate::physical_plan::memory::MemoryExec;
     use crate::prelude::SessionContext;
+
     use arrow::array::Int32Array;
     use arrow::compute::SortOptions;
     use arrow::datatypes::{DataType, Field, Schema};
@@ -207,13 +212,10 @@ mod tests {
     use arrow_schema::SchemaRef;
     use datafusion_execution::config::SessionConfig;
     use datafusion_expr::Operator;
-    use datafusion_physical_expr::expressions::cast;
-    use datafusion_physical_expr::expressions::col;
-    use datafusion_physical_expr::PhysicalSortExpr;
-    use datafusion_physical_expr::{expressions, PhysicalExpr};
+    use datafusion_physical_expr::expressions::{cast, col};
+    use datafusion_physical_expr::{expressions, PhysicalExpr, PhysicalSortExpr};
     use datafusion_physical_plan::aggregates::AggregateMode;
     use datafusion_physical_plan::displayable;
-    use std::sync::Arc;
 
     fn mock_data() -> Result<Arc<MemoryExec>> {
         let schema = Arc::new(Schema::new(vec![
diff --git a/datafusion/core/src/physical_optimizer/pipeline_checker.rs b/datafusion/core/src/physical_optimizer/pipeline_checker.rs
index c0a77eb56f87..e783f75378b1 100644
--- a/datafusion/core/src/physical_optimizer/pipeline_checker.rs
+++ b/datafusion/core/src/physical_optimizer/pipeline_checker.rs
@@ -24,14 +24,13 @@ use std::sync::Arc;
 use crate::config::ConfigOptions;
 use crate::error::Result;
 use crate::physical_optimizer::PhysicalOptimizerRule;
-use crate::physical_plan::ExecutionPlan;
+use crate::physical_plan::{ExecutionPlan, ExecutionPlanProperties};
 
 use datafusion_common::config::OptimizerOptions;
 use datafusion_common::plan_err;
 use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_physical_expr::intervals::utils::{check_support, is_datatype_supported};
 use datafusion_physical_plan::joins::SymmetricHashJoinExec;
-use datafusion_physical_plan::ExecutionPlanProperties;
 
 /// The PipelineChecker rule rejects non-runnable query plans that use
 /// pipeline-breaking operators on infinite input(s).
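For context on what the `PipelineChecker` rule above enforces, here is a minimal, self-contained sketch of the check. The types are illustrative stand-ins, not the DataFusion API; the real rule walks an `ExecutionPlan` tree and consults the `ExecutionMode` values documented later in this patch:

```rust
// Illustrative stand-ins only; not the actual DataFusion types.
#[derive(PartialEq)]
enum ExecutionMode {
    Bounded,          // finite input, finite output
    Unbounded,        // infinite input, but output can stream incrementally
    PipelineBreaking, // must buffer an infinite input: never produces output
}

struct PlanNode {
    name: &'static str,
    mode: ExecutionMode,
    children: Vec<PlanNode>,
}

// Reject the plan if any operator in the tree is pipeline-breaking.
fn check_runnable(plan: &PlanNode) -> Result<(), String> {
    if plan.mode == ExecutionMode::PipelineBreaking {
        return Err(format!("`{}` cannot run on an infinite input", plan.name));
    }
    plan.children.iter().try_for_each(check_runnable)
}

fn main() {
    // A full sort must see all input rows before emitting any output,
    // so sorting an unbounded stream can never make progress.
    let plan = PlanNode {
        name: "SortExec",
        mode: ExecutionMode::PipelineBreaking,
        children: vec![PlanNode {
            name: "StreamingTableExec",
            mode: ExecutionMode::Unbounded,
            children: vec![],
        }],
    };
    assert!(check_runnable(&plan).is_err());
}
```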
diff --git a/datafusion/core/src/physical_optimizer/projection_pushdown.rs b/datafusion/core/src/physical_optimizer/projection_pushdown.rs index 9cb2d6ecbc71..4ed265d59526 100644 --- a/datafusion/core/src/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/projection_pushdown.rs @@ -39,7 +39,7 @@ use crate::physical_plan::projection::ProjectionExec; use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort::SortExec; use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; -use crate::physical_plan::{Distribution, ExecutionPlan}; +use crate::physical_plan::{Distribution, ExecutionPlan, ExecutionPlanProperties}; use arrow_schema::SchemaRef; use datafusion_common::config::ConfigOptions; @@ -52,7 +52,6 @@ use datafusion_physical_expr::{ }; use datafusion_physical_plan::streaming::StreamingTableExec; use datafusion_physical_plan::union::UnionExec; -use datafusion_physical_plan::ExecutionPlanProperties; use itertools::Itertools; diff --git a/datafusion/core/src/physical_optimizer/sort_pushdown.rs b/datafusion/core/src/physical_optimizer/sort_pushdown.rs index 7c7564fdb400..ff82319fba19 100644 --- a/datafusion/core/src/physical_optimizer/sort_pushdown.rs +++ b/datafusion/core/src/physical_optimizer/sort_pushdown.rs @@ -28,7 +28,7 @@ use crate::physical_plan::projection::ProjectionExec; use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort::SortExec; use crate::physical_plan::tree_node::PlanContext; -use crate::physical_plan::ExecutionPlan; +use crate::physical_plan::{ExecutionPlan, ExecutionPlanProperties}; use datafusion_common::tree_node::Transformed; use datafusion_common::{plan_err, JoinSide, Result}; @@ -37,7 +37,6 @@ use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::{ LexRequirementRef, PhysicalSortExpr, PhysicalSortRequirement, }; -use datafusion_physical_plan::ExecutionPlanProperties; /// This is a "data class" we use within the [`EnforceSorting`] rule to push /// down [`SortExec`] in the plan. In some cases, we can reduce the total diff --git a/datafusion/core/src/physical_optimizer/topk_aggregation.rs b/datafusion/core/src/physical_optimizer/topk_aggregation.rs index 2006402ac59e..0ca709e56bcb 100644 --- a/datafusion/core/src/physical_optimizer/topk_aggregation.rs +++ b/datafusion/core/src/physical_optimizer/topk_aggregation.rs @@ -17,6 +17,8 @@ //! 
An optimizer rule that detects aggregate operations that could use a limited bucket count +use std::sync::Arc; + use crate::physical_optimizer::PhysicalOptimizerRule; use crate::physical_plan::aggregates::AggregateExec; use crate::physical_plan::coalesce_batches::CoalesceBatchesExec; @@ -24,14 +26,15 @@ use crate::physical_plan::filter::FilterExec; use crate::physical_plan::repartition::RepartitionExec; use crate::physical_plan::sorts::sort::SortExec; use crate::physical_plan::ExecutionPlan; + use arrow_schema::DataType; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::Result; use datafusion_physical_expr::expressions::Column; use datafusion_physical_expr::PhysicalSortExpr; + use itertools::Itertools; -use std::sync::Arc; /// An optimizer rule that passes a `limit` hint to aggregations if the whole result is not needed pub struct TopKAggregation {} diff --git a/datafusion/core/src/physical_optimizer/utils.rs b/datafusion/core/src/physical_optimizer/utils.rs index f4c2c3873f68..8cc543802e3f 100644 --- a/datafusion/core/src/physical_optimizer/utils.rs +++ b/datafusion/core/src/physical_optimizer/utils.rs @@ -25,12 +25,11 @@ use crate::physical_plan::sorts::sort::SortExec; use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use crate::physical_plan::union::UnionExec; use crate::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; -use crate::physical_plan::ExecutionPlan; +use crate::physical_plan::{ExecutionPlan, ExecutionPlanProperties}; use datafusion_physical_expr::{LexRequirement, PhysicalSortRequirement}; use datafusion_physical_plan::limit::{GlobalLimitExec, LocalLimitExec}; use datafusion_physical_plan::tree_node::PlanContext; -use datafusion_physical_plan::ExecutionPlanProperties; /// This utility function adds a `SortExec` above an operator according to the /// given ordering requirements while preserving the original partitioning. 
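To illustrate what the sort-adding utility documented above does, here is a hedged sketch against the crate layout in this diff. The `sort_preserving_partitioning` helper and the column name `"a"` are hypothetical; the real utility is driven by ordering requirements rather than a hard-coded expression:

```rust
// A sketch, not DataFusion's actual helper: wrap `input` in a SortExec
// that keeps the input's partitioning instead of merging to one partition.
use std::sync::Arc;

use arrow_schema::SortOptions;
use datafusion_physical_expr::{expressions::col, PhysicalSortExpr};
use datafusion_physical_plan::{sorts::sort::SortExec, ExecutionPlan};

fn sort_preserving_partitioning(
    input: Arc<dyn ExecutionPlan>,
) -> datafusion_common::Result<Arc<dyn ExecutionPlan>> {
    // Sort by column "a" (hypothetical; the real utility takes requirements).
    let sort_expr = PhysicalSortExpr {
        expr: col("a", &input.schema())?,
        options: SortOptions::default(),
    };
    // `with_preserve_partitioning(true)` sorts every partition independently,
    // leaving the plan's output partitioning unchanged.
    Ok(Arc::new(
        SortExec::new(vec![sort_expr], input).with_preserve_partitioning(true),
    ))
}
```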
diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 41a6e4d75be7..bf5f5afc5791 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -66,8 +66,8 @@ use crate::physical_plan::unnest::UnnestExec; use crate::physical_plan::values::ValuesExec; use crate::physical_plan::windows::{BoundedWindowAggExec, WindowAggExec}; use crate::physical_plan::{ - aggregates, displayable, udaf, windows, AggregateExpr, ExecutionPlan, InputOrderMode, - Partitioning, PhysicalExpr, WindowExpr, + aggregates, displayable, udaf, windows, AggregateExpr, ExecutionPlan, + ExecutionPlanProperties, InputOrderMode, Partitioning, PhysicalExpr, WindowExpr, }; use arrow::compute::SortOptions; @@ -93,7 +93,6 @@ use datafusion_expr::{ }; use datafusion_physical_expr::expressions::Literal; use datafusion_physical_plan::placeholder_row::PlaceholderRowExec; -use datafusion_physical_plan::ExecutionPlanProperties; use datafusion_sql::utils::window_expr_common_partition_keys; use async_trait::async_trait; diff --git a/datafusion/physical-plan/src/analyze.rs b/datafusion/physical-plan/src/analyze.rs index f771ac238887..5baedc332951 100644 --- a/datafusion/physical-plan/src/analyze.rs +++ b/datafusion/physical-plan/src/analyze.rs @@ -25,7 +25,6 @@ use super::{ DisplayAs, Distribution, ExecutionPlanProperties, PlanProperties, SendableRecordBatchStream, }; - use crate::display::DisplayableExecutionPlan; use crate::{DisplayFormatType, ExecutionPlan, Partitioning}; @@ -249,9 +248,7 @@ fn create_output_batch( #[cfg(test)] mod tests { - use arrow::datatypes::{DataType, Field, Schema}; - use futures::FutureExt; - + use super::*; use crate::{ collect, test::{ @@ -260,7 +257,8 @@ mod tests { }, }; - use super::*; + use arrow::datatypes::{DataType, Field, Schema}; + use futures::FutureExt; #[tokio::test] async fn test_drop_cancel() -> Result<()> { diff --git a/datafusion/physical-plan/src/coalesce_partitions.rs b/datafusion/physical-plan/src/coalesce_partitions.rs index d2706cb06f90..1e58260a5344 100644 --- a/datafusion/physical-plan/src/coalesce_partitions.rs +++ b/datafusion/physical-plan/src/coalesce_partitions.rs @@ -168,10 +168,6 @@ impl ExecutionPlan for CoalescePartitionsExec { #[cfg(test)] mod tests { - - use arrow::datatypes::{DataType, Field, Schema}; - use futures::FutureExt; - use super::*; use crate::test::exec::{ assert_strong_count_converges_to_zero, BlockingExec, PanicExec, @@ -179,6 +175,10 @@ mod tests { use crate::test::{self, assert_is_pending}; use crate::{collect, common}; + use arrow::datatypes::{DataType, Field, Schema}; + + use futures::FutureExt; + #[tokio::test] async fn merge() -> Result<()> { let task_ctx = Arc::new(TaskContext::default()); diff --git a/datafusion/physical-plan/src/common.rs b/datafusion/physical-plan/src/common.rs index 003c60edd9a8..47cdf3e400e3 100644 --- a/datafusion/physical-plan/src/common.rs +++ b/datafusion/physical-plan/src/common.rs @@ -381,11 +381,10 @@ mod tests { use arrow::compute::SortOptions; use arrow::{ - array::{Float32Array, Float64Array}, + array::{Float32Array, Float64Array, UInt64Array}, datatypes::{DataType, Field, Schema}, record_batch::RecordBatch, }; - use arrow_array::UInt64Array; use datafusion_expr::Operator; use datafusion_physical_expr::expressions::{col, Column}; diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 578ce42f2e9d..b726b587dd1d 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ 
-123,6 +123,7 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
         self.properties().schema().clone()
     }
 
+    /// Gets plan properties, such as output ordering(s), partitioning information, etc.
     fn properties(&self) -> &PlanProperties;
 
     /// Specifies the data distribution requirements for all the
@@ -400,6 +401,8 @@ pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
     }
 }
 
+/// This extension trait provides an API to fetch various properties of
+/// [`ExecutionPlan`] objects.
 pub trait ExecutionPlanProperties {
     fn output_partitioning(&self) -> &Partitioning;
 
@@ -419,21 +422,18 @@ impl ExecutionPlanProperties for Arc<dyn ExecutionPlan> {
 
     /// Specifies whether this plan generates an infinite stream of records.
     /// If the plan does not support pipelining, but its input(s) are
-    /// infinite, returns an error to indicate this.
+    /// infinite, returns [`ExecutionMode::PipelineBreaking`] to indicate this.
     fn execution_mode(&self) -> ExecutionMode {
         self.properties().execution_mode()
     }
 
     /// If the output of this `ExecutionPlan` within each partition is sorted,
-    /// returns `Some(keys)` with the description of how it was sorted.
+    /// returns `Some(keys)` describing the ordering. A `None` return value
+    /// indicates no assumptions should be made on the output ordering.
     ///
-    /// For example, Sort, (obviously) produces sorted output as does
-    /// SortPreservingMergeStream. Less obviously `Projection`
-    /// produces sorted output if its input was sorted as it does not
-    /// reorder the input rows,
-    ///
-    /// It is safe to return `None` here if your `ExecutionPlan` does not
-    /// have any particular output order here
+    /// For example, `SortExec` (obviously) produces sorted output as does
+    /// `SortPreservingMergeStream`. Less obviously, `Projection` produces sorted
+    /// output if its input is sorted as it does not reorder the input rows.
fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { self.properties().output_ordering() } diff --git a/datafusion/physical-plan/src/memory.rs b/datafusion/physical-plan/src/memory.rs index fa2d16530823..ca324a0f7d3b 100644 --- a/datafusion/physical-plan/src/memory.rs +++ b/datafusion/physical-plan/src/memory.rs @@ -287,8 +287,8 @@ mod tests { use std::sync::Arc; use crate::memory::MemoryExec; - use crate::ExecutionPlan; + use arrow_schema::{DataType, Field, Schema, SortOptions}; use datafusion_physical_expr::expressions::col; use datafusion_physical_expr::PhysicalSortExpr; diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index d1befb7c53c0..fe93ea131506 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -926,17 +926,7 @@ impl RecordBatchStream for PerPartitionStream { mod tests { use std::collections::HashSet; - use arrow::array::{ArrayRef, StringArray}; - use arrow::datatypes::{DataType, Field, Schema}; - use arrow::record_batch::RecordBatch; - use arrow_array::UInt32Array; - use futures::FutureExt; - use tokio::task::JoinHandle; - - use datafusion_common::cast::as_string_array; - use datafusion_common::{assert_batches_sorted_eq, exec_err}; - use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; - + use super::*; use crate::{ test::{ assert_is_pending, @@ -948,7 +938,15 @@ mod tests { {collect, expressions::col, memory::MemoryExec}, }; - use super::*; + use arrow::array::{ArrayRef, StringArray, UInt32Array}; + use arrow::datatypes::{DataType, Field, Schema}; + use arrow::record_batch::RecordBatch; + use datafusion_common::cast::as_string_array; + use datafusion_common::{assert_batches_sorted_eq, exec_err}; + use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; + + use futures::FutureExt; + use tokio::task::JoinHandle; #[tokio::test] async fn one_to_many_round_robin() -> Result<()> { diff --git a/datafusion/physical-plan/src/stream.rs b/datafusion/physical-plan/src/stream.rs index dcba5c74daf1..99d9367740be 100644 --- a/datafusion/physical-plan/src/stream.rs +++ b/datafusion/physical-plan/src/stream.rs @@ -22,12 +22,14 @@ use std::sync::Arc; use std::task::Context; use std::task::Poll; +use super::metrics::BaselineMetrics; +use super::{ExecutionPlan, RecordBatchStream, SendableRecordBatchStream}; use crate::displayable; -use arrow::{datatypes::SchemaRef, record_batch::RecordBatch}; -use datafusion_common::internal_err; -use datafusion_common::Result; +use arrow::{datatypes::SchemaRef, record_batch::RecordBatch}; +use datafusion_common::{internal_err, Result}; use datafusion_execution::TaskContext; + use futures::stream::BoxStream; use futures::{Future, Stream, StreamExt}; use log::debug; @@ -35,9 +37,6 @@ use pin_project_lite::pin_project; use tokio::sync::mpsc::{Receiver, Sender}; use tokio::task::JoinSet; -use super::metrics::BaselineMetrics; -use super::{ExecutionPlan, RecordBatchStream, SendableRecordBatchStream}; - /// Creates a stream from a collection of producing tasks, routing panics to the stream. 
/// /// Note that this is similar to [`ReceiverStream` from tokio-stream], with the differences being: @@ -458,13 +457,13 @@ impl futures::Stream for ObservedStream { #[cfg(test)] mod test { use super::*; - use arrow_schema::{DataType, Field, Schema}; - use datafusion_common::exec_err; - use crate::test::exec::{ assert_strong_count_converges_to_zero, BlockingExec, MockExec, PanicExec, }; + use arrow_schema::{DataType, Field, Schema}; + use datafusion_common::exec_err; + fn schema() -> SchemaRef { Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, true)])) }
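The `ExecutionPlanProperties` documentation added in the `lib.rs` hunk above is the user-facing surface of this refactor: callers reach partitioning, execution mode, and output ordering through one extension trait on `Arc<dyn ExecutionPlan>`. A hedged usage sketch follows; the `describe` helper is hypothetical, while the trait and method names come from this diff:

```rust
use std::sync::Arc;

use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties};

// Hypothetical helper showing the extension-trait API from this patch.
fn describe(plan: &Arc<dyn ExecutionPlan>) {
    // Both calls delegate to the plan's cached `PlanProperties`.
    println!(
        "partitions: {}",
        plan.output_partitioning().partition_count()
    );

    // `None` means callers must not assume any output ordering.
    match plan.output_ordering() {
        Some(keys) => println!("sorted on {} expression(s)", keys.len()),
        None => println!("no known output ordering"),
    }
}
```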